diff -urN linux-2.6.23.orig/arch/i386/lib/usercopy.c linux-2.6.23/arch/i386/lib/usercopy.c
--- linux-2.6.23.orig/arch/i386/lib/usercopy.c 2007-10-10 00:31:38.000000000 +0400
+++ linux-2.6.23/arch/i386/lib/usercopy.c 2007-12-04 20:02:08.041841326 +0300
@@ -817,6 +817,7 @@
 #endif
 return n;
 }
+EXPORT_SYMBOL(__copy_from_user_ll_nocache);
 
 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
 unsigned long n)
@@ -831,6 +832,7 @@
 #endif
 return n;
 }
+EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
 
 /**
 * copy_to_user: - Copy a block of data into user space.
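
With these exports in place, modular code such as reiser4 can call the uncached user-copy primitives directly. A minimal module-side sketch, assuming both symbols share the signature visible above for the _nozero variant (the helper name is hypothetical, not part of the patch):

/* hypothetical module code, not part of the patch */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/uaccess.h>

static int copy_in_nocache(void *dst, const void __user *src,
			   unsigned long n)
{
	/* both primitives return the number of bytes left uncopied */
	unsigned long left = __copy_from_user_ll_nocache(dst, src, n);

	return left ? -EFAULT : 0;
}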
diff -urN linux-2.6.23.orig/Documentation/Changes linux-2.6.23/Documentation/Changes
--- linux-2.6.23.orig/Documentation/Changes 2007-10-10 00:31:38.000000000 +0400
+++ linux-2.6.23/Documentation/Changes 2007-12-04 20:02:08.041841326 +0300
@@ -36,6 +36,7 @@
 o e2fsprogs 1.29 # tune2fs
 o jfsutils 1.1.3 # fsck.jfs -V
 o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs
+o reiser4progs 1.0.0 # fsck.reiser4 -V
 o xfsprogs 2.6.0 # xfs_db -V
 o pcmciautils 004 # pccardctl -V
 o quota-tools 3.09 # quota -V
@@ -145,6 +146,13 @@
 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
 reiserfsck. These utils work on both i386 and alpha platforms.
 
+Reiser4progs
+------------
+
+The reiser4progs package contains utilities for the reiser4 file system.
+Detailed instructions are provided in the README file located at:
+<ftp://ftp.namesys.com/pub/reiser4progs/README>.
+
 Xfsprogs
 --------
 
@@ -323,6 +331,10 @@
 -------------
 o <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
 
+Reiser4progs
+------------
+o <ftp://ftp.namesys.com/pub/reiser4progs/>
+
 Xfsprogs
 --------
 o <ftp://oss.sgi.com/projects/xfs/download/>
diff -urN linux-2.6.23.orig/Documentation/filesystems/reiser4.txt linux-2.6.23/Documentation/filesystems/reiser4.txt
--- linux-2.6.23.orig/Documentation/filesystems/reiser4.txt 1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.23/Documentation/filesystems/reiser4.txt 2007-12-04 20:02:08.041841326 +0300
@@ -0,0 +1,75 @@
+Reiser4 filesystem
+==================
+Reiser4 is a file system based on dancing tree algorithms, and is
+described at http://www.namesys.com
+
+
+References
+==========
+web page http://namesys.com/v4/v4.html
+source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/
+userland tools ftp://ftp.namesys.com/pub/reiser4progs/
+install page http://www.namesys.com/install_v4.html
+
+Compile options
+===============
+Enable reiser4 debug mode
+ This checks everything imaginable while reiser4
+ runs
+
+Mount options
+=============
+tmgr.atom_max_size=N
+ Atoms containing more than N blocks will be forced to commit.
+ N is decimal.
+ Default is nr_free_pagecache_pages() / 2 at mount time.
+
+tmgr.atom_max_age=N
+ Atoms older than N seconds will be forced to commit. N is decimal.
+ Default is 600.
+
+tmgr.atom_max_flushers=N
+ Limit of concurrent flushers for one atom. 0 means no limit.
+ Default is 0.
+
+tree.cbk_cache.nr_slots=N
+ Number of slots in the cbk cache.
+
+flush.relocate_threshold=N
+ If flush finds more than N adjacent dirty leaf-level blocks it
+ will force them to be relocated.
+ Default is 64.
+
+flush.relocate_distance=N
+ If flush can find a block allocation no further than N blocks
+ from the preceder, it will relocate to that position.
+ Default is 64.
+
+flush.scan_maxnodes=N
+ The maximum number of nodes to scan left on a level during
+ flush.
+ Default is 10000.
+
+optimal_io_size=N
+ Preferred IO size. This value is used to set st_blksize of
+ struct stat.
+ Default is 65536.
+
+bsdgroups
+ Turn on BSD-style gid assignment.
+
+32bittimes
+ By default files in reiser4 have 64-bit timestamps. Files
+ created while the filesystem is mounted with the 32bittimes
+ mount option will get 32-bit timestamps.
+
+mtflush
+ Turn off concurrent flushing.
+
+nopseudo
+ Disable pseudo files support. See
+ http://namesys.com/v4/pseudo.html for more about pseudo files.
+
+dont_load_bitmap
+ Don't load all bitmap blocks at mount time; this is useful for
+ machines with tiny RAM and large disks.
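
For context (this note and the sketch below are not part of the patch): the options above are passed as an ordinary mount option string. A hypothetical userspace example using mount(2); the device and mount point are illustrative:

/* hypothetical userspace example */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("/dev/sdb1", "/mnt/r4", "reiser4", 0,
		  "tmgr.atom_max_age=300,dont_load_bitmap") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}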
diff -urN linux-2.6.23.orig/fs/fs-writeback.c linux-2.6.23/fs/fs-writeback.c
--- linux-2.6.23.orig/fs/fs-writeback.c 2007-10-10 00:31:38.000000000 +0400
+++ linux-2.6.23/fs/fs-writeback.c 2007-12-04 20:02:08.045842355 +0300
@@ -296,8 +296,6 @@
 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
 * that it can be located for waiting on in __writeback_single_inode().
 *
- * Called under inode_lock.
- *
 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 * This function assumes that the blockdev superblock's inodes are backed by
 * a variety of queues, so all inodes are searched. For other superblocks,
@@ -313,11 +311,13 @@
 * on the writer throttling path, and we get decent balancing between many
 * throttled threads: we don't want them all piling up on __wait_on_inode.
 */
-static void
-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+void
+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 {
 const unsigned long start = jiffies; /* livelock avoidance */
 
+ spin_lock(&inode_lock);
+
 if (!wbc->for_kupdate || list_empty(&sb->s_io))
 list_splice_init(&sb->s_dirty, &sb->s_io);
 
@@ -397,8 +397,19 @@
 if (wbc->nr_to_write <= 0)
 break;
 }
+ spin_unlock(&inode_lock);
 return; /* Leave any unwritten inodes on s_io */
 }
+EXPORT_SYMBOL(generic_sync_sb_inodes);
+
+static void
+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+{
+ if (sb->s_op->sync_inodes)
+ sb->s_op->sync_inodes(sb, wbc);
+ else
+ generic_sync_sb_inodes(sb, wbc);
+}
 
 /*
 * Start writeback of dirty pagecache data against all unlocked inodes.
@@ -439,11 +450,8 @@
 * be unmounted by the time it is released.
 */
 if (down_read_trylock(&sb->s_umount)) {
- if (sb->s_root) {
- spin_lock(&inode_lock);
+ if (sb->s_root)
 sync_sb_inodes(sb, wbc);
- spin_unlock(&inode_lock);
- }
 up_read(&sb->s_umount);
 }
 spin_lock(&sb_lock);
@@ -481,9 +489,7 @@
 (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
 nr_dirty + nr_unstable;
 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
- spin_lock(&inode_lock);
 sync_sb_inodes(sb, &wbc);
- spin_unlock(&inode_lock);
 }
 
 /*
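
The hunks above turn per-superblock inode writeback into an overridable operation: sync_sb_inodes() now dispatches to a sync_inodes super-operation when one is provided, and the generic path is exported as generic_sync_sb_inodes(), which now takes inode_lock itself. The struct super_operations member is added elsewhere in the full patch, outside this excerpt; a minimal sketch of a filesystem hooking it, with hypothetical myfs_* names:

/* hypothetical filesystem code, not part of the patch */
static void myfs_sync_inodes(struct super_block *sb,
			     struct writeback_control *wbc)
{
	/* filesystem-specific writeback could go here; fall back to
	 * the generic, now-exported path for everything else */
	generic_sync_sb_inodes(sb, wbc);
}

static struct super_operations myfs_sops = {
	.sync_inodes = myfs_sync_inodes,
};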
diff -urN linux-2.6.23.orig/fs/Kconfig linux-2.6.23/fs/Kconfig
--- linux-2.6.23.orig/fs/Kconfig 2007-10-10 00:31:38.000000000 +0400
+++ linux-2.6.23/fs/Kconfig 2007-12-04 20:02:08.045842355 +0300
@@ -272,6 +272,8 @@
 default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
 default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
 
+source "fs/reiser4/Kconfig"
+
 config REISERFS_FS
 tristate "Reiserfs support"
 help
diff -urN linux-2.6.23.orig/fs/Makefile linux-2.6.23/fs/Makefile
--- linux-2.6.23.orig/fs/Makefile 2007-10-10 00:31:38.000000000 +0400
+++ linux-2.6.23/fs/Makefile 2007-12-04 20:02:08.049843385 +0300
@@ -66,6 +66,7 @@
 
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS) += reiserfs/
+obj-$(CONFIG_REISER4_FS) += reiser4/
 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev
 obj-$(CONFIG_JBD) += jbd/
diff -urN linux-2.6.23.orig/fs/reiser4/as_ops.c linux-2.6.23/fs/reiser4/as_ops.c
--- linux-2.6.23.orig/fs/reiser4/as_ops.c 1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.23/fs/reiser4/as_ops.c 2007-12-04 16:49:30.000000000 +0300
@@ -0,0 +1,377 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Interface to VFS. Reiser4 address_space_operations are defined here. */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/file/file.h"
+#include "plugin/security/perm.h"
+#include "plugin/disk_format/disk_format.h"
+#include "plugin/plugin.h"
+#include "plugin/plugin_set.h"
+#include "plugin/object.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "page_cache.h"
+#include "ktxnmgrd.h"
+#include "super.h"
+#include "reiser4.h"
+#include "entd.h"
+
+#include <linux/profile.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/mm.h>
+#include <linux/buffer_head.h>
+#include <linux/dcache.h>
+#include <linux/list.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/quotaops.h>
+#include <linux/security.h>
+
+/* address space operations */
+
+/**
+ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
+ * @page: page to be dirtied
+ *
+ * Operation of struct address_space_operations. This implementation is used by
+ * unix and cryptcompress file plugins.
+ *
+ * This is called when reiser4 page gets dirtied outside of reiser4, for
+ * example, when dirty bit is moved from pte to physical page.
+ *
+ * Tags page in the mapping's page tree with special tag so that it is possible
+ * to do all the reiser4 specific work wrt dirty pages (jnode creation,
+ * capturing by an atom) later because it can not be done in the contexts where
+ * set_page_dirty is called.
+ */
+int reiser4_set_page_dirty(struct page *page)
+{
+ /* this page can be unformatted only */
+ assert("vs-1734", (page->mapping &&
+ page->mapping->host &&
+ reiser4_get_super_fake(page->mapping->host->i_sb) !=
+ page->mapping->host
+ && reiser4_get_cc_fake(page->mapping->host->i_sb) !=
+ page->mapping->host
+ && reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
+ page->mapping->host));
+
+ if (!TestSetPageDirty(page)) {
+ struct address_space *mapping = page->mapping;
+
+ if (mapping) {
+ write_lock_irq(&mapping->tree_lock);
+
+ /* check for race with truncate */
+ if (page->mapping) {
+ assert("vs-1652", page->mapping == mapping);
+ if (mapping_cap_account_dirty(mapping))
+ inc_zone_page_state(page,
+ NR_FILE_DIRTY);
+ radix_tree_tag_set(&mapping->page_tree,
+ page->index,
+ PAGECACHE_TAG_REISER4_MOVED);
+ }
+ write_unlock_irq(&mapping->tree_lock);
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ }
+ }
+ return 0;
+}
+
+/* ->invalidatepage method for reiser4 */
+
+/*
+ * this is called for each truncated page from
+ * truncate_inode_pages()->truncate_{complete,partial}_page().
+ *
+ * At the moment of call, page is under lock, and outstanding io (if any) has
+ * completed.
+ */
+
+/**
+ * reiser4_invalidatepage
+ * @page: page to invalidate
+ * @offset: starting offset for partial invalidation
+ *
+ */
+void reiser4_invalidatepage(struct page *page, unsigned long offset)
+{
+ int ret = 0;
+ reiser4_context *ctx;
+ struct inode *inode;
+ jnode *node;
+
+ /*
+ * This is called to truncate file's page.
+ *
+ * Originally, reiser4 implemented truncate in a standard way
+ * (vmtruncate() calls ->invalidatepage() on all truncated pages
+ * first, then file system ->truncate() call-back is invoked).
+ *
+ * This led to the problem when ->invalidatepage() was called on a
+ * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
+ * process. That is, truncate was bypassing transactions. To avoid
+ * this, try_capture_page_to_invalidate() call was added here.
+ *
+ * After many troubles with vmtruncate() based truncate (including
+ * races with flush, tail conversion, etc.) it was re-written in the
+ * top-to-bottom style: items are killed in reiser4_cut_tree_object()
+ * and pages belonging to extent are invalidated in kill_hook_extent().
+ * So probably now additional call to capture is not needed here.
+ */
+
+ assert("nikita-3137", PageLocked(page));
+ assert("nikita-3138", !PageWriteback(page));
+ inode = page->mapping->host;
+
+ /*
+ * ->invalidatepage() should only be called for the unformatted
+ * jnodes. Destruction of all other types of jnodes is performed
+ * separately. But, during some corner cases (like handling errors
+ * during mount) it is simpler to let ->invalidatepage to be called on
+ * them. Check for this, and do nothing.
+ */
+ if (reiser4_get_super_fake(inode->i_sb) == inode)
+ return;
+ if (reiser4_get_cc_fake(inode->i_sb) == inode)
+ return;
+ if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
+ return;
+ assert("vs-1426", PagePrivate(page));
+ assert("vs-1427",
+ page->mapping == jnode_get_mapping(jnode_by_page(page)));
+ assert("", jprivate(page) != NULL);
+ assert("", ergo(inode_file_plugin(inode) !=
+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
+ offset == 0));
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return;
+
+ node = jprivate(page);
+ spin_lock_jnode(node);
+ if (!(node->state & ((1 << JNODE_DIRTY) | (1 << JNODE_FLUSH_QUEUED) |
+ (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
+ /* there is no need to capture */
+ jref(node);
+ JF_SET(node, JNODE_HEARD_BANSHEE);
+ page_clear_jnode(page, node);
+ reiser4_uncapture_jnode(node);
+ unhash_unformatted_jnode(node);
+ jput(node);
+ reiser4_exit_context(ctx);
+ return;
+ }
+ spin_unlock_jnode(node);
+
+ /* capture page being truncated. */
+ ret = try_capture_page_to_invalidate(page);
+ if (ret != 0)
+ warning("nikita-3141", "Cannot capture: %i", ret);
+
+ if (offset == 0) {
+ /* remove jnode from transaction and detach it from page. */
+ jref(node);
+ JF_SET(node, JNODE_HEARD_BANSHEE);
+ /* page cannot be detached from jnode concurrently, because it
+ * is locked */
+ reiser4_uncapture_page(page);
+
+ /* this detaches page from jnode, so that jdelete will not try
+ * to lock page which is already locked */
+ spin_lock_jnode(node);
+ page_clear_jnode(page, node);
+ spin_unlock_jnode(node);
+ unhash_unformatted_jnode(node);
+
+ jput(node);
+ }
+
+ reiser4_exit_context(ctx);
+}
+
+/* helper function called from reiser4_releasepage(). It returns true if jnode
+ * can be detached from its page and the page released. */
+int jnode_is_releasable(jnode * node /* node to check */ )
+{
+ assert("nikita-2781", node != NULL);
+ assert_spin_locked(&(node->guard));
+ assert_spin_locked(&(node->load));
+
+ /* if some thread is currently using the jnode page, the latter
+ * cannot be detached */
+ if (atomic_read(&node->d_count) != 0) {
+ return 0;
+ }
+
+ assert("vs-1214", !jnode_is_loaded(node));
+
+ /*
+ * can only release page if real block number is assigned to it. Simple
+ * check for ->atom wouldn't do, because it is possible for node to be
+ * clean, not in atom yet, and still have a fake block number. For
+ * example, node just created in jinit_new().
+ */
+ if (reiser4_blocknr_is_fake(jnode_get_block(node)))
+ return 0;
+
+ /*
+ * pages prepared for write can not be released anyway, so avoid
+ * detaching jnode from the page
+ */
+ if (JF_ISSET(node, JNODE_WRITE_PREPARED))
+ return 0;
+
+ /*
+ * dirty jnode cannot be released. It can however be submitted to disk
+ * as part of early flushing, but only after getting flush-prepped.
+ */
+ if (JF_ISSET(node, JNODE_DIRTY))
+ return 0;
+
+ /* overwrite set is only written by log writer. */
+ if (JF_ISSET(node, JNODE_OVRWR))
+ return 0;
+
+ /* jnode is already under writeback */
+ if (JF_ISSET(node, JNODE_WRITEBACK))
+ return 0;
+
+ /* don't flush bitmaps or journal records */
+ if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * ->releasepage method for reiser4
+ *
+ * This is called by VM scanner when it comes across clean page. What we have
+ * to do here is to check whether page can really be released (freed that is)
+ * and if so, detach jnode from it and remove page from the page cache.
+ *
+ * Check for releasability is done by releasable() function.
+ */
+int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
+{
+ jnode *node;
+
+ assert("nikita-2257", PagePrivate(page));
+ assert("nikita-2259", PageLocked(page));
+ assert("nikita-2892", !PageWriteback(page));
+ assert("nikita-3019", reiser4_schedulable());
+
+ /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
+ is not clear what to do in this case. A lot of deadlocks seem
+ possible. */
+
+ node = jnode_by_page(page);
+ assert("nikita-2258", node != NULL);
+ assert("reiser4-4", page->mapping != NULL);
+ assert("reiser4-5", page->mapping->host != NULL);
+
+ if (PageDirty(page))
+ return 0;
+
+ /* extra page reference is used by reiser4 to protect
+ * jnode<->page link from this ->releasepage(). */
+ if (page_count(page) > 3)
+ return 0;
+
+ /* releasable() needs jnode lock, because it looks at the jnode fields
+ * and we need jload_lock here to avoid races with jload(). */
+ spin_lock_jnode(node);
+ spin_lock(&(node->load));
+ if (jnode_is_releasable(node)) {
+ struct address_space *mapping;
+
+ mapping = page->mapping;
+ jref(node);
+ /* there is no need to synchronize against
+ * jnode_extent_write() here, because pages seen by
+ * jnode_extent_write() are !releasable(). */
+ page_clear_jnode(page, node);
+ spin_unlock(&(node->load));
+ spin_unlock_jnode(node);
+
+ /* we are under memory pressure so release jnode also. */
+ jput(node);
+
+ return 1;
+ } else {
+ spin_unlock(&(node->load));
+ spin_unlock_jnode(node);
+ assert("nikita-3020", reiser4_schedulable());
+ return 0;
+ }
+}
+
+int reiser4_readpage(struct file *file, struct page *page)
+{
+ assert("edward-1533", PageLocked(page));
+ assert("edward-1534", !PageUptodate(page));
+ assert("edward-1535", page->mapping && page->mapping->host);
+
+ return inode_file_plugin(page->mapping->host)->readpage(file, page);
+}
+
+int reiser4_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return inode_file_plugin(mapping->host)->readpages(file, mapping,
+ pages, nr_pages);
+}
+
+int reiser4_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ return inode_file_plugin(mapping->host)->writepages(mapping, wbc);
+}
+
+int reiser4_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ return inode_file_plugin(file->f_dentry->d_inode)->prepare_write(file,
+ page,
+ from,
+ to);
+}
+
+int reiser4_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ return inode_file_plugin(file->f_dentry->d_inode)->commit_write(file,
+ page,
+ from,
+ to);
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
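
reiser4_set_page_dirty() above only tags pages with PACECACHE-style tag PAGECACHE_TAG_REISER4_MOVED; the jnode creation and atom capture happen later, when other parts of the patch (outside this excerpt) look the tagged pages up again. A sketch of how such a tagged lookup works with the 2.6-era pagecache API; the function name is illustrative, not from the patch:

/* illustrative sketch, not part of the patch */
static unsigned collect_moved_pages(struct address_space *mapping,
				    pgoff_t *index, struct page **pages,
				    unsigned nr)
{
	/* returns referenced pages carrying the tag, advancing *index
	 * past the last page found */
	return find_get_pages_tag(mapping, index,
				  PAGECACHE_TAG_REISER4_MOVED, nr, pages);
}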
diff -urN linux-2.6.23.orig/fs/reiser4/block_alloc.c linux-2.6.23/fs/reiser4/block_alloc.c
--- linux-2.6.23.orig/fs/reiser4/block_alloc.c 1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.23/fs/reiser4/block_alloc.c 2007-12-04 16:49:30.000000000 +0300
@@ -0,0 +1,1137 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "debug.h"
+#include "dformat.h"
+#include "plugin/plugin.h"
+#include "txnmgr.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "super.h"
+
+#include <linux/types.h> /* for __u?? */
+#include <linux/fs.h> /* for struct super_block */
+#include <linux/spinlock.h>
+
+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
+
+/* We need to be able to reserve enough disk space to ensure that an atomic
+ operation will have enough disk space to flush (see flush.c and
+ http://namesys.com/v4/v4.html) and commit it once it is started.
+
+ In our design a call for reserving disk space may fail but not an actual
+ block allocation.
+
+ All free blocks, already allocated blocks, and all kinds of reserved blocks
+ are counted in different per-fs block counters.
+
+ A reiser4 super block's set of block counters currently is:
+
+ free -- free blocks,
+ used -- already allocated blocks,
+
+ grabbed -- initially reserved for performing an fs operation, those blocks
+ are taken from free blocks, then grabbed disk space leaks from the grabbed
+ blocks counter to other counters like "fake allocated", "flush
+ reserved", "used"; the rest of the unused grabbed space is returned to
+ free space at the end of the fs operation;
+
+ fake allocated -- counts all nodes without real disk block numbers assigned,
+ we have separate accounting for formatted and unformatted
+ nodes (for easier debugging);
+
+ flush reserved -- disk space needed for flushing and committing an atom.
+ Each dirty already allocated block could be written as a
+ part of atom's overwrite set or as a part of atom's
+ relocate set. In both cases one additional block is needed;
+ it is used as a wandered block if we do overwrite or as a
+ new location for a relocated block.
+
+ In addition, blocks in some states are counted on per-thread and per-atom
+ basis. A reiser4 context has a counter of blocks grabbed by this transaction
+ and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
+ of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
+ blocks, which are reserved for flush processing and atom commit. */
+
+/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate
+ number of blocks to grab for most expensive case of balancing when the leaf
+ node we insert new item to gets split and new leaf node is allocated.
+
+ So, we need to grab blocks for
+
+ 1) one block for possible dirtying the node we insert an item to. That block
+ would be used for node relocation at flush time or for allocating of a
+ wandered one, it depends what will be a result (what set, relocate or
+ overwrite the node gets assigned to) of the node processing by the flush
+ algorithm.
+
+ 2) one block for either allocating a new node, or dirtying of right or left
+ clean neighbor, only one case may happen.
+
+ VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
+ node, and creation of new node. have I forgotten something? email me.
+
+ These grabbed blocks are counted in both reiser4 context "grabbed blocks"
+ counter and in the fs-wide one (both ctx->grabbed_blocks and
+ sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
+ decremented by 2.
+
+ Suppose both blocks were spent: one for dirtying an already allocated clean
+ node (one block went from "grabbed" to "flush reserved") and one for new
+ block allocation (one block went from "grabbed" to "fake allocated formatted").
+
+ Inserting of a child pointer into the parent node caused the parent node to
+ be split; the balancing code takes care of this by grabbing the necessary
+ space immediately, calling reiser4_grab with the BA_RESERVED flag set, which
+ means "can use the 5% reserved disk space".
+
+ At this moment insertion completes and grabbed blocks (if they were not used)
+ should be returned to the free space counter.
+
+ However the atom life-cycle is not completed. The atom had one "flush
+ reserved" block added by our insertion and the new fake allocated node is
+ counted as a "fake allocated formatted" one. The atom has to be fully
+ processed by flush before commit. Suppose that the flush moved the first,
+ already allocated node to the atom's overwrite list, the new fake allocated
+ node, obviously, went into the atom relocate set. The reiser4 flush
+ allocates the new node using one unit from "fake allocated formatted"
+ counter, the log writer uses one from "flush reserved" for wandered block
+ allocation.
+
+ And, it is not the end. When the wandered block is deallocated after the
+ atom gets fully played (see wander.c for term description), the disk space
+ occupied by it is returned to free blocks. */
+
+/* BLOCK NUMBERS */
+
+/* Any reiser4 node has a block number assigned to it. We use these numbers for
+ indexing in hash tables, so if a block has not yet been assigned a location
+ on disk we need to give it a temporary fake block number.
+
+ Current implementation of reiser4 uses 64-bit integers for block numbers. We
+ use highest bit in 64-bit block number to distinguish fake and real block
+ numbers. So, only 63 bits may be used for addressing real device
+ blocks. That "fake" block numbers space is divided into subspaces of fake
+ block numbers for data blocks and for shadow (working) bitmap blocks.
+
+ Fake block numbers for data blocks are generated by a cyclic counter, which
+ gets incremented after each real block allocation. We assume that it is
+ impossible to overload this counter during one transaction life. */
+
+/* Initialize a blocknr hint. */
+void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
+{
+ memset(hint, 0, sizeof(reiser4_blocknr_hint));
+}
+
+/* Release any resources of a blocknr hint. */
+void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
+{
+ /* No resources should be freed in current blocknr_hint implementation. */
+}
+
+/* see above for explanation of fake block number. */
+/* Audited by: green(2002.06.11) */
+int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
+{
+ /* The reason for not simply returning result of '&' operation is that
+ while return value is (possibly 32bit) int, the reiser4_block_nr is
+ at least 64 bits long, and high bit (which is the only possible
+ non zero bit after the masking) would be stripped off */
+ return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
+}
+
+/* Static functions for <reiser4 super block>/<reiser4 context> block counters
+ arithmetic. Mostly, they are isolated so as not to code the same assertions
+ in several places. */
+static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
+{
+ BUG_ON(ctx->grabbed_blocks < count);
+ assert("zam-527", ctx->grabbed_blocks >= count);
+ ctx->grabbed_blocks -= count;
+}
+
+static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
+{
+ ctx->grabbed_blocks += count;
+}
+
+static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
+{
+ assert("zam-525", sbinfo->blocks_grabbed >= count);
+ sbinfo->blocks_grabbed -= count;
+}
+
+/* Decrease the counter of block reserved for flush in super block. */
+static void
+sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
+{
+ assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
+ sbinfo->blocks_flush_reserved -= count;
+}
+
+static void
+sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
+ reiser4_ba_flags_t flags)
+{
+ if (flags & BA_FORMATTED) {
+ assert("zam-806", sbinfo->blocks_fake_allocated >= count);
+ sbinfo->blocks_fake_allocated -= count;
+ } else {
+ assert("zam-528",
+ sbinfo->blocks_fake_allocated_unformatted >= count);
+ sbinfo->blocks_fake_allocated_unformatted -= count;
+ }
+}
+
+static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
+{
+ assert("zam-530",
+ sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
+ sbinfo->blocks_used -= count;
+}
+
+static void
+sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
+{
+ assert("edward-501", sbinfo->blocks_clustered >= count);
+ sbinfo->blocks_clustered -= count;
+}
+
+/* Increase the counter of block reserved for flush in atom. */
+static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
+{
+ assert("zam-772", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+ atom->flush_reserved += count;
+}
+
+/* Decrease the counter of block reserved for flush in atom. */
+static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
+{
+ assert("zam-774", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+ assert("nikita-2790", atom->flush_reserved >= count);
+ atom->flush_reserved -= count;
+}
+
+/* super block has 6 counters: free, used, grabbed, fake allocated
+ (formatted and unformatted) and flush reserved. Their sum must equal
+ the number of blocks on the device. This function checks this. */
+int reiser4_check_block_counters(const struct super_block *super)
+{
+ __u64 sum;
+
+ sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
+ reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
+ reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
+ reiser4_clustered_blocks(super);
+ if (reiser4_block_count(super) != sum) {
+ printk("super block counters: "
+ "used %llu, free %llu, "
845+ "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), "
846+ "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
847+ (unsigned long long)reiser4_data_blocks(super),
848+ (unsigned long long)reiser4_free_blocks(super),
849+ (unsigned long long)reiser4_grabbed_blocks(super),
850+ (unsigned long long)reiser4_fake_allocated(super),
851+ (unsigned long long)
852+ reiser4_fake_allocated_unformatted(super),
853+ (unsigned long long)reiser4_flush_reserved(super),
854+ (unsigned long long)reiser4_clustered_blocks(super),
855+ (unsigned long long)sum,
856+ (unsigned long long)reiser4_block_count(super));
857+ return 0;
858+ }
859+ return 1;
860+}
861+
862+/* Adjust "working" free blocks counter for number of blocks we are going to
863+ allocate. Record number of grabbed blocks in fs-wide and per-thread
864+ counters. This function should be called before bitmap scanning or
865+ allocating fake block numbers
866+
867+ @super -- pointer to reiser4 super block;
868+ @count -- number of blocks we reserve;
869+
+ @return -- 0 if success, -ENOSPC if all
+ free blocks are preserved or already allocated.
+*/
+
+static int
+reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
+{
+ __u64 free_blocks;
+ int ret = 0, use_reserved = flags & BA_RESERVED;
+ reiser4_super_info_data *sbinfo;
+
+ assert("vs-1276", ctx == get_current_context());
+
+ /* Do not grab anything on ro-mounted fs. */
+ if (rofs_super(ctx->super)) {
+ ctx->grab_enabled = 0;
+ return 0;
+ }
+
+ sbinfo = get_super_private(ctx->super);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ free_blocks = sbinfo->blocks_free;
+
+ if ((use_reserved && free_blocks < count) ||
+ (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
+ ret = RETERR(-ENOSPC);
+ goto unlock_and_ret;
+ }
+
+ add_to_ctx_grabbed(ctx, count);
+
+ sbinfo->blocks_grabbed += count;
+ sbinfo->blocks_free -= count;
+
+#if REISER4_DEBUG
+ if (ctx->grabbed_initially == 0)
+ ctx->grabbed_initially = count;
+#endif
+
+ assert("nikita-2986", reiser4_check_block_counters(ctx->super));
+
+ /* disable grab space in current context */
+ ctx->grab_enabled = 0;
+
+ unlock_and_ret:
+ spin_unlock_reiser4_super(sbinfo);
+
+ return ret;
+}
+
+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
+{
+ int ret;
+ reiser4_context *ctx;
+
+ assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
+ lock_stack_isclean(get_current_lock_stack
+ ())));
+ ctx = get_current_context();
+ if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
+ return 0;
+ }
+
+ ret = reiser4_grab(ctx, count, flags);
+ if (ret == -ENOSPC) {
+
+ /* Try to commit all transactions if the BA_CAN_COMMIT flag is present */
939+ if (flags & BA_CAN_COMMIT) {
940+ txnmgr_force_commit_all(ctx->super, 0);
941+ ctx->grab_enabled = 1;
942+ ret = reiser4_grab(ctx, count, flags);
943+ }
944+ }
+ /*
+ * allocation from reserved pool cannot fail. This is a severe error.
+ */
948+ assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
949+ return ret;
950+}
951+
952+/*
953+ * SPACE RESERVED FOR UNLINK/TRUNCATE
954+ *
955+ * Unlink and truncate require space in transaction (to update stat data, at
956+ * least). But we don't want rm(1) to fail with "No space on device" error.
957+ *
958+ * Solution is to reserve 5% of disk space for truncates and
959+ * unlinks. Specifically, normal space grabbing requests don't grab space from
960+ * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
961+ * drain it. Per super block delete mutex is used to allow only one
962+ * thread at a time to grab from reserved area.
963+ *
964+ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
965+ * flag.
966+ *
967+ */
968+
969+int reiser4_grab_reserved(struct super_block *super,
970+ __u64 count, reiser4_ba_flags_t flags)
971+{
972+ reiser4_super_info_data *sbinfo = get_super_private(super);
973+
974+ assert("nikita-3175", flags & BA_CAN_COMMIT);
975+
+ /* Check whether the delete mutex is already taken by us; we assume
+ * that reading of a machine word is atomic. */
+ if (sbinfo->delete_mutex_owner == current) {
+ if (reiser4_grab_space
+ (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
+ warning("zam-1003",
+ "nested call of grab_reserved fails count=(%llu)",
+ (unsigned long long)count);
+ reiser4_release_reserved(super);
+ return RETERR(-ENOSPC);
+ }
+ return 0;
+ }
+
+ if (reiser4_grab_space(count, flags)) {
+ mutex_lock(&sbinfo->delete_mutex);
+ assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
+ sbinfo->delete_mutex_owner = current;
+
+ if (reiser4_grab_space(count, flags | BA_RESERVED)) {
+ warning("zam-833",
+ "reserved space is not enough (%llu)",
+ (unsigned long long)count);
+ reiser4_release_reserved(super);
+ return RETERR(-ENOSPC);
+ }
+ }
+ return 0;
+}
+
+void reiser4_release_reserved(struct super_block *super)
+{
+ reiser4_super_info_data *info;
+
+ info = get_super_private(super);
+ if (info->delete_mutex_owner == current) {
+ info->delete_mutex_owner = NULL;
+ mutex_unlock(&info->delete_mutex);
+ }
+}
+
+static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+ sub_from_ctx_grabbed(ctx, count);
+
+ sbinfo = get_super_private(ctx->super);
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_sb_grabbed(sbinfo, count);
+ /* return sbinfo locked */
+ return sbinfo;
+}
+
+/* is called after @count fake block numbers are allocated and pointers to
+ those blocks are inserted into the tree. */
+static void grabbed2fake_allocated_formatted(void)
+{
+ reiser4_super_info_data *sbinfo;
+
+ sbinfo = grabbed2fake_allocated_head(1);
+ sbinfo->blocks_fake_allocated++;
+
+ assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/**
+ * grabbed2fake_allocated_unformatted
+ * @count:
+ *
+ */
+static void grabbed2fake_allocated_unformatted(int count)
+{
+ reiser4_super_info_data *sbinfo;
+
+ sbinfo = grabbed2fake_allocated_head(count);
+ sbinfo->blocks_fake_allocated_unformatted += count;
+
+ assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2cluster_reserved(int count)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+ sub_from_ctx_grabbed(ctx, count);
+
+ sbinfo = get_super_private(ctx->super);
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_sb_grabbed(sbinfo, count);
+ sbinfo->blocks_clustered += count;
+
+ assert("edward-504", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+void cluster_reserved2grabbed(int count)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+
+ sbinfo = get_super_private(ctx->super);
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_cluster_reserved(sbinfo, count);
+ sbinfo->blocks_grabbed += count;
+
+ assert("edward-505", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+ add_to_ctx_grabbed(ctx, count);
+}
+
+void cluster_reserved2free(int count)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ cluster_reserved2grabbed(count);
+ grabbed2free(ctx, sbinfo, count);
+}
+
+static DEFINE_SPINLOCK(fake_lock);
+static reiser4_block_nr fake_gen = 0;
+
+/**
+ * assign_fake_blocknr
+ * @blocknr:
+ * @count:
+ *
+ * Obtain a fake block number for new node which will be used to refer to
+ * this newly allocated node until real allocation is done.
+ */
+static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
+{
+ spin_lock(&fake_lock);
+ *blocknr = fake_gen;
+ fake_gen += count;
+ spin_unlock(&fake_lock);
+
+ BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
+ /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
+ *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
+ assert("zam-394", zlook(current_tree, blocknr) == NULL);
+}
+
+int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
+{
+ assign_fake_blocknr(blocknr, 1);
+ grabbed2fake_allocated_formatted();
+ return 0;
+}
+
+/**
+ * fake_blocknr_unformatted
+ * @count: number of fake numbers to get
+ *
+ * Allocates @count fake block numbers which will be assigned to jnodes
+ */
+reiser4_block_nr fake_blocknr_unformatted(int count)
+{
+ reiser4_block_nr blocknr;
+
+ assign_fake_blocknr(&blocknr, count);
+ grabbed2fake_allocated_unformatted(count);
+
+ return blocknr;
+}
+
+/* adjust sb block counters, if real (on-disk) block allocation immediately
+ follows grabbing of free disk space. */
+static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
+ __u64 count)
+{
+ sub_from_ctx_grabbed(ctx, count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_sb_grabbed(sbinfo, count);
+ sbinfo->blocks_used += count;
+
+ assert("nikita-2679", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/* adjust sb block counters when @count unallocated blocks get mapped to disk */
+static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
+ reiser4_ba_flags_t flags)
+{
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_sb_fake_allocated(sbinfo, count, flags);
+ sbinfo->blocks_used += count;
+
+ assert("nikita-2680",
+ reiser4_check_block_counters(reiser4_get_current_sb()));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+static void flush_reserved2used(txn_atom * atom, __u64 count)
+{
+ reiser4_super_info_data *sbinfo;
+
+ assert("zam-787", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+
+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
+
+ sbinfo = get_current_super_private();
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_sb_flush_reserved(sbinfo, count);
+ sbinfo->blocks_used += count;
+
+ assert("zam-789",
+ reiser4_check_block_counters(reiser4_get_current_sb()));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/* update the per fs blocknr hint default value. */
+void
+update_blocknr_hint_default(const struct super_block *s,
+ const reiser4_block_nr * block)
+{
+ reiser4_super_info_data *sbinfo = get_super_private(s);
+
+ assert("nikita-3342", !reiser4_blocknr_is_fake(block));
+
+ spin_lock_reiser4_super(sbinfo);
+ if (*block < sbinfo->block_count) {
+ sbinfo->blocknr_hint_default = *block;
+ } else {
+ warning("zam-676",
+ "block number %llu is too large to be used in a blocknr hint\n",
+ (unsigned long long)*block);
+ dump_stack();
+ DEBUGON(1);
+ }
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/* get current value of the default blocknr hint. */
+void get_blocknr_hint_default(reiser4_block_nr * result)
+{
+ reiser4_super_info_data *sbinfo = get_current_super_private();
+
+ spin_lock_reiser4_super(sbinfo);
+ *result = sbinfo->blocknr_hint_default;
+ assert("zam-677", *result < sbinfo->block_count);
+ spin_unlock_reiser4_super(sbinfo);
+}
+
1247+/* Allocate "real" disk blocks by calling a proper space allocation plugin
1248+ * method. Blocks are allocated in one contiguous disk region. The plugin
1249+ * independent part accounts blocks by subtracting allocated amount from grabbed
1250+ * or fake block counter and add the same amount to the counter of allocated
1251+ * blocks.
+ *
+ * @hint -- a reiser4 blocknr hint object which contains further block
+ * allocation hints and parameters (search start, a stage of block
+ * which will be mapped to disk, etc.),
+ * @blk -- an out parameter for the beginning of the allocated region,
+ * @len -- in/out parameter, it should contain the maximum number of allocated
+ * blocks, after block allocation completes, it contains the length of
+ * allocated disk region.
+ * @flags -- see reiser4_ba_flags_t description.
+ *
+ * @return -- 0 if success, error code otherwise.
+ */
+int
+reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
+ reiser4_block_nr * len, reiser4_ba_flags_t flags)
+{
+ __u64 needed = *len;
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+ int ret;
+
+ assert("zam-986", hint != NULL);
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ /* For write-optimized data we use default search start value, which is
+ * close to last write location. */
+ if (flags & BA_USE_DEFAULT_SEARCH_START) {
+ get_blocknr_hint_default(&hint->blk);
+ }
+
+ /* VITALY: allocator should grab this for internal/tx-lists/similar only. */
+/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
+ if (hint->block_stage == BLOCK_NOT_COUNTED) {
+ ret = reiser4_grab_space_force(*len, flags);
+ if (ret != 0)
+ return ret;
+ }
+
+ ret =
+ sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
+ hint, (int)needed, blk, len);
+
+ if (!ret) {
+ assert("zam-680", *blk < reiser4_block_count(ctx->super));
+ assert("zam-681",
+ *blk + *len <= reiser4_block_count(ctx->super));
+
+ if (flags & BA_PERMANENT) {
+ /* we assume that current atom exists at this moment */
+ txn_atom *atom = get_current_atom_locked();
+ atom->nr_blocks_allocated += *len;
+ spin_unlock_atom(atom);
+ }
+
+ switch (hint->block_stage) {
+ case BLOCK_NOT_COUNTED:
+ case BLOCK_GRABBED:
+ grabbed2used(ctx, sbinfo, *len);
+ break;
+ case BLOCK_UNALLOCATED:
+ fake_allocated2used(sbinfo, *len, flags);
+ break;
+ case BLOCK_FLUSH_RESERVED:
+ {
+ txn_atom *atom = get_current_atom_locked();
+ flush_reserved2used(atom, *len);
+ spin_unlock_atom(atom);
+ }
+ break;
+ default:
+ impossible("zam-531", "wrong block stage");
+ }
+ } else {
+ assert("zam-821",
+ ergo(hint->max_dist == 0
+ && !hint->backward, ret != -ENOSPC));
+ if (hint->block_stage == BLOCK_NOT_COUNTED)
+ grabbed2free(ctx, sbinfo, needed);
+ }
+
+ return ret;
+}
+
+/* used -> fake_allocated -> grabbed -> free */
+
+/* adjust sb block counters when @count unallocated blocks get unmapped from
+ disk */
+static void
+used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
+ int formatted)
+{
+ spin_lock_reiser4_super(sbinfo);
+
+ if (formatted)
+ sbinfo->blocks_fake_allocated += count;
+ else
+ sbinfo->blocks_fake_allocated_unformatted += count;
+
+ sub_from_sb_used(sbinfo, count);
+
+ assert("nikita-2681",
+ reiser4_check_block_counters(reiser4_get_current_sb()));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+static void
+used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
+ __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
+{
+ assert("nikita-2791", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+
+ add_to_atom_flush_reserved_nolock(atom, (__u32) count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ sbinfo->blocks_flush_reserved += count;
+ /*add_to_sb_flush_reserved(sbinfo, count); */
+ sub_from_sb_used(sbinfo, count);
+
+ assert("nikita-2681",
+ reiser4_check_block_counters(reiser4_get_current_sb()));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/* disk space virtually used by fake block numbers is counted as "grabbed" again. */
+static void
+fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
+ __u64 count, reiser4_ba_flags_t flags)
+{
+ add_to_ctx_grabbed(ctx, count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ assert("nikita-2682", reiser4_check_block_counters(ctx->super));
+
+ sbinfo->blocks_grabbed += count;
+ sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
+
+ assert("nikita-2683", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ fake_allocated2grabbed(ctx, sbinfo, count, flags);
+ grabbed2free(ctx, sbinfo, count);
+}
+
+void grabbed2free_mark(__u64 mark)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ assert("nikita-3007", (__s64) mark >= 0);
+ assert("nikita-3006", ctx->grabbed_blocks >= mark);
+ grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
+}
+
+/**
+ * grabbed2free - adjust grabbed and free block counters
+ * @ctx: context to update grabbed block counter of
+ * @sbinfo: super block to update grabbed and free block counters of
+ * @count: number of blocks to adjust counters by
+ *
+ * Decreases context's and per filesystem's counters of grabbed
+ * blocks. Increases per filesystem's counter of free blocks.
+ */
+void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
+ __u64 count)
+{
+ sub_from_ctx_grabbed(ctx, count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_sb_grabbed(sbinfo, count);
+ sbinfo->blocks_free += count;
+ assert("nikita-2684", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ assert("vs-1095", atom);
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ sub_from_ctx_grabbed(ctx, count);
+
+ add_to_atom_flush_reserved_nolock(atom, count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ sbinfo->blocks_flush_reserved += count;
+ sub_from_sb_grabbed(sbinfo, count);
+
+ assert("vpf-292", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2flush_reserved(__u64 count)
+{
+ txn_atom *atom = get_current_atom_locked();
+
+ grabbed2flush_reserved_nolock(atom, count);
+
+ spin_unlock_atom(atom);
+}
+
+void flush_reserved2grabbed(txn_atom * atom, __u64 count)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ assert("nikita-2788", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ add_to_ctx_grabbed(ctx, count);
+
+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ sbinfo->blocks_grabbed += count;
+ sub_from_sb_flush_reserved(sbinfo, count);
+
+ assert("vpf-292", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/**
+ * all_grabbed2free - releases all blocks grabbed in context
+ *
+ * Decreases context's and super block's grabbed block counters by number of
+ * blocks grabbed by current context and increases super block's free block
+ * counter correspondingly.
+ */
+void all_grabbed2free(void)
+{
+ reiser4_context *ctx = get_current_context();
+
+ grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
+}
+
+/* adjust sb block counters if real (on-disk) blocks do not become unallocated
+ after freeing; instead, @count blocks become "grabbed". */
+static void
+used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
+ __u64 count)
+{
+ add_to_ctx_grabbed(ctx, count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ sbinfo->blocks_grabbed += count;
+ sub_from_sb_used(sbinfo, count);
+
+ assert("nikita-2685", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/* this used to be done through used2grabbed and grabbed2free */
+static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
+{
+ spin_lock_reiser4_super(sbinfo);
+
+ sbinfo->blocks_free += count;
+ sub_from_sb_used(sbinfo, count);
+
+ assert("nikita-2685",
+ reiser4_check_block_counters(reiser4_get_current_sb()));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+#if REISER4_DEBUG
+
+/* check "allocated" state of given block range */
+static void
+reiser4_check_blocks(const reiser4_block_nr * start,
+ const reiser4_block_nr * len, int desired)
+{
+ sa_check_blocks(start, len, desired);
+}
+
+/* check "allocated" state of given block */
+void reiser4_check_block(const reiser4_block_nr * block, int desired)
+{
+ const reiser4_block_nr one = 1;
+
+ reiser4_check_blocks(block, &one, desired);
+}
+
+#endif
+
+/* The block deallocation function may do an actual deallocation through the
+ space plugin or store deleted block numbers in the atom's delete_set data
+ structure, depending on the @defer parameter. */
+
+/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
+ will be deleted from WORKING bitmap. They might be just unmapped from disk, or
+ freed but disk space is still grabbed by current thread, or these blocks must
+ not be counted in any reiser4 sb block counters, see block_stage_t comment */
+
+/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
+ distinguish blocks allocated for unformatted and formatted nodes */
+
+int
+reiser4_dealloc_blocks(const reiser4_block_nr * start,
+ const reiser4_block_nr * len,
+ block_stage_t target_stage, reiser4_ba_flags_t flags)
+{
+ txn_atom *atom = NULL;
+ int ret;
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ if (REISER4_DEBUG) {
+ assert("zam-431", *len != 0);
+ assert("zam-432", *start != 0);
+ assert("zam-558", !reiser4_blocknr_is_fake(start));
+
+ spin_lock_reiser4_super(sbinfo);
+ assert("zam-562", *start < sbinfo->block_count);
+ spin_unlock_reiser4_super(sbinfo);
+ }
+
+ if (flags & BA_DEFER) {
+ blocknr_set_entry *bsep = NULL;
+
+ /* storing deleted block numbers in a blocknr set
+ datastructure for further actual deletion */
+ do {
+ atom = get_current_atom_locked();
+ assert("zam-430", atom != NULL);
+
+ ret =
+ blocknr_set_add_extent(atom, &atom->delete_set,
+ &bsep, start, len);
+
+ if (ret == -ENOMEM)
+ return ret;
+
+ /* This loop might spin at most two times */
+ } while (ret == -E_REPEAT);
+
+ assert("zam-477", ret == 0);
+ assert("zam-433", atom != NULL);
+
+ spin_unlock_atom(atom);
+
+ } else {
+ assert("zam-425", get_current_super_private() != NULL);
+ sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
+ *start, *len);
+
+ if (flags & BA_PERMANENT) {
+ /* These blocks were counted as allocated, we have to revert it
+ * back if allocation is discarded. */
+ txn_atom *atom = get_current_atom_locked();
+ atom->nr_blocks_allocated -= *len;
+ spin_unlock_atom(atom);
+ }
+
+ switch (target_stage) {
+ case BLOCK_NOT_COUNTED:
+ assert("vs-960", flags & BA_FORMATTED);
+ /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
+ used2free(sbinfo, *len);
+ break;
+
+ case BLOCK_GRABBED:
+ used2grabbed(ctx, sbinfo, *len);
+ break;
+
+ case BLOCK_UNALLOCATED:
+ used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
+ break;
+
+ case BLOCK_FLUSH_RESERVED:{
+ txn_atom *atom;
+
+ atom = get_current_atom_locked();
+ used2flush_reserved(sbinfo, atom, *len,
+ flags & BA_FORMATTED);
+ spin_unlock_atom(atom);
+ break;
+ }
+ default:
+ impossible("zam-532", "wrong block stage");
+ }
+ }
+
+ return 0;
+}
+
+/* wrappers for block allocator plugin methods */
+int reiser4_pre_commit_hook(void)
+{
+ assert("zam-502", get_current_super_private() != NULL);
+ sa_pre_commit_hook();
+ return 0;
+}
+
+/* an actor which applies delete set to block allocator data */
+static int
+apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
+ const reiser4_block_nr * b, void *data UNUSED_ARG)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ __u64 len = 1;
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
+ assert("zam-552", sbinfo != NULL);
+
+ if (b != NULL)
+ len = *b;
+
+ if (REISER4_DEBUG) {
+ spin_lock_reiser4_super(sbinfo);
+
+ assert("zam-554", *a < reiser4_block_count(ctx->super));
+ assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+ }
+
+ sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
+ /* adjust sb block counters */
+ used2free(sbinfo, len);
+ return 0;
+}
+
+void reiser4_post_commit_hook(void)
+{
+ txn_atom *atom;
+
+ atom = get_current_atom_locked();
+ assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
+ spin_unlock_atom(atom);
+
+ /* do the block deallocation which was deferred
+ until commit is done */
+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
+
+ assert("zam-504", get_current_super_private() != NULL);
+ sa_post_commit_hook();
+}
+
+void reiser4_post_write_back_hook(void)
+{
+ assert("zam-504", get_current_super_private() != NULL);
+
+ sa_post_commit_hook();
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
1751diff -urN linux-2.6.23.orig/fs/reiser4/block_alloc.h linux-2.6.23/fs/reiser4/block_alloc.h
1752--- linux-2.6.23.orig/fs/reiser4/block_alloc.h 1970-01-01 03:00:00.000000000 +0300
1753+++ linux-2.6.23/fs/reiser4/block_alloc.h 2007-12-04 16:49:30.000000000 +0300
1754@@ -0,0 +1,175 @@
1755+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1756+
1757+#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
1758+#define __FS_REISER4_BLOCK_ALLOC_H__
1759+
1760+#include "dformat.h"
1761+#include "forward.h"
1762+
1763+#include <linux/types.h> /* for __u?? */
1764+#include <linux/fs.h>
1765+
1766+/* Mask which, when applied to a given block number, tells whether that block number is a fake one */
1767+#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
1768+/* Mask which isolates the type of object this fake block number was assigned to */
1769+#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
1770+
1771+/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
1772+ against these two values to determine whether the object is unallocated or a
1773+ bitmap shadow object (WORKING BITMAP block, see plugin/space/bitmap.c) */
1774+#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
1775+#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
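/* Editorial example, not part of the original patch: classifying a block
 * number with the masks above. Any number with the top bit set is fake;
 * the two status values distinguish unallocated objects from working-bitmap
 * shadow blocks. The helper names are hypothetical.
 *
 *	static inline int blknr_is_fake_ex(reiser4_block_nr blk)
 *	{
 *		return (blk & REISER4_FAKE_BLOCKNR_BIT_MASK) != 0;
 *	}
 *
 *	static inline int blknr_is_unallocated_ex(reiser4_block_nr blk)
 *	{
 *		return (blk & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
 *			REISER4_UNALLOCATED_STATUS_VALUE;
 *	}
 */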
1776+
1777+/* specification how block allocation was counted in sb block counters */
1778+typedef enum {
1779+ BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
1780+ BLOCK_GRABBED = 1, /* free space grabbed for further allocation
1781+ of this block */
1782+ BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
1783+ BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object
1784+ ( unallocated formatted or unformatted
1785+ node) */
1786+ BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
1787+ number assigned */
1788+} block_stage_t;
1789+
1790+/* a hint for block allocator */
1791+struct reiser4_blocknr_hint {
1792+ /* FIXME: I think we want to add a longterm lock on the bitmap block here. This
1793+ is to prevent jnode_flush() calls from interleaving allocations on the same
1794+ bitmap, once a hint is established. */
1795+
1796+ /* search start hint */
1797+ reiser4_block_nr blk;
1798+ /* if not zero, it is a region size we search for free blocks in */
1799+ reiser4_block_nr max_dist;
1800+ /* level for allocation; it may be useful to have branch-level and higher
1801+ write-optimized. */
1802+ tree_level level;
1803+ /* block allocator assumes that blocks, which will be mapped to disk,
1804+ are in this specified block_stage */
1805+ block_stage_t block_stage;
1806+ /* If direction = 1 allocate blocks in backward direction from the end
1807+ * of disk to the beginning of disk. */
1808+ unsigned int backward:1;
1809+
1810+};
1811+
1812+/* These flags control block allocation/deallocation behavior */
1813+enum reiser4_ba_flags {
1814+ /* do allocations from reserved (5%) area */
1815+ BA_RESERVED = (1 << 0),
1816+
1817+ /* block allocator can do commit trying to recover free space */
1818+ BA_CAN_COMMIT = (1 << 1),
1819+
1820+ /* if operation will be applied to formatted block */
1821+ BA_FORMATTED = (1 << 2),
1822+
1823+ /* defer actual block freeing until transaction commit */
1824+ BA_DEFER = (1 << 3),
1825+
1826+ /* allocate blocks for permanent fs objects (formatted or unformatted), not
1827+ wandered or log blocks */
1828+ BA_PERMANENT = (1 << 4),
1829+
1830+ /* grab space even if it was disabled */
1831+ BA_FORCE = (1 << 5),
1832+
1833+ /* use default start value for free blocks search. */
1834+ BA_USE_DEFAULT_SEARCH_START = (1 << 6)
1835+};
1836+
1837+typedef enum reiser4_ba_flags reiser4_ba_flags_t;
1838+
1839+extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
1840+extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
1841+extern void update_blocknr_hint_default(const struct super_block *,
1842+ const reiser4_block_nr *);
1843+extern void get_blocknr_hint_default(reiser4_block_nr *);
1844+
1845+extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
1846+
1847+int assign_fake_blocknr_formatted(reiser4_block_nr *);
1848+reiser4_block_nr fake_blocknr_unformatted(int);
1849+
1850+/* free -> grabbed -> fake_allocated -> used */
1851+
1852+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
1853+void all_grabbed2free(void);
1854+void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
1855+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
1856+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
1857+void grabbed2flush_reserved(__u64 count);
1858+int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
1859+ reiser4_block_nr * start,
1860+ reiser4_block_nr * len, reiser4_ba_flags_t flags);
1861+int reiser4_dealloc_blocks(const reiser4_block_nr *,
1862+ const reiser4_block_nr *,
1863+ block_stage_t, reiser4_ba_flags_t flags);
1864+
1865+static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
1866+ reiser4_block_nr * start,
1867+ reiser4_ba_flags_t flags)
1868+{
1869+ reiser4_block_nr one = 1;
1870+ return reiser4_alloc_blocks(hint, start, &one, flags);
1871+}
1872+
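/* Editorial sketch, not part of the original patch: the usual hint life
 * cycle around reiser4_alloc_blocks(), assuming the caller has already
 * grabbed the space (so the blocks are counted in the BLOCK_GRABBED
 * stage). Variable names are hypothetical.
 *
 *	reiser4_blocknr_hint hint;
 *	reiser4_block_nr start, len = 1;
 *	int ret;
 *
 *	reiser4_blocknr_hint_init(&hint);
 *	hint.block_stage = BLOCK_GRABBED;
 *	ret = reiser4_alloc_blocks(&hint, &start, &len, 0);
 *	reiser4_blocknr_hint_done(&hint);
 */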
1873+static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
1874+ block_stage_t stage,
1875+ reiser4_ba_flags_t flags)
1876+{
1877+ const reiser4_block_nr one = 1;
1878+ return reiser4_dealloc_blocks(block, &one, stage, flags);
1879+}
1880+
1881+#define reiser4_grab_space_force(count, flags) \
1882+ reiser4_grab_space(count, flags | BA_FORCE)
1883+
1884+extern void grabbed2free_mark(__u64 mark);
1885+extern int reiser4_grab_reserved(struct super_block *,
1886+ __u64, reiser4_ba_flags_t);
1887+extern void reiser4_release_reserved(struct super_block *super);
1888+
1889+/* grabbed -> fake_allocated */
1890+
1891+/* fake_allocated -> used */
1892+
1893+/* used -> fake_allocated -> grabbed -> free */
1894+
1895+extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
1896+
1897+extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
1898+
1899+extern void grabbed2cluster_reserved(int count);
1900+extern void cluster_reserved2grabbed(int count);
1901+extern void cluster_reserved2free(int count);
1902+
1903+extern int reiser4_check_block_counters(const struct super_block *);
1904+
1905+#if REISER4_DEBUG
1906+
1907+extern void reiser4_check_block(const reiser4_block_nr *, int);
1908+
1909+#else
1910+
1911+# define reiser4_check_block(beg, val) noop
1912+
1913+#endif
1914+
1915+extern int reiser4_pre_commit_hook(void);
1916+extern void reiser4_post_commit_hook(void);
1917+extern void reiser4_post_write_back_hook(void);
1918+
1919+#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
1920+
1921+/* Make Linus happy.
1922+ Local variables:
1923+ c-indentation-style: "K&R"
1924+ mode-name: "LC"
1925+ c-basic-offset: 8
1926+ tab-width: 8
1927+ fill-column: 120
1928+ End:
1929+*/
1930diff -urN linux-2.6.23.orig/fs/reiser4/blocknrset.c linux-2.6.23/fs/reiser4/blocknrset.c
1931--- linux-2.6.23.orig/fs/reiser4/blocknrset.c 1970-01-01 03:00:00.000000000 +0300
1932+++ linux-2.6.23/fs/reiser4/blocknrset.c 2007-12-04 16:49:30.000000000 +0300
1933@@ -0,0 +1,368 @@
1934+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1935+
1936+/* This file contains code for various block number sets used by the atom to
1937+ track the deleted set and wandered block mappings. */
1938+
1939+#include "debug.h"
1940+#include "dformat.h"
1941+#include "txnmgr.h"
1942+#include "context.h"
1943+
1944+#include <linux/slab.h>
1945+
1946+/* The proposed data structure for storing unordered block number sets is a
1947+ list of elements, each of which contains an array of block numbers and/or an
1948+ array of block number pairs. Such an element, called blocknr_set_entry, stores
1949+ single block numbers from the beginning and pairs (extents) from the end of
1950+ its data field (char data[...]). The ->nr_singles and ->nr_pairs fields
1951+ count the numbers of blocks and extents stored.
1952+
1953+ +------------------- blocknr_set_entry->data ------------------+
1954+ |block1|block2| ...     <free space>     ... |pair3|pair2|pair1|
1955+ +--------------------------------------------------------------+
1956+
1957+ When current blocknr_set_entry is full, allocate a new one. */
1958+
1959+/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
1960+ * set (single blocks and block extents); in that case a blocknr pair represents an
1961+ * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
1962+ * there represent a (real block) -> (wandered block) mapping. */
1963+
1964+/* Protection: blocknr sets belong to reiser4 atom, and
1965+ * their modifications are performed with the atom lock held */
1966+
1967+/* The total size of a blocknr_set_entry. */
1968+#define BLOCKNR_SET_ENTRY_SIZE 128
1969+
1970+/* The number of blocks that can fit in the blocknr data area. */
1971+#define BLOCKNR_SET_ENTRIES_NUMBER \
1972+ ((BLOCKNR_SET_ENTRY_SIZE - \
1973+ 2 * sizeof (unsigned) - \
1974+ sizeof(struct list_head)) / \
1975+ sizeof(reiser4_block_nr))
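/* Editorial note, not part of the original patch: on a typical 64-bit
 * build (4-byte unsigned, 16-byte struct list_head, 8-byte
 * reiser4_block_nr) this works out to (128 - 2*4 - 16) / 8 = 13 slots,
 * shared between singles growing from the front of the entries[] array
 * and pairs growing from its back. */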
1976+
1977+/* An entry of the blocknr_set */
1978+struct blocknr_set_entry {
1979+ unsigned nr_singles;
1980+ unsigned nr_pairs;
1981+ struct list_head link;
1982+ reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
1983+};
1984+
1985+/* A pair of blocks as recorded in the blocknr_set_entry data. */
1986+struct blocknr_pair {
1987+ reiser4_block_nr a;
1988+ reiser4_block_nr b;
1989+};
1990+
1991+/* Return the number of blocknr slots available in a blocknr_set_entry. */
1992+/* Audited by: green(2002.06.11) */
1993+static unsigned bse_avail(blocknr_set_entry * bse)
1994+{
1995+ unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
1996+
1997+ assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
1998+ cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
1999+
2000+ return BLOCKNR_SET_ENTRIES_NUMBER - used;
2001+}
2002+
2003+/* Initialize a blocknr_set_entry. */
2004+static void bse_init(blocknr_set_entry *bse)
2005+{
2006+ bse->nr_singles = 0;
2007+ bse->nr_pairs = 0;
2008+ INIT_LIST_HEAD(&bse->link);
2009+}
2010+
2011+/* Allocate and initialize a blocknr_set_entry. */
2012+/* Audited by: green(2002.06.11) */
2013+static blocknr_set_entry *bse_alloc(void)
2014+{
2015+ blocknr_set_entry *e;
2016+
2017+ if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
2018+ reiser4_ctx_gfp_mask_get())) == NULL)
2019+ return NULL;
2020+
2021+ bse_init(e);
2022+
2023+ return e;
2024+}
2025+
2026+/* Free a blocknr_set_entry. */
2027+/* Audited by: green(2002.06.11) */
2028+static void bse_free(blocknr_set_entry * bse)
2029+{
2030+ kfree(bse);
2031+}
2032+
2033+/* Add a block number to a blocknr_set_entry */
2034+/* Audited by: green(2002.06.11) */
2035+static void
2036+bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
2037+{
2038+ assert("jmacd-5099", bse_avail(bse) >= 1);
2039+
2040+ bse->entries[bse->nr_singles++] = *block;
2041+}
2042+
2043+/* Get a pair of block numbers */
2044+/* Audited by: green(2002.06.11) */
2045+static inline struct blocknr_pair *bse_get_pair(blocknr_set_entry * bse,
2046+ unsigned pno)
2047+{
2048+ assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2049+
2050+ return (struct blocknr_pair *) (bse->entries +
2051+ BLOCKNR_SET_ENTRIES_NUMBER -
2052+ 2 * (pno + 1));
2053+}
2054+
2055+/* Add a pair of block numbers to a blocknr_set_entry */
2056+/* Audited by: green(2002.06.11) */
2057+static void
2058+bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2059+ const reiser4_block_nr * b)
2060+{
2061+ struct blocknr_pair *pair;
2062+
2063+ assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2064+
2065+ pair = bse_get_pair(bse, bse->nr_pairs++);
2066+
2067+ pair->a = *a;
2068+ pair->b = *b;
2069+}
2070+
2071+/* Add either a block or pair of blocks to the block number set. The first
2072+ blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
2073+ @b is non-NULL a pair is added. The block number set belongs to atom, and
2074+ the call is made with the atom lock held. There may not be enough space in
2075+ the current blocknr_set_entry. If new_bsep points to a non-NULL
2076+ blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2077+ will be set to NULL. If new_bsep contains NULL then the atom lock will be
2078+ released and a new bse will be allocated in new_bsep. E_REPEAT will be
2079+ returned with the atom unlocked for the operation to be tried again. If
2080+ the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
2081+ used during the call, it will be freed automatically. */
2082+static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
2083+ blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2084+ const reiser4_block_nr *b)
2085+{
2086+ blocknr_set_entry *bse;
2087+ unsigned entries_needed;
2088+
2089+ assert("jmacd-5101", a != NULL);
2090+
2091+ entries_needed = (b == NULL) ? 1 : 2;
2092+ if (list_empty(bset) ||
2093+ bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
2094+ /* See if a bse was previously allocated. */
2095+ if (*new_bsep == NULL) {
2096+ spin_unlock_atom(atom);
2097+ *new_bsep = bse_alloc();
2098+ return (*new_bsep != NULL) ? -E_REPEAT :
2099+ RETERR(-ENOMEM);
2100+ }
2101+
2102+ /* Put it on the head of the list. */
2103+ list_add(&((*new_bsep)->link), bset);
2104+
2105+ *new_bsep = NULL;
2106+ }
2107+
2108+ /* Add the single or pair. */
2109+ bse = list_entry(bset->next, blocknr_set_entry, link);
2110+ if (b == NULL) {
2111+ bse_put_single(bse, a);
2112+ } else {
2113+ bse_put_pair(bse, a, b);
2114+ }
2115+
2116+ /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2117+ if (*new_bsep != NULL) {
2118+ bse_free(*new_bsep);
2119+ *new_bsep = NULL;
2120+ }
2121+
2122+ return 0;
2123+}
2124+
2125+/* Add an extent to the block set. If the length is 1, it is treated as a
2126+ single block (e.g., reiser4_set_add_block). */
2127+/* Audited by: green(2002.06.11) */
2128+/* Auditor note: Entire call chain cannot hold any spinlocks, because
2129+ kmalloc might schedule. The only exception is atom spinlock, which is
2130+ properly freed. */
2131+int
2132+blocknr_set_add_extent(txn_atom * atom,
2133+ struct list_head * bset,
2134+ blocknr_set_entry ** new_bsep,
2135+ const reiser4_block_nr * start,
2136+ const reiser4_block_nr * len)
2137+{
2138+ assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2139+ return blocknr_set_add(atom, bset, new_bsep, start,
2140+ *len == 1 ? NULL : len);
2141+}
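/* Editorial sketch, not part of the original patch: the -E_REPEAT retry
 * protocol described above, mirroring the do/while loop in
 * reiser4_dealloc_blocks(). On -E_REPEAT the atom lock was dropped so a
 * new entry could be allocated; the caller re-takes the lock and retries.
 *
 *	blocknr_set_entry *new_bsep = NULL;
 *	txn_atom *atom;
 *	int ret;
 *
 *	do {
 *		atom = get_current_atom_locked();
 *		ret = blocknr_set_add_extent(atom, &atom->delete_set,
 *					     &new_bsep, &start, &len);
 *		if (ret == -ENOMEM)
 *			return ret;
 *	} while (ret == -E_REPEAT);
 *	spin_unlock_atom(atom);
 */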
2142+
2143+/* Add a block pair to the block set. It adds exactly a pair, which is checked
2144+ * by an assertion that both arguments are not null.*/
2145+/* Audited by: green(2002.06.11) */
2146+/* Auditor note: Entire call chain cannot hold any spinlocks, because
2147+ kmalloc might schedule. The only exception is atom spinlock, which is
2148+ properly freed. */
2149+int
2150+blocknr_set_add_pair(txn_atom * atom,
2151+ struct list_head * bset,
2152+ blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2153+ const reiser4_block_nr * b)
2154+{
2155+ assert("jmacd-5103", a != NULL && b != NULL);
2156+ return blocknr_set_add(atom, bset, new_bsep, a, b);
2157+}
2158+
2159+/* Initialize a blocknr_set. */
2160+void blocknr_set_init(struct list_head *bset)
2161+{
2162+ INIT_LIST_HEAD(bset);
2163+}
2164+
2165+/* Release the entries of a blocknr_set. */
2166+void blocknr_set_destroy(struct list_head *bset)
2167+{
2168+ blocknr_set_entry *bse;
2169+
2170+ while (!list_empty(bset)) {
2171+ bse = list_entry(bset->next, blocknr_set_entry, link);
2172+ list_del_init(&bse->link);
2173+ bse_free(bse);
2174+ }
2175+}
2176+
2177+/* Merge blocknr_set entries out of @from into @into. */
2178+/* Audited by: green(2002.06.11) */
2179+/* Auditor comments: This merge does not know if merged sets contain
2180+ block pairs (as for wandered sets) or extents, so it cannot really merge
2181+ overlapping ranges if there are any. I believe this may lead to
2182+ some blocks being present several times in one blocknr_set. To help
2183+ debug such problems it might help to check for duplicate entries on
2184+ actual processing of this set. Testing this kind of thing right here is
2185+ also complicated by the fact that these sets are not sorted, and going
2186+ through the whole set on each element addition would be a CPU-heavy task */
2187+void blocknr_set_merge(struct list_head * from, struct list_head * into)
2188+{
2189+ blocknr_set_entry *bse_into = NULL;
2190+
2191+ /* If @from is empty, no work to perform. */
2192+ if (list_empty(from))
2193+ return;
2194+ /* If @into is not empty, try merging partial-entries. */
2195+ if (!list_empty(into)) {
2196+
2197+ /* Neither set is empty, pop the front two members and try to combine them. */
2198+ blocknr_set_entry *bse_from;
2199+ unsigned into_avail;
2200+
2201+ bse_into = list_entry(into->next, blocknr_set_entry, link);
2202+ list_del_init(&bse_into->link);
2203+ bse_from = list_entry(from->next, blocknr_set_entry, link);
2204+ list_del_init(&bse_from->link);
2205+
2206+ /* Combine singles. */
2207+ for (into_avail = bse_avail(bse_into);
2208+ into_avail != 0 && bse_from->nr_singles != 0;
2209+ into_avail -= 1) {
2210+ bse_put_single(bse_into,
2211+ &bse_from->entries[--bse_from->
2212+ nr_singles]);
2213+ }
2214+
2215+ /* Combine pairs. */
2216+ for (; into_avail > 1 && bse_from->nr_pairs != 0;
2217+ into_avail -= 2) {
2218+ struct blocknr_pair *pair =
2219+ bse_get_pair(bse_from, --bse_from->nr_pairs);
2220+ bse_put_pair(bse_into, &pair->a, &pair->b);
2221+ }
2222+
2223+ /* If bse_from is empty, delete it now. */
2224+ if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2225+ bse_free(bse_from);
2226+ } else {
2227+ /* Otherwise, bse_into is full or nearly full (e.g.,
2228+ it could have one slot avail and bse_from has one
2229+ pair left). Push it back onto the list. bse_from
2230+ becomes bse_into, which will be the new partial. */
2231+ list_add(&bse_into->link, into);
2232+ bse_into = bse_from;
2233+ }
2234+ }
2235+
2236+ /* Splice lists together. */
2237+ list_splice_init(from, into->prev);
2238+
2239+ /* Add the partial entry back to the head of the list. */
2240+ if (bse_into != NULL)
2241+ list_add(&bse_into->link, into);
2242+}
2243+
2244+/* Iterate over all blocknr set elements. */
2245+int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
2246+ blocknr_set_actor_f actor, void *data, int delete)
2247+{
2248+
2249+ blocknr_set_entry *entry;
2250+
2251+ assert("zam-429", atom != NULL);
2252+ assert("zam-430", atom_is_protected(atom));
2253+ assert("zam-431", bset != 0);
2254+ assert("zam-432", actor != NULL);
2255+
2256+ entry = list_entry(bset->next, blocknr_set_entry, link);
2257+ while (bset != &entry->link) {
2258+ blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2259+ unsigned int i;
2260+ int ret;
2261+
2262+ for (i = 0; i < entry->nr_singles; i++) {
2263+ ret = actor(atom, &entry->entries[i], NULL, data);
2264+
2265+ /* We can't break a loop if delete flag is set. */
2266+ if (ret != 0 && !delete)
2267+ return ret;
2268+ }
2269+
2270+ for (i = 0; i < entry->nr_pairs; i++) {
2271+ struct blocknr_pair *ab;
2272+
2273+ ab = bse_get_pair(entry, i);
2274+
2275+ ret = actor(atom, &ab->a, &ab->b, data);
2276+
2277+ if (ret != 0 && !delete)
2278+ return ret;
2279+ }
2280+
2281+ if (delete) {
2282+ list_del(&entry->link);
2283+ bse_free(entry);
2284+ }
2285+
2286+ entry = tmp;
2287+ }
2288+
2289+ return 0;
2290+}
2291+
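/* Editorial sketch, not part of the original patch: a minimal actor
 * (hypothetical, modeled on apply_dset() in block_alloc.c) that counts
 * the blocks recorded in a delete set. A NULL @b denotes a single block;
 * for delete-set pairs @b is the extent length.
 *
 *	static int count_blocks_actor(txn_atom *atom UNUSED_ARG,
 *				      const reiser4_block_nr *a UNUSED_ARG,
 *				      const reiser4_block_nr *b, void *data)
 *	{
 *		*(__u64 *)data += (b == NULL) ? 1 : *b;
 *		return 0;
 *	}
 *
 * Called with the atom lock held and delete == 0 so the set is preserved:
 *
 *	__u64 total = 0;
 *	blocknr_set_iterator(atom, &atom->delete_set,
 *			     count_blocks_actor, &total, 0);
 */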
2292+/*
2293+ * Local variables:
2294+ * c-indentation-style: "K&R"
2295+ * mode-name: "LC"
2296+ * c-basic-offset: 8
2297+ * tab-width: 8
2298+ * fill-column: 79
2299+ * scroll-step: 1
2300+ * End:
2301+ */
2302diff -urN linux-2.6.23.orig/fs/reiser4/carry.c linux-2.6.23/fs/reiser4/carry.c
2303--- linux-2.6.23.orig/fs/reiser4/carry.c 1970-01-01 03:00:00.000000000 +0300
2304+++ linux-2.6.23/fs/reiser4/carry.c 2007-12-04 16:49:30.000000000 +0300
2305@@ -0,0 +1,1391 @@
2306+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2307+/* Functions to "carry" tree modification(s) upward. */
2308+/* Tree is modified one level at a time. As we modify a level we accumulate a
2309+ set of changes that need to be propagated to the next level. We manage
2310+ node locking such that any searches that collide with carrying are
2311+ restarted, from the root if necessary.
2312+
2313+ Insertion of a new item may result in items being moved among nodes and
2314+ this requires the delimiting key to be updated at the least common parent
2315+ of the nodes modified to preserve search tree invariants. Also, insertion
2316+ may require allocation of a new node. A pointer to the new node has to be
2317+ inserted into some node on the parent level, etc.
2318+
2319+ Tree carrying is meant to be analogous to arithmetic carrying.
2320+
2321+ A carry operation is always associated with some node (&carry_node).
2322+
2323+ Carry process starts with some initial set of operations to be performed
2324+ and an initial set of already locked nodes. Operations are performed one
2325+ by one. Performing each single operation has following possible effects:
2326+
2327+ - content of carry node associated with operation is modified
2328+ - new carry nodes are locked and involved into carry process on this level
2329+ - new carry operations are posted to the next level
2330+
2331+ After all carry operations on this level are done, process is repeated for
2332+ the accumulated sequence on carry operations for the next level. This
2333+ starts by trying to lock (in left to right order) all carry nodes
2334+ associated with carry operations on the parent level. After this, we decide
2335+ whether more nodes are required on the left of already locked set. If so,
2336+ all locks taken on the parent level are released, new carry nodes are
2337+ added, and locking process repeats.
2338+
2339+ It may happen that the balancing process fails owing to an unrecoverable
2340+ error on some of the upper levels of the tree (possible causes are an I/O
2341+ error, failure to allocate a new node, etc.). In this case we should unmount
2342+ the filesystem, rebooting if it is the root, and possibly advise the use of fsck.
2343+
2344+ USAGE:
2345+
2346+ int some_tree_operation( znode *node, ... )
2347+ {
2348+ // Allocate on a stack pool of carry objects: operations and nodes.
2349+ // Most carry processes will only take objects from here, without
2350+ // dynamic allocation.
2351+
2352+I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans
2353+
2354+ carry_pool *pool = init_carry_pool( sizeof( *pool ) +
2355+ 3 * sizeof( carry_level ) );
2356+ carry_level *lowest_level = ( carry_level * )( pool + 1 );
2357+ carry_op *op;
2358+
2359+ init_carry_level( lowest_level, pool );
2360+
2361+ // operation may be one of:
2362+ // COP_INSERT --- insert new item into node
2363+ // COP_CUT --- remove part of or whole node
2364+ // COP_PASTE --- increase size of item
2365+ // COP_DELETE --- delete pointer from parent node
2366+ // COP_UPDATE --- update delimiting key in least
2367+ // common ancestor of two
2368+
2369+ op = reiser4_post_carry( lowest_level, operation, node, 0 );
2370+ if( IS_ERR( op ) || ( op == NULL ) ) {
2371+ handle error
2372+ } else {
2373+ // fill in remaining fields in @op, according to carry.h:carry_op
2374+ result = reiser4_carry( lowest_level, NULL );
2375+ }
2376+ done_carry_pool( pool );
2377+ }
2378+
2379+ When you are implementing node plugin method that participates in carry
2380+ (shifting, insertion, deletion, etc.), do the following:
2381+
2382+ int foo_node_method( znode *node, ..., carry_plugin_info *info )
2383+ {
2384+ carry_op *op;
2385+
2386+ ....
2387+
2388+ // note that the last argument to node_post_carry() is non-zero
2389+ // here, because @op is to be applied to the parent of @node, rather
2390+ // than to the @node itself as in the previous case.
2391+
2392+ op = node_post_carry( info, operation, node, 1 );
2393+ // fill in remaining fields in @op, according to carry.h:carry_op
2394+
2395+ ....
2396+
2397+ }
2398+
2399+ BATCHING:
2400+
2401+ One of the main advantages of the level-by-level balancing implemented here
2402+ is the ability to batch updates on a parent level and to perform them more
2403+ efficiently as a result.
2404+
2405+ Description To Be Done (TBD).
2406+
2407+ DIFFICULTIES AND SUBTLE POINTS:
2408+
2409+ 1. complex plumbing is required, because:
2410+
2411+ a. effective allocation through pools is needed
2412+
2413+ b. target of operation is not exactly known when operation is
2414+ posted. This is worked around through bitfields in &carry_node and
2415+ logic in lock_carry_node()
2416+
2417+ c. of interaction with locking code: node should be added into sibling
2418+ list when pointer to it is inserted into its parent, which is some time
2419+ after node was created. Between these moments, node is somewhat in
2420+ suspended state and is only registered in the carry lists
2421+
2422+ 2. whole balancing logic is implemented here, in particular, insertion
2423+ logic is coded in make_space().
2424+
2425+ 3. special cases like insertion (reiser4_add_tree_root()) or deletion
2426+ (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
2427+ (insert_paste()) have to be handled.
2428+
2429+ 4. there is non-trivial interdependency between allocation of new nodes
2430+ and almost everything else. This is mainly due to the (1.c) above. I shall
2431+ write about this later.
2432+
2433+*/
2434+
2435+#include "forward.h"
2436+#include "debug.h"
2437+#include "key.h"
2438+#include "coord.h"
2439+#include "plugin/item/item.h"
2440+#include "plugin/item/extent.h"
2441+#include "plugin/node/node.h"
2442+#include "jnode.h"
2443+#include "znode.h"
2444+#include "tree_mod.h"
2445+#include "tree_walk.h"
2446+#include "block_alloc.h"
2447+#include "pool.h"
2448+#include "tree.h"
2449+#include "carry.h"
2450+#include "carry_ops.h"
2451+#include "super.h"
2452+#include "reiser4.h"
2453+
2454+#include <linux/types.h>
2455+
2456+/* level locking/unlocking */
2457+static int lock_carry_level(carry_level * level);
2458+static void unlock_carry_level(carry_level * level, int failure);
2459+static void done_carry_level(carry_level * level);
2460+static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2461+
2462+int lock_carry_node(carry_level * level, carry_node * node);
2463+int lock_carry_node_tail(carry_node * node);
2464+
2465+/* carry processing proper */
2466+static int carry_on_level(carry_level * doing, carry_level * todo);
2467+
2468+static carry_op *add_op(carry_level * level, pool_ordering order,
2469+ carry_op * reference);
2470+
2471+/* handlers for carry operations. */
2472+
2473+static void fatal_carry_error(carry_level * doing, int ecode);
2474+static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2475+
2476+static void print_level(const char *prefix, carry_level * level);
2477+
2478+#if REISER4_DEBUG
2479+typedef enum {
2480+ CARRY_TODO,
2481+ CARRY_DOING
2482+} carry_queue_state;
2483+static int carry_level_invariant(carry_level * level, carry_queue_state state);
2484+#endif
2485+
2486+/* main entry point for tree balancing.
2487+
2488+ Tree carry performs operations from @doing and while doing so accumulates
2489+ information about operations to be performed on the next level ("carried"
2490+ to the parent level). Carried operations are performed, causing possibly
2491+ more operations to be carried upward etc. carry() takes care about
2492+ locking and pinning znodes while operating on them.
2493+
2494+ For usage, see comment at the top of fs/reiser4/carry.c
2495+
2496+*/
2497+int reiser4_carry(carry_level * doing /* set of carry operations to be
2498+ * performed */ ,
2499+ carry_level * done /* set of nodes, already performed
2500+ * at the previous level.
2501+ * NULL in most cases */)
2502+{
2503+ int result = 0;
2504+ /* queue of new requests */
2505+ carry_level *todo;
2506+ ON_DEBUG(STORE_COUNTERS);
2507+
2508+ assert("nikita-888", doing != NULL);
2509+ BUG_ON(done != NULL);
2510+
2511+ todo = doing + 1;
2512+ init_carry_level(todo, doing->pool);
2513+
2514+ /* queue of requests performed on the previous level */
2515+ done = todo + 1;
2516+ init_carry_level(done, doing->pool);
2517+
2518+ /* iterate until there is nothing more to do */
2519+ while (result == 0 && doing->ops_num > 0) {
2520+ carry_level *tmp;
2521+
2522+ /* at this point @done is locked. */
2523+ /* repeat lock/do/unlock while
2524+
2525+ (1) lock_carry_level() fails due to deadlock avoidance, or
2526+
2527+ (2) carry_on_level() decides that more nodes have to
2528+ be involved.
2529+
2530+ (3) some unexpected error occurred while balancing on the
2531+ upper levels. In this case all changes are rolled back.
2532+
2533+ */
2534+ while (1) {
2535+ result = lock_carry_level(doing);
2536+ if (result == 0) {
2537+ /* perform operations from @doing and
2538+ accumulate new requests in @todo */
2539+ result = carry_on_level(doing, todo);
2540+ if (result == 0)
2541+ break;
2542+ else if (result != -E_REPEAT ||
2543+ !doing->restartable) {
2544+ warning("nikita-1043",
2545+ "Fatal error during carry: %i",
2546+ result);
2547+ print_level("done", done);
2548+ print_level("doing", doing);
2549+ print_level("todo", todo);
2550+ /* do some rough stuff like aborting
2551+ all pending transcrashes and thus
2552+ pushing tree back to the consistent
2553+ state. Alternatively, just panic.
2554+ */
2555+ fatal_carry_error(doing, result);
2556+ return result;
2557+ }
2558+ } else if (result != -E_REPEAT) {
2559+ fatal_carry_error(doing, result);
2560+ return result;
2561+ }
2562+ unlock_carry_level(doing, 1);
2563+ }
2564+ /* at this point @done can be safely unlocked */
2565+ done_carry_level(done);
2566+
2567+ /* cyclically shift queues */
2568+ tmp = done;
2569+ done = doing;
2570+ doing = todo;
2571+ todo = tmp;
2572+ init_carry_level(todo, doing->pool);
2573+
2574+ /* give other threads chance to run */
2575+ reiser4_preempt_point();
2576+ }
2577+ done_carry_level(done);
2578+
2579+ /* all counters, but x_refs should remain the same. x_refs can change
2580+ owing to transaction manager */
2581+ ON_DEBUG(CHECK_COUNTERS);
2582+ return result;
2583+}
2584+
2585+/* perform carry operations on given level.
2586+
2587+ Optimizations proposed by pooh:
2588+
2589+ (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
2590+ required;
2591+
2592+ (2) unlock node if there are no more operations to be performed upon it and
2593+ node didn't add any operation to @todo. This can be implemented by
2594+ attaching to each node two counters: a counter of operations working on this
2595+ node and a counter of operations carried upward from this node.
2596+
2597+*/
2598+static int carry_on_level(carry_level * doing /* queue of carry operations to
2599+ * do on this level */ ,
2600+ carry_level * todo /* queue where new carry
2601+ * operations to be performed on
2602+ * the * parent level are
2603+ * accumulated during @doing
2604+ * processing. */ )
2605+{
2606+ int result;
2607+ int (*f) (carry_op *, carry_level *, carry_level *);
2608+ carry_op *op;
2609+ carry_op *tmp_op;
2610+
2611+ assert("nikita-1034", doing != NULL);
2612+ assert("nikita-1035", todo != NULL);
2613+
2614+ /* @doing->nodes are locked. */
2615+
2616+ /* This function can be split into two phases: analysis and modification.
2617+
2618+ Analysis calculates precisely what items should be moved between
2619+ nodes. This information is gathered in some structures attached to
2620+ each carry_node in a @doing queue. Analysis also determines whether
2621+ new nodes are to be allocated etc.
2622+
2623+ After analysis is completed, actual modification is performed. Here
2624+ we can take advantage of "batch modification": if there are several
2625+ operations acting on the same node, modifications can be performed
2626+ more efficiently when batched together.
2627+
2628+ Above is an optimization left for the future.
2629+ */
2630+ /* Important, but delayed optimization: it's possible to batch
2631+ operations together and perform them more efficiently as a
2632+ result. For example, deletion of several neighboring items from a
2633+ node can be converted to a single ->cut() operation.
2634+
2635+ Before processing queue, it should be scanned and "mergeable"
2636+ operations merged.
2637+ */
2638+ result = 0;
2639+ for_all_ops(doing, op, tmp_op) {
2640+ carry_opcode opcode;
2641+
2642+ assert("nikita-1041", op != NULL);
2643+ opcode = op->op;
2644+ assert("nikita-1042", op->op < COP_LAST_OP);
2645+ f = op_dispatch_table[op->op].handler;
2646+ result = f(op, doing, todo);
2647+ /* locking can fail with -E_REPEAT. Any different error is fatal
2648+ and will be handled by fatal_carry_error() sledgehammer.
2649+ */
2650+ if (result != 0)
2651+ break;
2652+ }
2653+ if (result == 0) {
2654+ carry_plugin_info info;
2655+ carry_node *scan;
2656+ carry_node *tmp_scan;
2657+
2658+ info.doing = doing;
2659+ info.todo = todo;
2660+
2661+ assert("nikita-3002",
2662+ carry_level_invariant(doing, CARRY_DOING));
2663+ for_all_nodes(doing, scan, tmp_scan) {
2664+ znode *node;
2665+
2666+ node = reiser4_carry_real(scan);
2667+ assert("nikita-2547", node != NULL);
2668+ if (node_is_empty(node)) {
2669+ result =
2670+ node_plugin_by_node(node)->
2671+ prepare_removal(node, &info);
2672+ if (result != 0)
2673+ break;
2674+ }
2675+ }
2676+ }
2677+ return result;
2678+}
2679+
2680+/* post carry operation
2681+
2682+ This is main function used by external carry clients: node layout plugins
2683+ and tree operations to create new carry operation to be performed on some
2684+ level.
2685+
2686+ New operation will be included in the @level queue. To actually perform it,
2687+ call reiser4_carry( level, ... ). This function takes a write lock on @node. Carry
2688+ manages all its locks by itself, don't worry about this.
2689+
2690+ This function adds operation and node at the end of the queue. It is up to
2691+ caller to guarantee proper ordering of node queue.
2692+
2693+*/
2694+carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
2695+ * is to be posted at */ ,
2696+ carry_opcode op /* opcode of operation */ ,
2697+ znode * node /* node on which this operation
2698+ * will operate */ ,
2699+ int apply_to_parent_p /* whether operation will
2700+ * operate directly on @node
2701+ * or on it parent. */)
2702+{
2703+ carry_op *result;
2704+ carry_node *child;
2705+
2706+ assert("nikita-1046", level != NULL);
2707+ assert("nikita-1788", znode_is_write_locked(node));
2708+
2709+ result = add_op(level, POOLO_LAST, NULL);
2710+ if (IS_ERR(result))
2711+ return result;
2712+ child = reiser4_add_carry(level, POOLO_LAST, NULL);
2713+ if (IS_ERR(child)) {
2714+ reiser4_pool_free(&level->pool->op_pool, &result->header);
2715+ return (carry_op *) child;
2716+ }
2717+ result->node = child;
2718+ result->op = op;
2719+ child->parent = apply_to_parent_p;
2720+ if (ZF_ISSET(node, JNODE_ORPHAN))
2721+ child->left_before = 1;
2722+ child->node = node;
2723+ return result;
2724+}
2725+
2726+/* initialize carry queue */
2727+void init_carry_level(carry_level * level /* level to initialize */ ,
2728+ carry_pool * pool /* pool @level will allocate objects
2729+ * from */ )
2730+{
2731+ assert("nikita-1045", level != NULL);
2732+ assert("nikita-967", pool != NULL);
2733+
2734+ memset(level, 0, sizeof *level);
2735+ level->pool = pool;
2736+
2737+ INIT_LIST_HEAD(&level->nodes);
2738+ INIT_LIST_HEAD(&level->ops);
2739+}
2740+
2741+/* allocate carry pool and initialize pools within queue */
2742+carry_pool *init_carry_pool(int size)
2743+{
2744+ carry_pool *pool;
2745+
2746+ assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
2747+ pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
2748+ if (pool == NULL)
2749+ return ERR_PTR(RETERR(-ENOMEM));
2750+
2751+ reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
2752+ (char *)pool->op);
2753+ reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
2754+ NODES_LOCKED_POOL_SIZE, (char *)pool->node);
2755+ return pool;
2756+}
2757+
2758+/* finish with queue pools */
2759+void done_carry_pool(carry_pool * pool /* pool to destroy */ )
2760+{
2761+ reiser4_done_pool(&pool->op_pool);
2762+ reiser4_done_pool(&pool->node_pool);
2763+ kfree(pool);
2764+}
2765+
2766+/* add new carry node to the @level.
2767+
2768+ Returns pointer to the new carry node allocated from pool. It's up to
2769+ callers to maintain proper order in the @level. Assumption is that if carry
2770+ nodes on one level are already sorted and modifications are performed from
2771+ left to right, carry nodes added on the parent level will be ordered
2772+ automatically. To control ordering use @order and @reference parameters.
2773+
2774+*/
2775+carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
2776+ * node to */ ,
2777+ pool_ordering order /* where to insert:
2778+ * at the beginning of
2779+ * @level,
2780+ * before @reference,
2781+ * after @reference,
2782+ * at the end of @level
2783+ */ ,
2784+ carry_node * reference/* reference node for
2785+ * insertion */)
2786+{
2787+ ON_DEBUG(carry_node * orig_ref = reference);
2788+
2789+ if (order == POOLO_BEFORE) {
2790+ reference = find_left_carry(reference, level);
2791+ if (reference == NULL)
2792+ reference = list_entry(level->nodes.next, carry_node,
2793+ header.level_linkage);
2794+ else
2795+ reference = list_entry(reference->header.level_linkage.next,
2796+ carry_node, header.level_linkage);
2797+ } else if (order == POOLO_AFTER) {
2798+ reference = find_right_carry(reference, level);
2799+ if (reference == NULL)
2800+ reference = list_entry(level->nodes.prev, carry_node,
2801+ header.level_linkage);
2802+ else
2803+ reference = list_entry(reference->header.level_linkage.prev,
2804+ carry_node, header.level_linkage);
2805+ }
2806+ assert("nikita-2209",
2807+ ergo(orig_ref != NULL,
2808+ reiser4_carry_real(reference) ==
2809+ reiser4_carry_real(orig_ref)));
2810+ return reiser4_add_carry(level, order, reference);
2811+}
2812+
2813+carry_node *reiser4_add_carry(carry_level * level /* &carry_level to add node
2814+ * to */ ,
2815+ pool_ordering order /* where to insert: at the
2816+ * beginning of @level, before
2817+ * @reference, after @reference,
2818+ * at the end of @level */ ,
2819+ carry_node * reference /* reference node for
2820+ * insertion */ )
2821+{
2822+ carry_node *result;
2823+
2824+ result =
2825+ (carry_node *) reiser4_add_obj(&level->pool->node_pool,
2826+ &level->nodes,
2827+ order, &reference->header);
2828+ if (!IS_ERR(result) && (result != NULL))
2829+ ++level->nodes_num;
2830+ return result;
2831+}
2832+
2833+/* add new carry operation to the @level.
2834+
2835+ Returns pointer to the new carry operations allocated from pool. It's up to
2836+ callers to maintain proper order in the @level. To control ordering use
2837+ @order and @reference parameters.
2838+
2839+*/
2840+static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
2841+ pool_ordering order /* where to insert: at the beginning of
2842+ * @level, before @reference, after
2843+ * @reference, at the end of @level */ ,
2844+ carry_op *
2845+ reference /* reference node for insertion */ )
2846+{
2847+ carry_op *result;
2848+
2849+ result =
2850+ (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
2851+ order, &reference->header);
2852+ if (!IS_ERR(result) && (result != NULL))
2853+ ++level->ops_num;
2854+ return result;
2855+}
2856+
2857+/* Return node on the right of which @node was created.
2858+
2859+ Each node is created on the right of some existing node (or it is new root,
2860+ which is special case not handled here).
2861+
2862+ @node is new node created on some level, but not yet inserted into its
2863+ parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
2864+
2865+*/
2866+static carry_node *find_begetting_brother(carry_node * node /* node to start search
2867+ * from */ ,
2868+ carry_level * kin UNUSED_ARG /* level to
2869+ * scan */ )
2870+{
2871+ carry_node *scan;
2872+
2873+ assert("nikita-1614", node != NULL);
2874+ assert("nikita-1615", kin != NULL);
2875+ assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
2876+ assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
2877+ ZF_ISSET(reiser4_carry_real(node),
2878+ JNODE_ORPHAN)));
2879+ for (scan = node;;
2880+ scan = list_entry(scan->header.level_linkage.prev, carry_node,
2881+ header.level_linkage)) {
2882+ assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
2883+ if ((scan->node != node->node) &&
2884+ !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
2885+ assert("nikita-1618", reiser4_carry_real(scan) != NULL);
2886+ break;
2887+ }
2888+ }
2889+ return scan;
2890+}
2891+
2892+static cmp_t
2893+carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
2894+{
2895+ assert("nikita-2199", n1 != NULL);
2896+ assert("nikita-2200", n2 != NULL);
2897+
2898+ if (n1 == n2)
2899+ return EQUAL_TO;
2900+ while (1) {
2901+ n1 = carry_node_next(n1);
2902+ if (carry_node_end(level, n1))
2903+ return GREATER_THAN;
2904+ if (n1 == n2)
2905+ return LESS_THAN;
2906+ }
2907+ impossible("nikita-2201", "End of level reached");
2908+}
2909+
2910+carry_node *find_carry_node(carry_level * level, const znode * node)
2911+{
2912+ carry_node *scan;
2913+ carry_node *tmp_scan;
2914+
2915+ assert("nikita-2202", level != NULL);
2916+ assert("nikita-2203", node != NULL);
2917+
2918+ for_all_nodes(level, scan, tmp_scan) {
2919+ if (reiser4_carry_real(scan) == node)
2920+ return scan;
2921+ }
2922+ return NULL;
2923+}
2924+
2925+znode *reiser4_carry_real(const carry_node * node)
2926+{
2927+ assert("nikita-3061", node != NULL);
2928+
2929+ return node->lock_handle.node;
2930+}
2931+
2932+carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
2933+ const znode * node)
2934+{
2935+ carry_node *base;
2936+ carry_node *scan;
2937+ carry_node *tmp_scan;
2938+ carry_node *proj;
2939+
2940+ base = find_carry_node(doing, node);
2941+ assert("nikita-2204", base != NULL);
2942+
2943+ for_all_nodes(todo, scan, tmp_scan) {
2944+ proj = find_carry_node(doing, scan->node);
2945+ assert("nikita-2205", proj != NULL);
2946+ if (carry_node_cmp(doing, proj, base) != LESS_THAN)
2947+ break;
2948+ }
2949+ return scan;
2950+}
2951+
2952+static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
2953+ znode * node)
2954+{
2955+ carry_node *reference;
2956+
2957+ assert("nikita-2994", doing != NULL);
2958+ assert("nikita-2995", todo != NULL);
2959+ assert("nikita-2996", node != NULL);
2960+
2961+ reference = insert_carry_node(doing, todo, node);
2962+ assert("nikita-2997", reference != NULL);
2963+
2964+ return reiser4_add_carry(todo, POOLO_BEFORE, reference);
2965+}
2966+
2967+/* like reiser4_post_carry(), but designed to be called from node plugin methods.
2968+ This function is different from reiser4_post_carry() in that it finds proper
2969+ place to insert node in the queue. */
2970+carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
2971+ * passed down to node
2972+ * plugin */ ,
2973+ carry_opcode op /* opcode of operation */ ,
2974+ znode * node /* node on which this
2975+ * operation will operate */ ,
2976+ int apply_to_parent_p /* whether operation will
2977+ * operate directly on @node
2978+ * or on it parent. */ )
2979+{
2980+ carry_op *result;
2981+ carry_node *child;
2982+
2983+ assert("nikita-2207", info != NULL);
2984+ assert("nikita-2208", info->todo != NULL);
2985+
2986+ if (info->doing == NULL)
2987+ return reiser4_post_carry(info->todo, op, node,
2988+ apply_to_parent_p);
2989+
2990+ result = add_op(info->todo, POOLO_LAST, NULL);
2991+ if (IS_ERR(result))
2992+ return result;
2993+ child = add_carry_atplace(info->doing, info->todo, node);
2994+ if (IS_ERR(child)) {
2995+ reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
2996+ return (carry_op *) child;
2997+ }
2998+ result->node = child;
2999+ result->op = op;
3000+ child->parent = apply_to_parent_p;
3001+ if (ZF_ISSET(node, JNODE_ORPHAN))
3002+ child->left_before = 1;
3003+ child->node = node;
3004+ return result;
3005+}
3006+
3007+/* lock all carry nodes in @level */
3008+static int lock_carry_level(carry_level * level /* level to lock */ )
3009+{
3010+ int result;
3011+ carry_node *node;
3012+ carry_node *tmp_node;
3013+
3014+ assert("nikita-881", level != NULL);
3015+ assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
3016+
3017+ /* lock nodes from left to right */
3018+ result = 0;
3019+ for_all_nodes(level, node, tmp_node) {
3020+ result = lock_carry_node(level, node);
3021+ if (result != 0)
3022+ break;
3023+ }
3024+ return result;
3025+}
3026+
3027+/* Synchronize delimiting keys between @node and its left neighbor.
3028+
3029+ To reduce contention on dk key and simplify carry code, we synchronize
3030+ delimiting keys only when carry ultimately leaves tree level (carrying
3031+ changes upward) and unlocks nodes at this level.
3032+
3033+ This function first finds left neighbor of @node and then updates left
3034+ neighbor's right delimiting key to coincide with the least key in @node.
3035+
3036+*/
3037+
3038+ON_DEBUG(extern atomic_t delim_key_version;
3039+ )
3040+
3041+static void sync_dkeys(znode * spot /* node to update */ )
3042+{
3043+ reiser4_key pivot;
3044+ reiser4_tree *tree;
3045+
3046+ assert("nikita-1610", spot != NULL);
3047+ assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3048+
3049+ tree = znode_get_tree(spot);
3050+ read_lock_tree(tree);
3051+ write_lock_dk(tree);
3052+
3053+ assert("nikita-2192", znode_is_loaded(spot));
3054+
3055+ /* sync left delimiting key of @spot with key in its leftmost item */
3056+ if (node_is_empty(spot))
3057+ pivot = *znode_get_rd_key(spot);
3058+ else
3059+ leftmost_key_in_node(spot, &pivot);
3060+
3061+ znode_set_ld_key(spot, &pivot);
3062+
3063+ /* there can be sequence of empty nodes pending removal on the left of
3064+ @spot. Scan them and update their left and right delimiting keys to
3065+ match left delimiting key of @spot. Also, update right delimiting
3066+ key of first non-empty left neighbor.
3067+ */
3068+ while (1) {
3069+ if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3070+ break;
3071+
3072+ spot = spot->left;
3073+ if (spot == NULL)
3074+ break;
3075+
3076+ znode_set_rd_key(spot, &pivot);
3077+ /* don't sink into the domain of another balancing */
3078+ if (!znode_is_write_locked(spot))
3079+ break;
3080+ if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3081+ znode_set_ld_key(spot, &pivot);
3082+ else
3083+ break;
3084+ }
3085+
3086+ write_unlock_dk(tree);
3087+ read_unlock_tree(tree);
3088+}
3089+
3090+/* unlock all carry nodes in @level */
3091+static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3092+ int failure /* true if unlocking owing to
3093+ * failure */ )
3094+{
3095+ carry_node *node;
3096+ carry_node *tmp_node;
3097+
3098+ assert("nikita-889", level != NULL);
3099+
3100+ if (!failure) {
3101+ znode *spot;
3102+
3103+ spot = NULL;
3104+ /* update delimiting keys */
3105+ for_all_nodes(level, node, tmp_node) {
3106+ if (reiser4_carry_real(node) != spot) {
3107+ spot = reiser4_carry_real(node);
3108+ sync_dkeys(spot);
3109+ }
3110+ }
3111+ }
3112+
3113+ /* nodes can be unlocked in arbitrary order. In preemptible
3114+ environment it's better to unlock in reverse order of locking,
3115+ though.
3116+ */
3117+ for_all_nodes_back(level, node, tmp_node) {
3118+ /* all allocated nodes should be already linked to their
3119+ parents at this moment. */
3120+ assert("nikita-1631",
3121+ ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
3122+ JNODE_ORPHAN)));
3123+ ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
3124+ unlock_carry_node(level, node, failure);
3125+ }
3126+ level->new_root = NULL;
3127+}
3128+
3129+/* finish with @level
3130+
3131+ Unlock nodes and release all allocated resources */
3132+static void done_carry_level(carry_level * level /* level to finish */ )
3133+{
3134+ carry_node *node;
3135+ carry_node *tmp_node;
3136+ carry_op *op;
3137+ carry_op *tmp_op;
3138+
3139+ assert("nikita-1076", level != NULL);
3140+
3141+ unlock_carry_level(level, 0);
3142+ for_all_nodes(level, node, tmp_node) {
3143+ assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3144+ assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3145+ reiser4_pool_free(&level->pool->node_pool, &node->header);
3146+ }
3147+ for_all_ops(level, op, tmp_op)
3148+ reiser4_pool_free(&level->pool->op_pool, &op->header);
3149+}
3150+
3151+/* helper function to complete locking of carry node
3152+
3153+ Finish locking of carry node. There are several ways in which new carry
3154+ node can be added into carry level and locked. Normal is through
3155+ lock_carry_node(), but also from find_{left|right}_neighbor(). This
3156+ function factors out common final part of all locking scenarios. It
3157+ supposes that @node -> lock_handle is lock handle for lock just taken and
3158+ fills ->real_node from this lock handle.
3159+
3160+*/
3161+int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3162+{
3163+ assert("nikita-1052", node != NULL);
3164+ assert("nikita-1187", reiser4_carry_real(node) != NULL);
3165+ assert("nikita-1188", !node->unlock);
3166+
3167+ node->unlock = 1;
3168+ /* Load node content into memory and install node plugin by
3169+ looking at the node header.
3170+
3171+ Most of the time this call is cheap because the node is
3172+ already in memory.
3173+
3174+ Corresponding zrelse() is in unlock_carry_node()
3175+ */
3176+ return zload(reiser4_carry_real(node));
3177+}
3178+
3179+/* lock carry node
3180+
3181+ "Resolve" node to real znode, lock it and mark as locked.
3182+ This requires recursive locking of znodes.
3183+
3184+ When an operation is posted to the parent level, the node it will be applied
3185+ to is not yet known. For example, when shifting data between two nodes,
3186+ delimiting keys have to be updated in the parent or parents of the nodes
3187+ involved. But their parents are not yet locked and, moreover, said nodes can
3188+ be reparented by concurrent balancing.
3189+
3190+ To work around this, carry operation is applied to special "carry node"
3191+ rather than to the znode itself. Carry node consists of some "base" or
3192+ "reference" znode and flags indicating how to get to the target of carry
3193+ operation (->real_node field of carry_node) from base.
3194+
3195+*/
3196+int lock_carry_node(carry_level * level /* level @node is in */ ,
3197+ carry_node * node /* node to lock */ )
3198+{
3199+ int result;
3200+ znode *reference_point;
3201+ lock_handle lh;
3202+ lock_handle tmp_lh;
3203+ reiser4_tree *tree;
3204+
3205+ assert("nikita-887", level != NULL);
3206+ assert("nikita-882", node != NULL);
3207+
3208+ result = 0;
3209+ reference_point = node->node;
3210+ init_lh(&lh);
3211+ init_lh(&tmp_lh);
3212+ if (node->left_before) {
3213+ /* handling of new nodes, allocated on the previous level:
3214+
3215+ some carry ops were probably posted from the new node, but
3216+ this node neither has parent pointer set, nor is
3217+ connected. This will be done in ->create_hook() for
3218+ internal item.
3219+
3220+ Nonetheless, the parent of the new node has to be locked. To do
3221+ this, first go to the "left" in the carry order. This
3222+ depends on the decision to always allocate new node on the
3223+ right of existing one.
3224+
3225+ Loop handles case when multiple nodes, all orphans, were
3226+ inserted.
3227+
3228+ Strictly speaking, taking tree lock is not necessary here,
3229+ because all nodes scanned by loop in
3230+ find_begetting_brother() are write-locked by this thread,
3231+ and thus, their sibling linkage cannot change.
3232+
3233+ */
3234+ tree = znode_get_tree(reference_point);
3235+ read_lock_tree(tree);
3236+ reference_point = find_begetting_brother(node, level)->node;
3237+ read_unlock_tree(tree);
3238+ assert("nikita-1186", reference_point != NULL);
3239+ }
3240+ if (node->parent && (result == 0)) {
3241+ result =
3242+ reiser4_get_parent(&tmp_lh, reference_point,
3243+ ZNODE_WRITE_LOCK);
3244+ if (result != 0) {
3245+ ; /* nothing */
3246+ } else if (znode_get_level(tmp_lh.node) == 0) {
3247+ assert("nikita-1347", znode_above_root(tmp_lh.node));
3248+ result = add_new_root(level, node, tmp_lh.node);
3249+ if (result == 0) {
3250+ reference_point = level->new_root;
3251+ move_lh(&lh, &node->lock_handle);
3252+ }
3253+ } else if ((level->new_root != NULL)
3254+ && (level->new_root !=
3255+ znode_parent_nolock(reference_point))) {
3256+ /* parent of node exists, but this level already
3257+ created a different new root, so */
3258+ warning("nikita-1109",
3259+ /* it should be "radicis", but tradition is
3260+ tradition. do banshees read latin? */
3261+ "hodie natus est radici frater");
3262+ result = -EIO;
3263+ } else {
3264+ move_lh(&lh, &tmp_lh);
3265+ reference_point = lh.node;
3266+ }
3267+ }
3268+ if (node->left && (result == 0)) {
3269+ assert("nikita-1183", node->parent);
3270+ assert("nikita-883", reference_point != NULL);
3271+ result =
3272+ reiser4_get_left_neighbor(&tmp_lh, reference_point,
3273+ ZNODE_WRITE_LOCK,
3274+ GN_CAN_USE_UPPER_LEVELS);
3275+ if (result == 0) {
3276+ done_lh(&lh);
3277+ move_lh(&lh, &tmp_lh);
3278+ reference_point = lh.node;
3279+ }
3280+ }
3281+ if (!node->parent && !node->left && !node->left_before) {
3282+ result =
3283+ longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3284+ ZNODE_LOCK_HIPRI);
3285+ }
3286+ if (result == 0) {
3287+ move_lh(&node->lock_handle, &lh);
3288+ result = lock_carry_node_tail(node);
3289+ }
3290+ done_lh(&tmp_lh);
3291+ done_lh(&lh);
3292+ return result;
3293+}
3294+
3295+/* release a lock on &carry_node.
3296+
3297+ Release, if necessary, the lock on @node. This operation is the pair of
3298+ lock_carry_node() and is idempotent: you can call it more than once on the
3299+ same node.
3300+
3301+*/
3302+static void
3303+unlock_carry_node(carry_level * level,
3304+ carry_node * node /* node to be released */ ,
3305+ int failure /* non-zero if node is unlocked due
3306+ * to some error */ )
3307+{
3308+ znode *real_node;
3309+
3310+ assert("nikita-884", node != NULL);
3311+
3312+ real_node = reiser4_carry_real(node);
3313+ /* pair to zload() in lock_carry_node_tail() */
3314+ zrelse(real_node);
3315+ if (node->unlock && (real_node != NULL)) {
3316+ assert("nikita-899", real_node == node->lock_handle.node);
3317+ longterm_unlock_znode(&node->lock_handle);
3318+ }
3319+ if (failure) {
3320+ if (node->deallocate && (real_node != NULL)) {
3321+ /* free node in bitmap
3322+
3323+ Prepare node for removal. Last zput() will finish
3324+ with it.
3325+ */
3326+ ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3327+ }
3328+ if (node->free) {
3329+ assert("nikita-2177",
3330+ list_empty_careful(&node->lock_handle.locks_link));
3331+ assert("nikita-2112",
3332+ list_empty_careful(&node->lock_handle.owners_link));
3333+ reiser4_pool_free(&level->pool->node_pool,
3334+ &node->header);
3335+ }
3336+ }
3337+}
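[Editor's note] A minimal sketch of the pairing contract between
lock_carry_node() and unlock_carry_node(), using only names from this
file; the surrounding scaffolding is illustrative, not patch code:

	int result;

	result = lock_carry_node(level, cn);
	if (result == 0) {
		/* ... operate on reiser4_carry_real(cn) ... */
	}
	/* idempotent: harmless even if @cn was already unlocked */
	unlock_carry_node(level, cn, result != 0 /* failure */);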
3338+
3339+/* fatal_carry_error() - all-catching error handling function
3340+
3341+ It is possible that carry faces an unrecoverable error, like the inability
3342+ to insert a pointer at the internal level. Our simple solution is just to
3343+ panic in this situation. More sophisticated measures, like an attempt to
3344+ remount the file system read-only, could be implemented without much difficulty.
3345+
3346+ It is believed that:
3347+
3348+ 1. instead of panicking, all current transactions could be aborted, rolling
3349+ the system back to a consistent state.
3350+
3351+Umm, if you simply panic without doing anything more at all, then all current
3352+transactions are aborted and the system is rolled back to a consistent state,
3353+by virtue of the design of the transactional mechanism. Well, wait, let's be
3354+precise. If an internal node is corrupted on disk due to hardware failure,
3355+then there may be no consistent state that can be rolled back to, so instead
3356+we should say that it will roll back the transactions, which, barring other
3357+factors, means rolling back to a consistent state.
3358+
3359+# Nikita: there is a subtle difference between panicking and aborting
3360+# transactions: the machine doesn't reboot. Processes aren't killed. Processes
3361+# not using reiser4 (not that we care about such processes), or using other
3362+# reiser4 mounts (about those we do care), will simply continue to run. With
3363+# some luck, even an application using the aborted file system can survive: it
3364+# will get some error, like EBADF, from each file descriptor on the failed file
3365+# system, but applications that do care about fault tolerance will cope with
3366+# this (squid will).
3367+
3368+It would be a nice feature, though, to support rollback without a reboot
3369+followed by a remount, but this can wait for later versions.
3370+
3371+ 2. once isolated transactions are implemented, it will be possible to
3372+ roll back the offending transaction.
3373+
3374+2. adds code complexity of uncertain value (it implies that a broken tree should be kept in operation), so we must
3375+think about it more before deciding whether it should be done. -Hans
3376+
3377+*/
3378+static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
3379+ * where
3380+ * unrecoverable
3381+ * error
3382+ * occurred */ ,
3383+ int ecode /* error code */ )
3384+{
3385+ assert("nikita-1230", doing != NULL);
3386+ assert("nikita-1231", ecode < 0);
3387+
3388+ reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3389+}
3390+
3391+/* add new root to the tree
3392+
3393+ This function itself only manages changes in carry structures and delegates
3394+ all hard work (allocation of a znode for the new root, changes of parent and
3395+ sibling pointers) to reiser4_add_tree_root().
3396+
3397+ Locking: old tree root is locked by carry at this point. Fake znode is also
3398+ locked.
3399+
3400+*/
3401+static int add_new_root(carry_level * level /* carry level in context of which
3402+ * operation is performed */ ,
3403+ carry_node * node /* carry node for existing root */ ,
3404+ znode * fake /* "fake" znode already locked by
3405+ * us */ )
3406+{
3407+ int result;
3408+
3409+ assert("nikita-1104", level != NULL);
3410+ assert("nikita-1105", node != NULL);
3411+
3412+ assert("nikita-1403", znode_is_write_locked(node->node));
3413+ assert("nikita-1404", znode_is_write_locked(fake));
3414+
3415+ /* trying to create new root. */
3416+ /* @node is root and it's already locked by us. This
3417+ means that nobody else can be trying to add/remove
3418+ tree root right now.
3419+ */
3420+ if (level->new_root == NULL)
3421+ level->new_root = reiser4_add_tree_root(node->node, fake);
3422+ if (!IS_ERR(level->new_root)) {
3423+ assert("nikita-1210", znode_is_root(level->new_root));
3424+ node->deallocate = 1;
3425+ result =
3426+ longterm_lock_znode(&node->lock_handle, level->new_root,
3427+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3428+ if (result == 0)
3429+ zput(level->new_root);
3430+ } else {
3431+ result = PTR_ERR(level->new_root);
3432+ level->new_root = NULL;
3433+ }
3434+ return result;
3435+}
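[Editor's note] level->new_root above follows the kernel ERR_PTR
convention: it holds either a valid znode pointer or an encoded
negative errno, which is why it is screened with IS_ERR() before use.
A minimal sketch of the same pattern:

	znode *root;

	root = reiser4_add_tree_root(node->node, fake);
	if (IS_ERR(root))
		return PTR_ERR(root);	/* decode the negative errno */
	/* from here on, @root is a usable znode pointer */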
3436+
3437+/* allocate a new znode and add to the todo level the operation
3438+ that inserts a pointer to it into the parent node
3439+
3440+ Allocate new znode, add it into carry queue and post into @todo queue
3441+ request to add pointer to new node into its parent.
3442+
3443+ This is a carry-related routine that calls reiser4_new_node() to allocate a
3444+ new node.
3445+*/
3446+carry_node *add_new_znode(znode * brother /* existing left neighbor of new
3447+ * node */ ,
3448+ carry_node * ref /* carry node after which new
3449+ * carry node is to be inserted
3450+ * into queue. This affects
3451+ * locking. */ ,
3452+ carry_level * doing /* carry queue where new node is
3453+ * to be added */ ,
3454+ carry_level * todo /* carry queue where COP_INSERT
3455+ * operation to add pointer to
3456+ * new node will be added */ )
3457+{
3458+ carry_node *fresh;
3459+ znode *new_znode;
3460+ carry_op *add_pointer;
3461+ carry_plugin_info info;
3462+
3463+ assert("nikita-1048", brother != NULL);
3464+ assert("nikita-1049", todo != NULL);
3465+
3466+ /* There are a lot of possible variations here: to what parent
3467+ the new node will be attached, and where. For simplicity, always
3468+ do the following:
3469+
3470+ (1) new node and @brother will have the same parent.
3471+
3472+ (2) new node is added on the right of @brother
3473+
3474+ */
3475+
3476+ fresh = reiser4_add_carry_skip(doing,
3477+ ref ? POOLO_AFTER : POOLO_LAST, ref);
3478+ if (IS_ERR(fresh))
3479+ return fresh;
3480+
3481+ fresh->deallocate = 1;
3482+ fresh->free = 1;
3483+
3484+ new_znode = reiser4_new_node(brother, znode_get_level(brother));
3485+ if (IS_ERR(new_znode))
3486+ /* @fresh will be deallocated automatically by error
3487+ handling code in the caller. */
3488+ return (carry_node *) new_znode;
3489+
3490+ /* reiser4_new_node() returned a znode with x_count of 1. The caller
3491+ has to decrease it; make_space() does. */
3492+
3493+ ZF_SET(new_znode, JNODE_ORPHAN);
3494+ fresh->node = new_znode;
3495+
3496+ while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
3497+ ref = carry_node_prev(ref);
3498+ assert("nikita-1606", !carry_node_end(doing, ref));
3499+ }
3500+
3501+ info.todo = todo;
3502+ info.doing = doing;
3503+ add_pointer = node_post_carry(&info, COP_INSERT,
3504+ reiser4_carry_real(ref), 1);
3505+ if (IS_ERR(add_pointer)) {
3506+ /* no need to deallocate @new_znode here: it will be
3507+ deallocated during carry error handling. */
3508+ return (carry_node *) add_pointer;
3509+ }
3510+
3511+ add_pointer->u.insert.type = COPT_CHILD;
3512+ add_pointer->u.insert.child = fresh;
3513+ add_pointer->u.insert.brother = brother;
3514+ /* initially the new node spans an empty key range */
3515+ write_lock_dk(znode_get_tree(brother));
3516+ znode_set_ld_key(new_znode,
3517+ znode_set_rd_key(new_znode,
3518+ znode_get_rd_key(brother)));
3519+ write_unlock_dk(znode_get_tree(brother));
3520+ return fresh;
3521+}
3522+
3523+/* DEBUGGING FUNCTIONS.
3524+
3525+ Probably we should also leave them enabled even when
3526+ debugging is turned off, to print dumps on errors.
3527+*/
3528+#if REISER4_DEBUG
3529+static int carry_level_invariant(carry_level * level, carry_queue_state state)
3530+{
3531+ carry_node *node;
3532+ carry_node *tmp_node;
3533+
3534+ if (level == NULL)
3535+ return 0;
3536+
3537+ if (level->track_type != 0 &&
3538+ level->track_type != CARRY_TRACK_NODE &&
3539+ level->track_type != CARRY_TRACK_CHANGE)
3540+ return 0;
3541+
3542+ /* check that nodes are in ascending order */
3543+ for_all_nodes(level, node, tmp_node) {
3544+ znode *left;
3545+ znode *right;
3546+
3547+ reiser4_key lkey;
3548+ reiser4_key rkey;
3549+
3550+ if (node != carry_node_front(level)) {
3551+ if (state == CARRY_TODO) {
3552+ right = node->node;
3553+ left = carry_node_prev(node)->node;
3554+ } else {
3555+ right = reiser4_carry_real(node);
3556+ left = reiser4_carry_real(carry_node_prev(node));
3557+ }
3558+ if (right == NULL || left == NULL)
3559+ continue;
3560+ if (node_is_empty(right) || node_is_empty(left))
3561+ continue;
3562+ if (!keyle(leftmost_key_in_node(left, &lkey),
3563+ leftmost_key_in_node(right, &rkey))) {
3564+ warning("", "wrong key order");
3565+ return 0;
3566+ }
3567+ }
3568+ }
3569+ return 1;
3570+}
3571+#endif
3572+
3573+/* get symbolic name for boolean */
3574+static const char *tf(int boolean /* truth value */ )
3575+{
3576+ return boolean ? "t" : "f";
3577+}
3578+
3579+/* symbolic name for carry operation */
3580+static const char *carry_op_name(carry_opcode op /* carry opcode */ )
3581+{
3582+ switch (op) {
3583+ case COP_INSERT:
3584+ return "COP_INSERT";
3585+ case COP_DELETE:
3586+ return "COP_DELETE";
3587+ case COP_CUT:
3588+ return "COP_CUT";
3589+ case COP_PASTE:
3590+ return "COP_PASTE";
3591+ case COP_UPDATE:
3592+ return "COP_UPDATE";
3593+ case COP_EXTENT:
3594+ return "COP_EXTENT";
3595+ case COP_INSERT_FLOW:
3596+ return "COP_INSERT_FLOW";
3597+ default:{
3598+ /* not mt safe, but who cares? */
3599+ static char buf[20];
3600+
3601+ sprintf(buf, "unknown op: %x", op);
3602+ return buf;
3603+ }
3604+ }
3605+}
3606+
3607+/* dump information about carry node */
3608+static void print_carry(const char *prefix /* prefix to print */ ,
3609+ carry_node * node /* node to print */ )
3610+{
3611+ if (node == NULL) {
3612+ printk("%s: null\n", prefix);
3613+ return;
3614+ }
3615+ printk
3616+ ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
3617+ prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
3618+ tf(node->free), tf(node->deallocate));
3619+}
3620+
3621+/* dump information about carry operation */
3622+static void print_op(const char *prefix /* prefix to print */ ,
3623+ carry_op * op /* operation to print */ )
3624+{
3625+ if (op == NULL) {
3626+ printk("%s: null\n", prefix);
3627+ return;
3628+ }
3629+ printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
3630+ print_carry("\tnode", op->node);
3631+ switch (op->op) {
3632+ case COP_INSERT:
3633+ case COP_PASTE:
3634+ print_coord("\tcoord",
3635+ op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
3636+ reiser4_print_key("\tkey",
3637+ op->u.insert.d ? op->u.insert.d->key : NULL);
3638+ print_carry("\tchild", op->u.insert.child);
3639+ break;
3640+ case COP_DELETE:
3641+ print_carry("\tchild", op->u.delete.child);
3642+ break;
3643+ case COP_CUT:
3644+ if (op->u.cut_or_kill.is_cut) {
3645+ print_coord("\tfrom",
3646+ op->u.cut_or_kill.u.kill->params.from, 0);
3647+ print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
3648+ 0);
3649+ } else {
3650+ print_coord("\tfrom",
3651+ op->u.cut_or_kill.u.cut->params.from, 0);
3652+ print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
3653+ 0);
3654+ }
3655+ break;
3656+ case COP_UPDATE:
3657+ print_carry("\tleft", op->u.update.left);
3658+ break;
3659+ default:
3660+ /* do nothing */
3661+ break;
3662+ }
3663+}
3664+
3665+/* dump information about all nodes and operations in a @level */
3666+static void print_level(const char *prefix /* prefix to print */ ,
3667+ carry_level * level /* level to print */ )
3668+{
3669+ carry_node *node;
3670+ carry_node *tmp_node;
3671+ carry_op *op;
3672+ carry_op *tmp_op;
3673+
3674+ if (level == NULL) {
3675+ printk("%s: null\n", prefix);
3676+ return;
3677+ }
3678+ printk("%s: %p, restartable: %s\n",
3679+ prefix, level, tf(level->restartable));
3680+
3681+ for_all_nodes(level, node, tmp_node)
3682+ print_carry("\tcarry node", node);
3683+ for_all_ops(level, op, tmp_op)
3684+ print_op("\tcarry op", op);
3685+}
3686+
3687+/* Make Linus happy.
3688+ Local variables:
3689+ c-indentation-style: "K&R"
3690+ mode-name: "LC"
3691+ c-basic-offset: 8
3692+ tab-width: 8
3693+ fill-column: 120
3694+ scroll-step: 1
3695+ End:
3696+*/
3697diff -urN linux-2.6.23.orig/fs/reiser4/carry.h linux-2.6.23/fs/reiser4/carry.h
3698--- linux-2.6.23.orig/fs/reiser4/carry.h 1970-01-01 03:00:00.000000000 +0300
3699+++ linux-2.6.23/fs/reiser4/carry.h 2007-12-04 16:49:30.000000000 +0300
3700@@ -0,0 +1,442 @@
3701+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3702+
3703+/* Functions and data types to "carry" tree modification(s) upward.
3704+ See fs/reiser4/carry.c for details. */
3705+
3706+#if !defined( __FS_REISER4_CARRY_H__ )
3707+#define __FS_REISER4_CARRY_H__
3708+
3709+#include "forward.h"
3710+#include "debug.h"
3711+#include "pool.h"
3712+#include "znode.h"
3713+
3714+#include <linux/types.h>
3715+
3716+/* &carry_node - "location" of carry node.
3717+
3718+ "location" of node that is involved or going to be involved into
3719+ carry process. Node where operation will be carried to on the
3720+ parent level cannot be recorded explicitly. Operation will be carried
3721+ usually to the parent of some node (where changes are performed at
3722+ the current level) or, to the left neighbor of its parent. But while
3723+ modifications are performed at the current level, parent may
3724+ change. So, we have to allow some indirection (or, positevly,
3725+ flexibility) in locating carry nodes.
3726+
3727+*/
3728+typedef struct carry_node {
3729+ /* pool linkage */
3730+ struct reiser4_pool_header header;
3731+
3732+ /* base node from which real_node is calculated. See
3733+ fs/reiser4/carry.c:lock_carry_node(). */
3734+ znode *node;
3735+
3736+ /* how to get ->real_node */
3737+ /* to get ->real_node obtain parent of ->node */
3738+ __u32 parent:1;
3739+ /* to get ->real_node obtain left neighbor of parent of
3740+ ->node */
3741+ __u32 left:1;
3742+ __u32 left_before:1;
3743+
3744+ /* locking */
3745+
3746+ /* this node was locked by carry process and should be
3747+ unlocked when carry leaves a level */
3748+ __u32 unlock:1;
3749+
3750+ /* disk block for this node was allocated by carry process and
3751+ should be deallocated when carry leaves a level */
3752+ __u32 deallocate:1;
3753+ /* this carry node was allocated by carry process and should be
3754+ freed when carry leaves a level */
3755+ __u32 free:1;
3756+
3757+ /* type of lock we want to take on this node */
3758+ lock_handle lock_handle;
3759+} carry_node;
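[Editor's note] A sketch of how the flag bits above select the real
node. The actual resolution is performed by lock_carry_node() in
carry.c; left_neighbor_of() is a hypothetical placeholder for the
sibling lookup, and znode_parent() is assumed to be the znode.h
accessor:

	/* sketch only: mirrors the flag semantics documented above */
	static znode *carry_real_sketch(const carry_node *cn)
	{
		if (cn->left)
			/* left neighbor of the parent of ->node */
			return left_neighbor_of(znode_parent(cn->node));
		if (cn->parent)
			/* parent of ->node */
			return znode_parent(cn->node);
		return cn->node;
	}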
3760+
3761+/* &carry_opcode - elementary operations that can be carried upward
3762+
3763+ Operations that carry() can handle. This list is supposed to be
3764+ expanded.
3765+
3766+ Each carry operation (cop) is handled by appropriate function defined
3767+ in fs/reiser4/carry.c. For example COP_INSERT is handled by
3768+ fs/reiser4/carry.c:carry_insert() etc. These functions in turn
3769+ call plugins of nodes affected by operation to modify nodes' content
3770+ and to gather operations to be performed on the next level.
3771+
3772+*/
3773+typedef enum {
3774+ /* insert new item into node. */
3775+ COP_INSERT,
3776+ /* delete pointer from parent node */
3777+ COP_DELETE,
3778+ /* remove part of or whole node. */
3779+ COP_CUT,
3780+ /* increase size of item. */
3781+ COP_PASTE,
3782+ /* insert extent (that is sequence of unformatted nodes). */
3783+ COP_EXTENT,
3784+ /* update delimiting key in least common ancestor of two
3785+ nodes. This is performed when items are moved between two
3786+ nodes.
3787+ */
3788+ COP_UPDATE,
3789+ /* insert flow */
3790+ COP_INSERT_FLOW,
3791+ COP_LAST_OP,
3792+} carry_opcode;
3793+
3794+#define CARRY_FLOW_NEW_NODES_LIMIT 20
3795+
3796+/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
3797+ item is determined. */
3798+typedef enum {
3799+ /* target item is one containing pointer to the ->child node */
3800+ COPT_CHILD,
3801+ /* target item is given explicitly by @coord */
3802+ COPT_ITEM_DATA,
3803+ /* target item is given by key */
3804+ COPT_KEY,
3805+ /* see insert_paste_common() for more comments on this. */
3806+ COPT_PASTE_RESTARTED,
3807+} cop_insert_pos_type;
3808+
3809+/* flags to cut and delete */
3810+typedef enum {
3811+ /* don't kill node even if it became completely empty as results of
3812+ * cut. This is needed for eottl handling. See carry_extent() for
3813+ * details. */
3814+ DELETE_RETAIN_EMPTY = (1 << 0)
3815+} cop_delete_flag;
3816+
3817+/*
3818+ * carry() implements "lock handle tracking" feature.
3819+ *
3820+ * Callers supply carry with the node where the initial operation is to be
3821+ * performed and a lock handle on this node. Trying to optimize node
3822+ * utilization, carry may actually move the insertion point to a different
3823+ * node. Callers expect the lock handle to be transferred to the new node, too.
3824+ *
3825+ */
3826+typedef enum {
3827+ /* transfer lock handle along with insertion point */
3828+ CARRY_TRACK_CHANGE = 1,
3829+ /* acquire new lock handle to the node where insertion point is. This
3830+ * is used when carry() client doesn't initially possess lock handle
3831+ * on the insertion point node, for example, by extent insertion
3832+ * code. See carry_extent(). */
3833+ CARRY_TRACK_NODE = 2
3834+} carry_track_type;
3835+
3836+/* data supplied to COP_{INSERT|PASTE} by callers */
3837+typedef struct carry_insert_data {
3838+ /* position where new item is to be inserted */
3839+ coord_t *coord;
3840+ /* new item description */
3841+ reiser4_item_data *data;
3842+ /* key of new item */
3843+ const reiser4_key *key;
3844+} carry_insert_data;
3845+
3846+/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
3847+struct cut_kill_params {
3848+ /* coord where cut starts (inclusive) */
3849+ coord_t *from;
3850+ /* coord where cut stops (inclusive, this item/unit will also be
3851+ * cut) */
3852+ coord_t *to;
3853+ /* starting key. This is necessary when item and unit pos don't
3854+ * uniquely identify what portion or tree to remove. For example, this
3855+ * indicates what portion of extent unit will be affected. */
3856+ const reiser4_key *from_key;
3857+ /* exclusive stop key */
3858+ const reiser4_key *to_key;
3859+ /* if this is not NULL, smallest actually removed key is stored
3860+ * here. */
3861+ reiser4_key *smallest_removed;
3862+ /* kill_node_content() is called for file truncate */
3863+ int truncate;
3864+};
3865+
3866+struct carry_cut_data {
3867+ struct cut_kill_params params;
3868+};
3869+
3870+struct carry_kill_data {
3871+ struct cut_kill_params params;
3872+ /* parameter to be passed to the ->kill_hook() method of item
3873+ * plugin */
3874+ /*void *iplug_params; *//* FIXME: unused currently */
3875+ /* if not NULL---inode whose items are being removed. This is needed
3876+ * for ->kill_hook() of extent item to update VM structures when
3877+ * removing pages. */
3878+ struct inode *inode;
3879+ /* sibling list maintenance is complicated by existence of eottl. When
3880+ * eottl whose left and right neighbors are formatted leaves is
3881+ * removed, one has to connect said leaves in the sibling list. This
3882+ * cannot be done when extent removal is just started as locking rules
3883+ * require sibling list update to happen atomically with removal of
3884+ * extent item. Therefore: 1. pointers to left and right neighbors
3885+ * have to be passed down to the ->kill_hook() of extent item, and
3886+ * 2. said neighbors have to be locked. */
3887+ lock_handle *left;
3888+ lock_handle *right;
3889+ /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
3890+ unsigned flags;
3891+ char *buf;
3892+};
3893+
3894+/* &carry_tree_op - operation to "carry" upward.
3895+
3896+ Description of an operation we want to "carry" to the upper level of
3897+ a tree: e.g., when we insert something and there is not enough space,
3898+ we allocate a new node and "carry" the operation of inserting a
3899+ pointer to the new node to the upper level; on removal of an empty node,
3900+ we carry up the operation of removing the appropriate entry from its parent.
3901+
3902+ There are two types of carry ops: when adding or deleting a node, the
3903+ node at the parent level where the appropriate modification has to be
3904+ performed is known in advance. When shifting items between nodes
3905+ (split, merge), the delimiting key should be changed in the least common
3906+ parent of the nodes involved, and that parent is not known in advance.
3907+
3908+ For operations of the first type we store in &carry_op a pointer to
3909+ the &carry_node at the parent level. For operations of the second type
3910+ we store &carry_node for the parents of the left and right nodes
3911+ modified, and keep track of them upward until they coincide.
3912+
3913+*/
3914+typedef struct carry_op {
3915+ /* pool linkage */
3916+ struct reiser4_pool_header header;
3917+ carry_opcode op;
3918+ /* node on which operation is to be performed:
3919+
3920+ for insert, paste: node where new item is to be inserted
3921+
3922+ for delete: node where pointer is to be deleted
3923+
3924+ for cut: node to cut from
3925+
3926+ for update: node where delimiting key is to be modified
3927+
3928+ for modify: parent of modified node
3929+
3930+ */
3931+ carry_node *node;
3932+ union {
3933+ struct {
3934+ /* (sub-)type of insertion/paste. Taken from
3935+ cop_insert_pos_type. */
3936+ __u8 type;
3937+ /* various operation flags. Taken from
3938+ cop_insert_flag. */
3939+ __u8 flags;
3940+ carry_insert_data *d;
3941+ carry_node *child;
3942+ znode *brother;
3943+ } insert, paste, extent;
3944+
3945+ struct {
3946+ int is_cut;
3947+ union {
3948+ carry_kill_data *kill;
3949+ carry_cut_data *cut;
3950+ } u;
3951+ } cut_or_kill;
3952+
3953+ struct {
3954+ carry_node *left;
3955+ } update;
3956+ struct {
3957+ /* changed child */
3958+ carry_node *child;
3959+ /* bitmask of changes. See &cop_modify_flag */
3960+ __u32 flag;
3961+ } modify;
3962+ struct {
3963+ /* flags to deletion operation. Are taken from
3964+ cop_delete_flag */
3965+ __u32 flags;
3966+ /* child to delete from parent. If this is
3967+ NULL, delete op->node. */
3968+ carry_node *child;
3969+ } delete;
3970+ struct {
3971+ /* various operation flags. Taken from
3972+ cop_insert_flag. */
3973+ __u32 flags;
3974+ flow_t *flow;
3975+ coord_t *insert_point;
3976+ reiser4_item_data *data;
3977+ /* flow insertion is limited by the number of new blocks
3978+ added in that operation which get no data other
3979+ than part of the flow. This limit is set by the macro
3980+ CARRY_FLOW_NEW_NODES_LIMIT. This field stores the number
3981+ of nodes already added during one carry_flow
3982+ int new_nodes;
3983+ } insert_flow;
3984+ } u;
3985+} carry_op;
3986+
3987+/* &carry_op_pool - preallocated pool of carry operations, and nodes */
3988+typedef struct carry_pool {
3989+ carry_op op[CARRIES_POOL_SIZE];
3990+ struct reiser4_pool op_pool;
3991+ carry_node node[NODES_LOCKED_POOL_SIZE];
3992+ struct reiser4_pool node_pool;
3993+} carry_pool;
3994+
3995+/* &carry_tree_level - carry process on given level
3996+
3997+ Description of balancing process on the given level.
3998+
3999+ No need for locking here, as carry_tree_level is essentially per
4000+ thread thing (for now).
4001+
4002+*/
4003+struct carry_level {
4004+ /* this level may be restarted */
4005+ __u32 restartable:1;
4006+ /* list of carry nodes on this level, ordered by key order */
4007+ struct list_head nodes;
4008+ struct list_head ops;
4009+ /* pool where new objects are allocated from */
4010+ carry_pool *pool;
4011+ int ops_num;
4012+ int nodes_num;
4013+ /* new root created on this level, if any */
4014+ znode *new_root;
4015+ /* This is set by callers (insert_by_key(), reiser4_resize_item(), etc.)
4016+ when they want ->tracked to automagically wander to the node where
4017+ the insertion point moved after an insert or paste.
4018+ */
4019+ carry_track_type track_type;
4020+ /* lock handle supplied by user that we are tracking. See
4021+ above. */
4022+ lock_handle *tracked;
4023+};
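[Editor's note] To illustrate the two tracking fields above: a caller
that wants its lock handle to follow the insertion point would set the
level up roughly like this (a sketch; see make_space_tail() in
carry_ops.c for the code that honors these fields):

	static void track_insertion_point_sketch(carry_level *doing,
						 lock_handle *lh /* caller-owned */)
	{
		init_lh(lh);
		doing->track_type = CARRY_TRACK_CHANGE;
		doing->tracked = lh;
		/* after carry completes, @lh refers to the node where
		   the insertion point ended up */
	}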
4024+
4025+/* information carry passes to plugin methods that may add new operations to
4026+ the @todo queue */
4027+struct carry_plugin_info {
4028+ carry_level *doing;
4029+ carry_level *todo;
4030+};
4031+
4032+int reiser4_carry(carry_level * doing, carry_level * done);
4033+
4034+carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
4035+ carry_node * reference);
4036+carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
4037+ carry_node * reference);
4038+
4039+extern carry_node *insert_carry_node(carry_level * doing,
4040+ carry_level * todo, const znode * node);
4041+
4042+extern carry_pool *init_carry_pool(int);
4043+extern void done_carry_pool(carry_pool * pool);
4044+
4045+extern void init_carry_level(carry_level * level, carry_pool * pool);
4046+
4047+extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
4048+ znode * node, int apply_to_parent);
4049+extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4050+ znode * node, int apply_to_parent_p);
4051+
4052+carry_node *add_new_znode(znode * brother, carry_node * reference,
4053+ carry_level * doing, carry_level * todo);
4054+
4055+carry_node *find_carry_node(carry_level * level, const znode * node);
4056+
4057+extern znode *reiser4_carry_real(const carry_node * node);
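[Editor's note] Putting the declarations above together, a sketch of
the typical call sequence. The pool sizing idiom (carving the level
out of the same allocation) and the NULL "done" argument to
reiser4_carry() are assumptions inferred from these declarations, not
guaranteed by them:

	static int post_and_carry_sketch(znode *node)
	{
		carry_pool *pool;
		carry_level *doing;
		carry_op *op;
		int result;

		/* assumed idiom: allocate the level together with the pool */
		pool = init_carry_pool(sizeof(*pool) + sizeof(*doing));
		if (IS_ERR(pool))
			return PTR_ERR(pool);
		doing = (carry_level *) (pool + 1);
		init_carry_level(doing, pool);

		op = reiser4_post_carry(doing, COP_INSERT, node,
					0 /* apply to @node, not its parent */);
		if (IS_ERR(op)) {
			done_carry_pool(pool);
			return PTR_ERR(op);
		}
		/* ... fill in op->u.insert here ... */
		result = reiser4_carry(doing, NULL /* assumed: no "done" queue */);
		done_carry_pool(pool);
		return result;
	}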
4058+
4059+/* helper macros to iterate over carry queues */
4060+
4061+#define carry_node_next( node ) \
4062+ list_entry((node)->header.level_linkage.next, carry_node, \
4063+ header.level_linkage)
4064+
4065+#define carry_node_prev( node ) \
4066+ list_entry((node)->header.level_linkage.prev, carry_node, \
4067+ header.level_linkage)
4068+
4069+#define carry_node_front( level ) \
4070+ list_entry((level)->nodes.next, carry_node, header.level_linkage)
4071+
4072+#define carry_node_back( level ) \
4073+ list_entry((level)->nodes.prev, carry_node, header.level_linkage)
4074+
4075+#define carry_node_end( level, node ) \
4076+ (&(level)->nodes == &(node)->header.level_linkage)
4077+
4078+/* macro to iterate over all operations in a @level */
4079+#define for_all_ops( level /* carry level (of type carry_level *) */, \
4080+ op /* pointer to carry operation, modified by loop (of \
4081+ * type carry_op *) */, \
4082+ tmp /* pointer to carry operation (of type carry_op *), \
4083+ * used to make iterator stable in the face of \
4084+ * deletions from the level */ ) \
4085+for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \
4086+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \
4087+ &op->header.level_linkage != &level->ops; \
4088+ op = tmp, \
4089+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
4090+
4091+#if 0
4092+for( op = ( carry_op * ) pool_level_list_front( &level -> ops ), \
4093+ tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ; \
4094+ ! pool_level_list_end( &level -> ops, &op -> header ) ; \
4095+ op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) )
4096+#endif
4097+
4098+/* macro to iterate over all nodes in a @level */
4099+#define for_all_nodes( level /* carry level (of type carry_level *) */, \
4100+ node /* pointer to carry node, modified by loop (of \
4101+ * type carry_node *) */, \
4102+ tmp /* pointer to carry node (of type carry_node *), \
4103+ * used to make iterator stable in the face of \
4104+ * deletions from the level */ ) \
4105+for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \
4106+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \
4107+ &node->header.level_linkage != &level->nodes; \
4108+ node = tmp, \
4109+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
4110+
4111+#if 0
4112+for( node = carry_node_front( level ), \
4113+ tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ; \
4114+ node = tmp, tmp = carry_node_next( node ) )
4115+#endif
4116+
4117+/* macro to iterate over all nodes in a @level in reverse order
4118+
4119+ This is used, because nodes are unlocked in reversed order of locking */
4120+#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \
4121+ node /* pointer to carry node, modified by loop \
4122+ * (of type carry_node *) */, \
4123+ tmp /* pointer to carry node (of type carry_node \
4124+ * *), used to make iterator stable in the \
4125+ * face of deletions from the level */ ) \
4126+for( node = carry_node_back( level ), \
4127+ tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \
4128+ node = tmp, tmp = carry_node_prev( node ) )
4129+
4130+/* __FS_REISER4_CARRY_H__ */
4131+#endif
4132+
4133+/* Make Linus happy.
4134+ Local variables:
4135+ c-indentation-style: "K&R"
4136+ mode-name: "LC"
4137+ c-basic-offset: 8
4138+ tab-width: 8
4139+ fill-column: 120
4140+ scroll-step: 1
4141+ End:
4142+*/
4143diff -urN linux-2.6.23.orig/fs/reiser4/carry_ops.c linux-2.6.23/fs/reiser4/carry_ops.c
4144--- linux-2.6.23.orig/fs/reiser4/carry_ops.c 1970-01-01 03:00:00.000000000 +0300
4145+++ linux-2.6.23/fs/reiser4/carry_ops.c 2007-12-04 16:49:30.000000000 +0300
4146@@ -0,0 +1,2131 @@
4147+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4148+
4149+/* implementation of carry operations */
4150+
4151+#include "forward.h"
4152+#include "debug.h"
4153+#include "key.h"
4154+#include "coord.h"
4155+#include "plugin/item/item.h"
4156+#include "plugin/node/node.h"
4157+#include "jnode.h"
4158+#include "znode.h"
4159+#include "block_alloc.h"
4160+#include "tree_walk.h"
4161+#include "pool.h"
4162+#include "tree_mod.h"
4163+#include "carry.h"
4164+#include "carry_ops.h"
4165+#include "tree.h"
4166+#include "super.h"
4167+#include "reiser4.h"
4168+
4169+#include <linux/types.h>
4170+#include <linux/err.h>
4171+
4172+static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4173+ carry_level * doing, carry_level * todo,
4174+ unsigned int including_insert_coord_p);
4175+
4176+extern int lock_carry_node(carry_level * level, carry_node * node);
4177+extern int lock_carry_node_tail(carry_node * node);
4178+
4179+/* find left neighbor of a carry node
4180+
4181+ Look for left neighbor of @node and add it to the @doing queue. See
4182+ comments in the body.
4183+
4184+*/
4185+static carry_node *find_left_neighbor(carry_op * op /* node to find left
4186+ * neighbor of */ ,
4187+ carry_level * doing /* level to scan */ )
4188+{
4189+ int result;
4190+ carry_node *node;
4191+ carry_node *left;
4192+ int flags;
4193+ reiser4_tree *tree;
4194+
4195+ node = op->node;
4196+
4197+ tree = current_tree;
4198+ read_lock_tree(tree);
4199+ /* first, check whether left neighbor is already in a @doing queue */
4200+ if (reiser4_carry_real(node)->left != NULL) {
4201+ /* NOTE: there is locking subtlety here. Look into
4202+ * find_right_neighbor() for more info */
4203+ if (find_carry_node(doing,
4204+ reiser4_carry_real(node)->left) != NULL) {
4205+ read_unlock_tree(tree);
4206+ left = node;
4207+ do {
4208+ left = list_entry(left->header.level_linkage.prev,
4209+ carry_node, header.level_linkage);
4210+ assert("nikita-3408", !carry_node_end(doing,
4211+ left));
4212+ } while (reiser4_carry_real(left) ==
4213+ reiser4_carry_real(node));
4214+ return left;
4215+ }
4216+ }
4217+ read_unlock_tree(tree);
4218+
4219+ left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
4220+ if (IS_ERR(left))
4221+ return left;
4222+
4223+ left->node = node->node;
4224+ left->free = 1;
4225+
4226+ flags = GN_TRY_LOCK;
4227+ if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4228+ flags |= GN_NO_ALLOC;
4229+
4230+ /* then, feeling lucky, peek left neighbor in the cache. */
4231+ result = reiser4_get_left_neighbor(&left->lock_handle,
4232+ reiser4_carry_real(node),
4233+ ZNODE_WRITE_LOCK, flags);
4234+ if (result == 0) {
4235+ /* ok, node found and locked. */
4236+ result = lock_carry_node_tail(left);
4237+ if (result != 0)
4238+ left = ERR_PTR(result);
4239+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4240+ /* node is leftmost node in a tree, or neighbor wasn't in
4241+ cache, or there is an extent on the left. */
4242+ reiser4_pool_free(&doing->pool->node_pool, &left->header);
4243+ left = NULL;
4244+ } else if (doing->restartable) {
4245+ /* if left neighbor is locked, and level is restartable, add
4246+ new node to @doing and restart. */
4247+ assert("nikita-913", node->parent != 0);
4248+ assert("nikita-914", node->node != NULL);
4249+ left->left = 1;
4250+ left->free = 0;
4251+ left = ERR_PTR(-E_REPEAT);
4252+ } else {
4253+ /* left neighbor is locked, level cannot be restarted. Just
4254+ ignore left neighbor. */
4255+ reiser4_pool_free(&doing->pool->node_pool, &left->header);
4256+ left = NULL;
4257+ }
4258+ return left;
4259+}
4260+
4261+/* find right neighbor of a carry node
4262+
4263+ Look for right neighbor of @node and add it to the @doing queue. See
4264+ comments in the body.
4265+
4266+*/
4267+static carry_node *find_right_neighbor(carry_op * op /* node to find right
4268+ * neighbor of */ ,
4269+ carry_level * doing /* level to scan */ )
4270+{
4271+ int result;
4272+ carry_node *node;
4273+ carry_node *right;
4274+ lock_handle lh;
4275+ int flags;
4276+ reiser4_tree *tree;
4277+
4278+ init_lh(&lh);
4279+
4280+ node = op->node;
4281+
4282+ tree = current_tree;
4283+ read_lock_tree(tree);
4284+ /* first, check whether right neighbor is already in a @doing queue */
4285+ if (reiser4_carry_real(node)->right != NULL) {
4286+ /*
4287+ * Tree lock is taken here anyway because, even if the _outcome_
4288+ * of (find_carry_node() != NULL) doesn't depend on
4289+ * concurrent updates to ->right, find_carry_node() cannot
4290+ * work with a NULL second argument. Hence, the following
4291+ * comment is of historical importance only.
4292+ *
4293+ * Subtle:
4294+ *
4295+ * Q: why don't we need tree lock here, looking for the right
4296+ * neighbor?
4297+ *
4298+ * A: even if value of node->real_node->right were changed
4299+ * during find_carry_node() execution, outcome of execution
4300+ * wouldn't change, because (in short) other thread cannot add
4301+ * elements to the @doing, and if node->real_node->right
4302+ * already was in @doing, value of node->real_node->right
4303+ * couldn't change, because node cannot be inserted between
4304+ * locked neighbors.
4305+ */
4306+ if (find_carry_node(doing,
4307+ reiser4_carry_real(node)->right) != NULL) {
4308+ read_unlock_tree(tree);
4309+ /*
4310+ * What we are doing here (this is also applicable to
4311+ * the find_left_neighbor()).
4312+ *
4313+ * tree_walk.c code requires that insertion of a
4314+ * pointer to a child, modification of parent pointer
4315+ * in the child, and insertion of the child into
4316+ * sibling list are atomic (see
4317+ * plugin/item/internal.c:create_hook_internal()).
4318+ *
4319+ * carry allocates new node long before pointer to it
4320+ * is inserted into parent and, actually, long before
4321+ * parent is even known. Such allocated-but-orphaned
4322+ * nodes are only trackable through carry level lists.
4323+ *
4324+ * Situation that is handled here is following: @node
4325+ * has valid ->right pointer, but there is
4326+ * allocated-but-orphaned node in the carry queue that
4327+ * is logically between @node and @node->right. Here
4328+ * we are searching for it. Critical point is that
4329+ * this is only possible if @node->right is also in
4330+ * the carry queue (this is checked above), because
4331+ * this is the only way new orphaned node could be
4332+ * inserted between them (before inserting new node,
4333+ * make_space() first tries to shift to the right, so,
4334+ * right neighbor will be locked and queued).
4335+ *
4336+ */
4337+ right = node;
4338+ do {
4339+ right = list_entry(right->header.level_linkage.next,
4340+ carry_node, header.level_linkage);
4341+ assert("nikita-3408", !carry_node_end(doing,
4342+ right));
4343+ } while (reiser4_carry_real(right) ==
4344+ reiser4_carry_real(node));
4345+ return right;
4346+ }
4347+ }
4348+ read_unlock_tree(tree);
4349+
4350+ flags = GN_CAN_USE_UPPER_LEVELS;
4351+ if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
4352+ flags = GN_NO_ALLOC;
4353+
4354+ /* then, try to lock right neighbor */
4355+ init_lh(&lh);
4356+ result = reiser4_get_right_neighbor(&lh,
4357+ reiser4_carry_real(node),
4358+ ZNODE_WRITE_LOCK, flags);
4359+ if (result == 0) {
4360+ /* ok, node found and locked. */
4361+ right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
4362+ if (!IS_ERR(right)) {
4363+ right->node = lh.node;
4364+ move_lh(&right->lock_handle, &lh);
4365+ right->free = 1;
4366+ result = lock_carry_node_tail(right);
4367+ if (result != 0)
4368+ right = ERR_PTR(result);
4369+ }
4370+ } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4371+ /* node is rightmost node in a tree, or neighbor wasn't in
4372+ cache, or there is an extent on the right. */
4373+ right = NULL;
4374+ } else
4375+ right = ERR_PTR(result);
4376+ done_lh(&lh);
4377+ return right;
4378+}
4379+
4380+/* how much free space in a @node is needed for @op
4381+
4382+ How much space in @node is required for completion of @op, where @op is
4383+ insert or paste operation.
4384+*/
4385+static unsigned int space_needed_for_op(znode * node /* znode data are
4386+ * inserted or
4387+ * pasted in */ ,
4388+ carry_op * op /* carry
4389+ operation */ )
4390+{
4391+ assert("nikita-919", op != NULL);
4392+
4393+ switch (op->op) {
4394+ default:
4395+ impossible("nikita-1701", "Wrong opcode");
4396+ case COP_INSERT:
4397+ return space_needed(node, NULL, op->u.insert.d->data, 1);
4398+ case COP_PASTE:
4399+ return space_needed(node, op->u.insert.d->coord,
4400+ op->u.insert.d->data, 0);
4401+ }
4402+}
4403+
4404+/* how much space in @node is required to insert or paste @data at
4405+ @coord. */
4406+unsigned int space_needed(const znode * node /* node data are inserted or
4407+ * pasted in */ ,
4408+ const coord_t * coord /* coord where data are
4409+ * inserted or pasted
4410+ * at */ ,
4411+ const reiser4_item_data * data /* data to insert or
4412+ * paste */ ,
4413+ int insertion /* non-0 is inserting, 0---paste */ )
4414+{
4415+ int result;
4416+ item_plugin *iplug;
4417+
4418+ assert("nikita-917", node != NULL);
4419+ assert("nikita-918", node_plugin_by_node(node) != NULL);
4420+ assert("vs-230", !insertion || (coord == NULL));
4421+
4422+ result = 0;
4423+ iplug = data->iplug;
4424+ if (iplug->b.estimate != NULL) {
4425+ /* ask item plugin how much space is needed to insert this
4426+ item */
4427+ result += iplug->b.estimate(insertion ? NULL : coord, data);
4428+ } else {
4429+ /* reasonable default */
4430+ result += data->length;
4431+ }
4432+ if (insertion) {
4433+ node_plugin *nplug;
4434+
4435+ nplug = node->nplug;
4436+ /* and add node overhead */
4437+ if (nplug->item_overhead != NULL) {
4438+ result += nplug->item_overhead(node, NULL);
4439+ }
4440+ }
4441+ return result;
4442+}
4443+
4444+/* find &coord in parent where pointer to new child is to be stored. */
4445+static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to
4446+ * insert pointer to new
4447+ * child */ )
4448+{
4449+ int result;
4450+ znode *node;
4451+ znode *child;
4452+
4453+ assert("nikita-941", op != NULL);
4454+ assert("nikita-942", op->op == COP_INSERT);
4455+
4456+ node = reiser4_carry_real(op->node);
4457+ assert("nikita-943", node != NULL);
4458+ assert("nikita-944", node_plugin_by_node(node) != NULL);
4459+
4460+ child = reiser4_carry_real(op->u.insert.child);
4461+ result =
4462+ find_new_child_ptr(node, child, op->u.insert.brother,
4463+ op->u.insert.d->coord);
4464+
4465+ build_child_ptr_data(child, op->u.insert.d->data);
4466+ return result;
4467+}
4468+
4469+/* additional amount of free space in @node required to complete @op */
4470+static int free_space_shortage(znode * node /* node to check */ ,
4471+ carry_op * op /* operation being performed */ )
4472+{
4473+ assert("nikita-1061", node != NULL);
4474+ assert("nikita-1062", op != NULL);
4475+
4476+ switch (op->op) {
4477+ default:
4478+ impossible("nikita-1702", "Wrong opcode");
4479+ case COP_INSERT:
4480+ case COP_PASTE:
4481+ return space_needed_for_op(node, op) - znode_free_space(node);
4482+ case COP_EXTENT:
4483+ /* when inserting an extent, shift data around until the
4484+ insertion point is at the very edge of the node. */
4485+ if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4486+ return +1;
4487+ else
4488+ return -1;
4489+ }
4490+}
4491+
4492+/* helper function: update node pointer in operation after insertion
4493+ point was probably shifted into @target. */
4494+static znode *sync_op(carry_op * op, carry_node * target)
4495+{
4496+ znode *insertion_node;
4497+
4498+ /* reget node from coord: shift might move insertion coord to
4499+ the neighbor */
4500+ insertion_node = op->u.insert.d->coord->node;
4501+ /* if insertion point was actually moved into new node,
4502+ update carry node pointer in operation. */
4503+ if (insertion_node != reiser4_carry_real(op->node)) {
4504+ op->node = target;
4505+ assert("nikita-2540",
4506+ reiser4_carry_real(target) == insertion_node);
4507+ }
4508+ assert("nikita-2541",
4509+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4510+ return insertion_node;
4511+}
4512+
4513+/*
4514+ * complete make_space() call: update tracked lock handle if necessary. See
4515+ * comments for fs/reiser4/carry.h:carry_track_type
4516+ */
4517+static int
4518+make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
4519+{
4520+ int result;
4521+ carry_track_type tracking;
4522+ znode *node;
4523+
4524+ tracking = doing->track_type;
4525+ node = op->u.insert.d->coord->node;
4526+
4527+ if (tracking == CARRY_TRACK_NODE ||
4528+ (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
4529+ /* inserting or pasting into node different from
4530+ original. Update lock handle supplied by caller. */
4531+ assert("nikita-1417", doing->tracked != NULL);
4532+ done_lh(doing->tracked);
4533+ init_lh(doing->tracked);
4534+ result = longterm_lock_znode(doing->tracked, node,
4535+ ZNODE_WRITE_LOCK,
4536+ ZNODE_LOCK_HIPRI);
4537+ } else
4538+ result = 0;
4539+ return result;
4540+}
4541+
4542+/* This is insertion policy function. It shifts data to the left and right
4543+ neighbors of insertion coord and allocates new nodes until there is enough
4544+ free space to complete @op.
4545+
4546+ See comments in the body.
4547+
4548+ Assumes that the node format favors insertions at the right end of the node
4549+ as node40 does.
4550+
4551+ See carry_flow() on detail about flow insertion
4552+*/
4553+static int make_space(carry_op * op /* carry operation, insert or paste */ ,
4554+ carry_level * doing /* current carry queue */ ,
4555+ carry_level * todo /* carry queue on the parent level */ )
4556+{
4557+ znode *node;
4558+ int result;
4559+ int not_enough_space;
4560+ int blk_alloc;
4561+ znode *orig_node;
4562+ __u32 flags;
4563+
4564+ coord_t *coord;
4565+
4566+ assert("nikita-890", op != NULL);
4567+ assert("nikita-891", todo != NULL);
4568+ assert("nikita-892",
4569+ op->op == COP_INSERT ||
4570+ op->op == COP_PASTE || op->op == COP_EXTENT);
4571+ assert("nikita-1607",
4572+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4573+
4574+ flags = op->u.insert.flags;
4575+
4576+ /* NOTE: a new node can only be allocated after checking the left
4577+ * and right neighbors. This is necessary for the proper working of
4578+ * find_{left,right}_neighbor(). */
4579+ assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
4580+ flags & COPI_DONT_SHIFT_LEFT));
4581+ assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
4582+ flags & COPI_DONT_SHIFT_RIGHT));
4583+
4584+ coord = op->u.insert.d->coord;
4585+ orig_node = node = coord->node;
4586+
4587+ assert("nikita-908", node != NULL);
4588+ assert("nikita-909", node_plugin_by_node(node) != NULL);
4589+
4590+ result = 0;
4591+ /* If there is not enough space in a node, try to shift something to
4592+ the left neighbor. This is a bit tricky, as locking to the left is
4593+ low priority. This is handled by restart logic in carry().
4594+ */
4595+ not_enough_space = free_space_shortage(node, op);
4596+ if (not_enough_space <= 0)
4597+ /* it is possible that carry was called when there actually
4598+ was enough space in the node. For example, when inserting
4599+ leftmost item so that delimiting keys have to be updated.
4600+ */
4601+ return make_space_tail(op, doing, orig_node);
4602+ if (!(flags & COPI_DONT_SHIFT_LEFT)) {
4603+ carry_node *left;
4604+ /* make note in statistics of an attempt to move
4605+ something into the left neighbor */
4606+ left = find_left_neighbor(op, doing);
4607+ if (unlikely(IS_ERR(left))) {
4608+ if (PTR_ERR(left) == -E_REPEAT)
4609+ return -E_REPEAT;
4610+ else {
4611+ /* some error other than restart request
4612+ occurred. This shouldn't happen. Issue a
4613+ warning and continue as if the left neighbor
4614+ didn't exist.
4615+ */
4616+ warning("nikita-924",
4617+ "Error accessing left neighbor: %li",
4618+ PTR_ERR(left));
4619+ }
4620+ } else if (left != NULL) {
4621+
4622+ /* shift everything possible on the left of and
4623+ including insertion coord into the left neighbor */
4624+ result = carry_shift_data(LEFT_SIDE, coord,
4625+ reiser4_carry_real(left),
4626+ doing, todo,
4627+ flags & COPI_GO_LEFT);
4628+
4629+ /* reget node from coord: shift_left() might move
4630+ insertion coord to the left neighbor */
4631+ node = sync_op(op, left);
4632+
4633+ not_enough_space = free_space_shortage(node, op);
4634+ /* There is not enough free space in @node, but
4635+ may be, there is enough free space in
4636+ @left. Various balancing decisions are valid here.
4637+ The same holds for shifting to the right.
4638+ */
4639+ }
4640+ }
4641+ /* If there still is not enough space, shift to the right */
4642+ if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
4643+ carry_node *right;
4644+
4645+ right = find_right_neighbor(op, doing);
4646+ if (IS_ERR(right)) {
4647+ warning("nikita-1065",
4648+ "Error accessing right neighbor: %li",
4649+ PTR_ERR(right));
4650+ } else if (right != NULL) {
4651+ /* node containing insertion point, and its right
4652+ neighbor node are write locked by now.
4653+
4654+ shift everything possible on the right of but
4655+ excluding insertion coord into the right neighbor
4656+ */
4657+ result = carry_shift_data(RIGHT_SIDE, coord,
4658+ reiser4_carry_real(right),
4659+ doing, todo,
4660+ flags & COPI_GO_RIGHT);
4661+ /* reget node from coord: shift_right() might move
4662+ insertion coord to the right neighbor */
4663+ node = sync_op(op, right);
4664+ not_enough_space = free_space_shortage(node, op);
4665+ }
4666+ }
4667+ /* If there is still not enough space, allocate new node(s).
4668+
4669+ We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
4670+ the carry operation flags (currently this is needed during flush
4671+ only).
4672+ */
4673+ for (blk_alloc = 0;
4674+ not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
4675+ !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
4676+ carry_node *fresh; /* new node we are allocating */
4677+ coord_t coord_shadow; /* remembered insertion point before
4678+ * shifting data into new node */
4679+ carry_node *node_shadow; /* remembered insertion node before
4680+ * shifting */
4681+ unsigned int gointo; /* whether insertion point should move
4682+ * into newly allocated node */
4683+
4684+ /* allocate new node on the right of @node. Znode and disk
4685+ fake block number for new node are allocated.
4686+
4687+ add_new_znode() posts carry operation COP_INSERT with
4688+ COPT_CHILD option to the parent level to add
4689+ pointer to newly created node to its parent.
4690+
4691+ Subtle point: if several new nodes are required to complete
4692+ insertion operation at this level, they will be inserted
4693+ into their parents in the order of creation, which means
4694+ that @node will be valid "cookie" at the time of insertion.
4695+
4696+ */
4697+ fresh = add_new_znode(node, op->node, doing, todo);
4698+ if (IS_ERR(fresh))
4699+ return PTR_ERR(fresh);
4700+
4701+ /* Try to shift into new node. */
4702+ result = lock_carry_node(doing, fresh);
4703+ zput(reiser4_carry_real(fresh));
4704+ if (result != 0) {
4705+ warning("nikita-947",
4706+ "Cannot lock new node: %i", result);
4707+ return result;
4708+ }
4709+
4710+ /* both nodes are write locked by now.
4711+
4712+ shift everything possible on the right of and
4713+ including insertion coord into the right neighbor.
4714+ */
4715+ coord_dup(&coord_shadow, op->u.insert.d->coord);
4716+ node_shadow = op->node;
4717+ /* move insertion point into newly created node if:
4718+
4719+ . insertion point is rightmost in the source node, or
4720+ . this is not the first node we are allocating in a row.
4721+ */
4722+ gointo =
4723+ (blk_alloc > 0) ||
4724+ coord_is_after_rightmost(op->u.insert.d->coord);
4725+
4726+ if (gointo &&
4727+ op->op == COP_PASTE &&
4728+ coord_is_existing_item(op->u.insert.d->coord) &&
4729+ is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
4730+ /* paste into solid (atomic) item, which can contain
4731+ only one unit, so we need to shift it to the right,
4732+ to where the insertion point is supposed to be */
4733+
4734+ assert("edward-1444", op->u.insert.d->data->iplug ==
4735+ item_plugin_by_id(STATIC_STAT_DATA_ID));
4736+ assert("edward-1445",
4737+ op->u.insert.d->data->length >
4738+ node_plugin_by_node(coord->node)->free_space
4739+ (coord->node));
4740+
4741+ op->u.insert.d->coord->between = BEFORE_UNIT;
4742+ }
4743+
4744+ result = carry_shift_data(RIGHT_SIDE, coord,
4745+ reiser4_carry_real(fresh),
4746+ doing, todo, gointo);
4747+ /* if insertion point was actually moved into new node,
4748+ update carry node pointer in operation. */
4749+ node = sync_op(op, fresh);
4750+ not_enough_space = free_space_shortage(node, op);
4751+ if ((not_enough_space > 0) && (node != coord_shadow.node)) {
4752+ /* there is not enough free space in the new node. Shift
4753+ the insertion point back to @node_shadow so that the
4754+ next new node will be inserted between
4755+ @node_shadow and @fresh.
4756+ */
4757+ coord_normalize(&coord_shadow);
4758+ coord_dup(coord, &coord_shadow);
4759+ node = coord->node;
4760+ op->node = node_shadow;
4761+ if (1 || (flags & COPI_STEP_BACK)) {
4762+ /* still not enough space?! Maybe there is
4763+ enough space in the source node (i.e., node
4764+ data are moved from) now.
4765+ */
4766+ not_enough_space =
4767+ free_space_shortage(node, op);
4768+ }
4769+ }
4770+ }
4771+ if (not_enough_space > 0) {
4772+ if (!(flags & COPI_DONT_ALLOCATE))
4773+ warning("nikita-948", "Cannot insert new item");
4774+ result = -E_NODE_FULL;
4775+ }
4776+ assert("nikita-1622", ergo(result == 0,
4777+ reiser4_carry_real(op->node) == coord->node));
4778+ assert("nikita-2616", coord == op->u.insert.d->coord);
4779+ if (result == 0)
4780+ result = make_space_tail(op, doing, orig_node);
4781+ return result;
4782+}
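[Editor's note] A recap of the policy implemented by make_space(), in
pseudocode; the names are the function's own:

	/*
	 * shortage = free_space_shortage(node, op);
	 * if (shortage > 0 && !COPI_DONT_SHIFT_LEFT)
	 *	shift data, including the insertion coord, into the
	 *	left neighbor (restart the level if it is contended);
	 * if (shortage > 0 && !COPI_DONT_SHIFT_RIGHT)
	 *	shift data, excluding the insertion coord, into the
	 *	right neighbor;
	 * while (shortage > 0 && at most two allocations &&
	 *	  !COPI_DONT_ALLOCATE)
	 *	allocate a fresh node on the right and shift into it,
	 *	moving the insertion point there when it is rightmost;
	 * if (shortage > 0)
	 *	fail with -E_NODE_FULL;
	 */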
4783+
4784+/* insert_paste_common() - common part of insert and paste operations
4785+
4786+ This function performs common part of COP_INSERT and COP_PASTE.
4787+
4788+ There are three ways in which insertion/paste can be requested:
4789+
4790+ . by directly supplying reiser4_item_data. In this case, op ->
4791+ u.insert.type is set to COPT_ITEM_DATA.
4792+
4793+ . by supplying a pointer to the child that is to be inserted into its
4794+ parent. In this case op -> u.insert.type == COPT_CHILD.
4795+
4796+ . by supplying the key of the new item/unit. This is currently used only
4797+ during extent insertion.
4798+
4799+ This is required because, when a new node is allocated, we don't know at
4800+ what position the pointer to it is to be stored in the parent. Actually, we
4801+ don't even know what its parent will be, because the parent can be re-balanced
4802+ concurrently and the new node re-parented, and because the parent can be full
4803+ and the pointer to the new node will go into some other node.
4804+
4805+ insert_paste_common() resolves pointer to child node into position in the
4806+ parent by calling find_new_child_coord(), which fills
4807+ reiser4_item_data. After this, insertion/paste proceeds uniformly.
4808+
4809+ Another complication is finding free space during pasting. It may
4810+ happen that, while shifting items to the neighbors and to newly allocated
4811+ nodes, the insertion coord ends up no longer in the item we wanted to paste
4812+ into. At this point, paste becomes (morphs into) insert. Moreover, the free
4813+ space analysis has to be repeated, because the amount of space required for
4814+ an insertion is different from that of a paste (item header overhead, etc.).
4815+
4816+ This function "unifies" different insertion modes (by resolving child
4817+ pointer or key into insertion coord), and then calls make_space() to free
4818+ enough space in the node by shifting data to the left and right and by
4819+ allocating new nodes if necessary. Carry operation knows amount of space
4820+ required for its completion. After enough free space is obtained, caller of
4821+ this function (carry_{insert,paste,etc.}) performs actual insertion/paste
4822+ by calling item plugin method.
4823+
4824+*/
4825+static int insert_paste_common(carry_op * op /* carry operation being
4826+ * performed */ ,
4827+ carry_level * doing /* current carry level */ ,
4828+ carry_level * todo /* next carry level */ ,
4829+ carry_insert_data * cdata /* pointer to
4830+ * cdata */ ,
4831+ coord_t * coord /* insertion/paste coord */ ,
4832+ reiser4_item_data * data /* data to be
4833+ * inserted/pasted */ )
4834+{
4835+ assert("nikita-981", op != NULL);
4836+ assert("nikita-980", todo != NULL);
4837+ assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
4838+ || (op->op == COP_EXTENT));
4839+
4840+ if (op->u.insert.type == COPT_PASTE_RESTARTED) {
4841+ /* nothing to do. Fall through to make_space(). */
4842+ ;
4843+ } else if (op->u.insert.type == COPT_KEY) {
4844+ node_search_result intra_node;
4845+ znode *node;
4846+ /* The problem with doing batching at the lowest level is that
4847+ operations here are given by coords where the modification is
4848+ to be performed, and one modification can invalidate the coords
4849+ of all following operations.
4850+
4851+ So, we are implementing yet another operation type that
4852+ uses the only "locator" stable across shifting of
4853+ data between nodes, etc.: a key (COPT_KEY).
4854+
4855+ This clause resolves key to the coord in the node.
4856+
4857+ But node can change also. Probably some pieces have to be
4858+ added to the lock_carry_node(), to lock node by its key.
4859+
4860+ */
4861+ /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
4862+ if you need something else. */
4863+ op->u.insert.d->coord = coord;
4864+ node = reiser4_carry_real(op->node);
4865+ intra_node = node_plugin_by_node(node)->lookup
4866+ (node, op->u.insert.d->key, FIND_EXACT,
4867+ op->u.insert.d->coord);
4868+ if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
4869+ warning("nikita-1715", "Intra node lookup failure: %i",
4870+ intra_node);
4871+ return intra_node;
4872+ }
4873+ } else if (op->u.insert.type == COPT_CHILD) {
4874+ /* if we are asked to insert pointer to the child into
4875+ internal node, first convert pointer to the child into
4876+ coord within parent node.
4877+ */
4878+ znode *child;
4879+ int result;
4880+
4881+ op->u.insert.d = cdata;
4882+ op->u.insert.d->coord = coord;
4883+ op->u.insert.d->data = data;
4884+ op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4885+ result = find_new_child_coord(op);
4886+ child = reiser4_carry_real(op->u.insert.child);
4887+ if (result != NS_NOT_FOUND) {
4888+ warning("nikita-993",
4889+ "Cannot find a place for child pointer: %i",
4890+ result);
4891+ return result;
4892+ }
4893+ /* This only happens when we did multiple insertions at
4894+ the previous level, trying to insert a single item, and
4895+ it so happened that insertion of pointers to all the new
4896+ nodes before this one already caused the parent node to
4897+ split (maybe several times).
4898+
4899+ I am going to come up with a better solution.
4900+
4901+ You are not expected to understand this.
4902+ -- v6root/usr/sys/ken/slp.c
4903+
4904+ Basically, what happens here is the following: carry came
4905+ to the parent level and is about to insert internal item
4906+ pointing to the child node that it just inserted in the
4907+ level below. Position where internal item is to be inserted
4908+ was found by find_new_child_coord() above, but node of the
4909+ current carry operation (that is, parent node of child
4910+ inserted on the previous level), was determined earlier in
4911+ the lock_carry_level/lock_carry_node. It could so happen
4912+ that other carry operations performed earlier on the parent
4913+ level have already split the parent node, so that the
4914+ insertion point moved into another node. Handle this by
4915+ creating a new carry node for the insertion point if necessary.
4916+ */
4917+ if (reiser4_carry_real(op->node) !=
4918+ op->u.insert.d->coord->node) {
4919+ pool_ordering direction;
4920+ znode *z1;
4921+ znode *z2;
4922+ reiser4_key k1;
4923+ reiser4_key k2;
4924+
4925+ /*
4926+ * determine in what direction insertion point
4927+ * moved. Do this by comparing delimiting keys.
4928+ */
4929+ z1 = op->u.insert.d->coord->node;
4930+ z2 = reiser4_carry_real(op->node);
4931+ if (keyle(leftmost_key_in_node(z1, &k1),
4932+ leftmost_key_in_node(z2, &k2)))
4933+ /* insertion point moved to the left */
4934+ direction = POOLO_BEFORE;
4935+ else
4936+ /* insertion point moved to the right */
4937+ direction = POOLO_AFTER;
4938+
4939+ op->node = reiser4_add_carry_skip(doing,
4940+ direction, op->node);
4941+ if (IS_ERR(op->node))
4942+ return PTR_ERR(op->node);
4943+ op->node->node = op->u.insert.d->coord->node;
4944+ op->node->free = 1;
4945+ result = lock_carry_node(doing, op->node);
4946+ if (result != 0)
4947+ return result;
4948+ }
4949+
4950+ /*
4951+	 * set up the key of the item being inserted: we are inserting an
4952+	 * internal item, and its key is (by the very definition of a
4953+	 * search tree) the leftmost key in the child node.
4954+ */
4955+ write_lock_dk(znode_get_tree(child));
4956+ op->u.insert.d->key = leftmost_key_in_node(child,
4957+ znode_get_ld_key(child));
4958+ write_unlock_dk(znode_get_tree(child));
4959+ op->u.insert.d->data->arg = op->u.insert.brother;
4960+ } else {
4961+ assert("vs-243", op->u.insert.d->coord != NULL);
4962+ op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4963+ }
4964+
4965+ /* find free space. */
4966+ return make_space(op, doing, todo);
4967+}
4968+
4969+/* handle carry COP_INSERT operation.
4970+
4971+ Insert new item into node. New item can be given in one of two ways:
4972+
4973+ - by passing &tree_coord and &reiser4_item_data as part of @op. This is
4974+ only applicable at the leaf/twig level.
4975+
4976+   - by passing a child node, a pointer to which is to be inserted by this
4977+      operation.
4978+
4979+*/
4980+static int carry_insert(carry_op * op /* operation to perform */ ,
4981+ carry_level * doing /* queue of operations @op
4982+ * is part of */ ,
4983+ carry_level * todo /* queue where new operations
4984+ * are accumulated */ )
4985+{
4986+ znode *node;
4987+ carry_insert_data cdata;
4988+ coord_t coord;
4989+ reiser4_item_data data;
4990+ carry_plugin_info info;
4991+ int result;
4992+
4993+ assert("nikita-1036", op != NULL);
4994+ assert("nikita-1037", todo != NULL);
4995+ assert("nikita-1038", op->op == COP_INSERT);
4996+
4997+ coord_init_zero(&coord);
4998+
4999+ /* perform common functionality of insert and paste. */
5000+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5001+ if (result != 0)
5002+ return result;
5003+
5004+ node = op->u.insert.d->coord->node;
5005+ assert("nikita-1039", node != NULL);
5006+ assert("nikita-1040", node_plugin_by_node(node) != NULL);
5007+
5008+ assert("nikita-949",
5009+ space_needed_for_op(node, op) <= znode_free_space(node));
5010+
5011+ /* ask node layout to create new item. */
5012+ info.doing = doing;
5013+ info.todo = todo;
5014+ result = node_plugin_by_node(node)->create_item
5015+ (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
5016+ &info);
5017+ doing->restartable = 0;
5018+ znode_make_dirty(node);
5019+
5020+ return result;
5021+}
5022+
5023+/*
5024+ * Flow insertion code. COP_INSERT_FLOW is a special tree operation that is
5025+ * supplied with a "flow" (that is, a stream of data) and inserts it into the
5026+ * tree by slicing it into multiple items.
5027+ */
5028+
5029+#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
5030+#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
5031+#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
5032+
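+/* extra bytes of node space consumed by item metadata: the difference between
+   the item plugin's insertion estimate and the raw data length (zero when the
+   plugin defines no ->estimate method) */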
5033+static size_t item_data_overhead(carry_op * op)
5034+{
5035+ if (flow_insert_data(op)->iplug->b.estimate == NULL)
5036+ return 0;
5037+ return (flow_insert_data(op)->iplug->b.
5038+ estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
5039+ flow_insert_data(op)->length);
5040+}
5041+
5042+/* FIXME-VS: this is called several times during one make_flow_for_insertion
5043+ and it will always return the same result. Some optimization could be made
5044+ by calculating this value once at the beginning and passing it around. That
5045+   would reduce some flexibility in future changes.
5046+*/
5047+static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
5048+static size_t flow_insertion_overhead(carry_op * op)
5049+{
5050+ znode *node;
5051+ size_t insertion_overhead;
5052+
5053+ node = flow_insert_point(op)->node;
5054+ insertion_overhead = 0;
5055+ if (node->nplug->item_overhead &&
5056+ !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
5057+ flow_insert_data(op)))
5058+ insertion_overhead =
5059+ node->nplug->item_overhead(node, NULL) +
5060+ item_data_overhead(op);
5061+ return insertion_overhead;
5062+}
5063+
5064+/* how many bytes of the flow fit into the node */
5065+static int what_can_fit_into_node(carry_op * op)
5066+{
5067+ size_t free, overhead;
5068+
5069+ overhead = flow_insertion_overhead(op);
5070+ free = znode_free_space(flow_insert_point(op)->node);
5071+ if (free <= overhead)
5072+ return 0;
5073+ free -= overhead;
5074+	/* FIXME: flow->length is loff_t only to not get overflowed in case of expanding truncate */
5075+ if (free < op->u.insert_flow.flow->length)
5076+ return free;
5077+ return (int)op->u.insert_flow.flow->length;
5078+}
5079+
5080+/* in make_space_for_flow_insertion we need to check either whether the whole
5081+   flow fits into a node or whether a minimal fraction of the flow does */
5082+static int enough_space_for_whole_flow(carry_op * op)
5083+{
5084+ return (unsigned)what_can_fit_into_node(op) ==
5085+ op->u.insert_flow.flow->length;
5086+}
5087+
5088+#define MIN_FLOW_FRACTION 1
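+/* with MIN_FLOW_FRACTION == 1 this merely checks that at least one byte of
+   the flow fits into the node */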
5089+static int enough_space_for_min_flow_fraction(carry_op * op)
5090+{
5091+ assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5092+
5093+ return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
5094+}
5095+
5096+/* this returns 0 if the left neighbor was obtained successfully, everything
5097+   up to and including the insertion point was shifted into it, and the left
5098+   neighbor still has enough free space for a minimal fraction of the flow */
5099+static int
5100+make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5101+{
5102+ carry_node *left;
5103+ znode *orig;
5104+
5105+ left = find_left_neighbor(op, doing);
5106+ if (unlikely(IS_ERR(left))) {
5107+ warning("vs-899",
5108+ "make_space_by_shift_left: "
5109+ "error accessing left neighbor: %li", PTR_ERR(left));
5110+ return 1;
5111+ }
5112+ if (left == NULL)
5113+ /* left neighbor either does not exist or is unformatted
5114+ node */
5115+ return 1;
5116+
5117+ orig = flow_insert_point(op)->node;
5118+	/* try to shift the content of node @orig, from its head up to and
5119+	   including the insertion point, into the left neighbor */
5120+ carry_shift_data(LEFT_SIDE, flow_insert_point(op),
5121+ reiser4_carry_real(left), doing, todo,
5122+ 1 /* including insert point */);
5123+ if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
5124+ /* insertion point did not move */
5125+ return 1;
5126+ }
5127+
5128+ /* insertion point is set after last item in the node */
5129+ assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5130+
5131+ if (!enough_space_for_min_flow_fraction(op)) {
5132+ /* insertion point node does not have enough free space to put
5133+ even minimal portion of flow into it, therefore, move
5134+ insertion point back to orig node (before first item) */
5135+ coord_init_before_first_item(flow_insert_point(op), orig);
5136+ return 1;
5137+ }
5138+
5139+ /* part of flow is to be written to the end of node */
5140+ op->node = left;
5141+ return 0;
5142+}
5143+
5144+/* this returns 0 if the right neighbor was obtained successfully, everything
5145+   to the right of the insertion point was shifted to it, and the node got
5146+   enough free space for a minimal fraction of the flow */
5147+static int
5148+make_space_by_shift_right(carry_op * op, carry_level * doing,
5149+ carry_level * todo)
5150+{
5151+ carry_node *right;
5152+
5153+ right = find_right_neighbor(op, doing);
5154+ if (unlikely(IS_ERR(right))) {
5155+ warning("nikita-1065", "shift_right_excluding_insert_point: "
5156+ "error accessing right neighbor: %li", PTR_ERR(right));
5157+ return 1;
5158+ }
5159+ if (right) {
5160+ /* shift everything possible on the right of but excluding
5161+ insertion coord into the right neighbor */
5162+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5163+ reiser4_carry_real(right), doing, todo,
5164+ 0 /* not including insert point */);
5165+ } else {
5166+ /* right neighbor either does not exist or is unformatted
5167+ node */
5168+ ;
5169+ }
5170+ if (coord_is_after_rightmost(flow_insert_point(op))) {
5171+ if (enough_space_for_min_flow_fraction(op)) {
5172+ /* part of flow is to be written to the end of node */
5173+ return 0;
5174+ }
5175+ }
5176+
5177+ /* new node is to be added if insert point node did not get enough
5178+ space for whole flow */
5179+ return 1;
5180+}
5181+
5182+/* this returns 0 when the insert coord is set at the node end and a fraction
5183+   of the flow fits into that node */
5184+static int
5185+make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5186+{
5187+ int result;
5188+ znode *node;
5189+ carry_node *new;
5190+
5191+ node = flow_insert_point(op)->node;
5192+
5193+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5194+ return RETERR(-E_NODE_FULL);
5195+ /* add new node after insert point node */
5196+ new = add_new_znode(node, op->node, doing, todo);
5197+ if (unlikely(IS_ERR(new))) {
5198+ return PTR_ERR(new);
5199+ }
5200+ result = lock_carry_node(doing, new);
5201+ zput(reiser4_carry_real(new));
5202+ if (unlikely(result)) {
5203+ return result;
5204+ }
5205+ op->u.insert_flow.new_nodes++;
5206+ if (!coord_is_after_rightmost(flow_insert_point(op))) {
5207+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5208+ reiser4_carry_real(new), doing, todo,
5209+ 0 /* not including insert point */);
5210+ assert("vs-901",
5211+ coord_is_after_rightmost(flow_insert_point(op)));
5212+
5213+ if (enough_space_for_min_flow_fraction(op)) {
5214+ return 0;
5215+ }
5216+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5217+ return RETERR(-E_NODE_FULL);
5218+
5219+ /* add one more new node */
5220+ new = add_new_znode(node, op->node, doing, todo);
5221+ if (unlikely(IS_ERR(new))) {
5222+ return PTR_ERR(new);
5223+ }
5224+ result = lock_carry_node(doing, new);
5225+ zput(reiser4_carry_real(new));
5226+ if (unlikely(result)) {
5227+ return result;
5228+ }
5229+ op->u.insert_flow.new_nodes++;
5230+ }
5231+
5232+ /* move insertion point to new node */
5233+ coord_init_before_first_item(flow_insert_point(op),
5234+ reiser4_carry_real(new));
5235+ op->node = new;
5236+ return 0;
5237+}
5238+
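+/* make space by escalation: try to shift data into the left neighbor, then
+   into the right neighbor, and finally allocate new nodes, re-checking after
+   each step whether the whole flow now fits into the insert point node */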
5239+static int
5240+make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5241+ carry_level * todo)
5242+{
5243+ __u32 flags = op->u.insert_flow.flags;
5244+
5245+ if (enough_space_for_whole_flow(op)) {
5246+ /* whole flow fits into insert point node */
5247+ return 0;
5248+ }
5249+
5250+ if (!(flags & COPI_DONT_SHIFT_LEFT)
5251+ && (make_space_by_shift_left(op, doing, todo) == 0)) {
5252+ /* insert point is shifted to left neighbor of original insert
5253+ point node and is set after last unit in that node. It has
5254+ enough space to fit at least minimal fraction of flow. */
5255+ return 0;
5256+ }
5257+
5258+ if (enough_space_for_whole_flow(op)) {
5259+ /* whole flow fits into insert point node */
5260+ return 0;
5261+ }
5262+
5263+ if (!(flags & COPI_DONT_SHIFT_RIGHT)
5264+ && (make_space_by_shift_right(op, doing, todo) == 0)) {
5265+ /* insert point is still set to the same node, but there is
5266+ nothing to the right of insert point. */
5267+ return 0;
5268+ }
5269+
5270+ if (enough_space_for_whole_flow(op)) {
5271+ /* whole flow fits into insert point node */
5272+ return 0;
5273+ }
5274+
5275+ return make_space_by_new_nodes(op, doing, todo);
5276+}
5277+
5278+/* implements COP_INSERT_FLOW operation */
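+/* the loop below runs until the whole flow is written: each iteration makes
+   space, slices off as much of the flow as fits into the insert point node,
+   pastes it into an existing item when possible (see can_paste()) or creates
+   a new item otherwise, and then advances the flow past the written bytes */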
5279+static int
5280+carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5281+{
5282+ int result;
5283+ flow_t *f;
5284+ coord_t *insert_point;
5285+ node_plugin *nplug;
5286+ carry_plugin_info info;
5287+ znode *orig_node;
5288+ lock_handle *orig_lh;
5289+
5290+ f = op->u.insert_flow.flow;
5291+ result = 0;
5292+
5293+ /* carry system needs this to work */
5294+ info.doing = doing;
5295+ info.todo = todo;
5296+
5297+ orig_node = flow_insert_point(op)->node;
5298+ orig_lh = doing->tracked;
5299+
5300+ while (f->length) {
5301+ result = make_space_for_flow_insertion(op, doing, todo);
5302+ if (result)
5303+ break;
5304+
5305+ insert_point = flow_insert_point(op);
5306+ nplug = node_plugin_by_node(insert_point->node);
5307+
5308+ /* compose item data for insertion/pasting */
5309+ flow_insert_data(op)->data = f->data;
5310+ flow_insert_data(op)->length = what_can_fit_into_node(op);
5311+
5312+ if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5313+			/* insert point is set to an item of the file we are writing to, and we have to append to it */
5314+ assert("vs-903", insert_point->between == AFTER_UNIT);
5315+ nplug->change_item_size(insert_point,
5316+ flow_insert_data(op)->length);
5317+ flow_insert_data(op)->iplug->b.paste(insert_point,
5318+ flow_insert_data
5319+ (op), &info);
5320+ } else {
5321+ /* new item must be inserted */
5322+ pos_in_node_t new_pos;
5323+ flow_insert_data(op)->length += item_data_overhead(op);
5324+
5325+ /* FIXME-VS: this is because node40_create_item changes
5326+ insert_point for obscure reasons */
5327+ switch (insert_point->between) {
5328+ case AFTER_ITEM:
5329+ new_pos = insert_point->item_pos + 1;
5330+ break;
5331+ case EMPTY_NODE:
5332+ new_pos = 0;
5333+ break;
5334+ case BEFORE_ITEM:
5335+ assert("vs-905", insert_point->item_pos == 0);
5336+ new_pos = 0;
5337+ break;
5338+ default:
5339+ impossible("vs-906",
5340+ "carry_insert_flow: invalid coord");
5341+ new_pos = 0;
5342+ break;
5343+ }
5344+
5345+ nplug->create_item(insert_point, &f->key,
5346+ flow_insert_data(op), &info);
5347+ coord_set_item_pos(insert_point, new_pos);
5348+ }
5349+ coord_init_after_item_end(insert_point);
5350+ doing->restartable = 0;
5351+ znode_make_dirty(insert_point->node);
5352+
5353+ move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5354+ }
5355+
5356+ if (orig_node != flow_insert_point(op)->node) {
5357+ /* move lock to new insert point */
5358+ done_lh(orig_lh);
5359+ init_lh(orig_lh);
5360+ result =
5361+ longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5362+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5363+ }
5364+
5365+ return result;
5366+}
5367+
5368+/* implements COP_DELETE operation
5369+
5370+   Remove the pointer to @op->u.delete.child from its parent.
5371+
5372+   This function also handles killing of the tree root if the last pointer
5373+   was removed from it. This is complicated by our handling of the "twig"
5374+   level: a root at the twig level is never killed.
5375+
5376+*/
5377+static int carry_delete(carry_op * op /* operation to be performed */ ,
5378+ carry_level * doing UNUSED_ARG /* current carry
5379+ * level */ ,
5380+ carry_level * todo /* next carry level */ )
5381+{
5382+ int result;
5383+ coord_t coord;
5384+ coord_t coord2;
5385+ znode *parent;
5386+ znode *child;
5387+ carry_plugin_info info;
5388+ reiser4_tree *tree;
5389+
5390+ /*
5391+ * This operation is called to delete internal item pointing to the
5392+ * child node that was removed by carry from the tree on the previous
5393+ * tree level.
5394+ */
5395+
5396+ assert("nikita-893", op != NULL);
5397+ assert("nikita-894", todo != NULL);
5398+ assert("nikita-895", op->op == COP_DELETE);
5399+
5400+ coord_init_zero(&coord);
5401+ coord_init_zero(&coord2);
5402+
5403+ parent = reiser4_carry_real(op->node);
5404+ child = op->u.delete.child ?
5405+ reiser4_carry_real(op->u.delete.child) : op->node->node;
5406+ tree = znode_get_tree(child);
5407+ read_lock_tree(tree);
5408+
5409+ /*
5410+ * @parent was determined when carry entered parent level
5411+ * (lock_carry_level/lock_carry_node). Since then, actual parent of
5412+ * @child node could change due to other carry operations performed on
5413+ * the parent level. Check for this.
5414+ */
5415+
5416+ if (znode_parent(child) != parent) {
5417+ /* NOTE-NIKITA add stat counter for this. */
5418+ parent = znode_parent(child);
5419+ assert("nikita-2581", find_carry_node(doing, parent));
5420+ }
5421+ read_unlock_tree(tree);
5422+
5423+ assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5424+
5425+	/* Twig level horrors: the tree should be of height at least 2. So, the
5426+	   last pointer from the root at the twig level is preserved even if the
5427+	   child is empty. This is ugly, but that is how it was architected.
5428+	 */
5429+
5430+ if (znode_is_root(parent) &&
5431+ znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5432+ node_num_items(parent) == 1) {
5433+ /* Delimiting key manipulations. */
5434+ write_lock_dk(tree);
5435+ znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
5436+ znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
5437+ ZF_SET(child, JNODE_DKSET);
5438+ write_unlock_dk(tree);
5439+
5440+ /* @child escaped imminent death! */
5441+ ZF_CLR(child, JNODE_HEARD_BANSHEE);
5442+ return 0;
5443+ }
5444+
5445+ /* convert child pointer to the coord_t */
5446+ result = find_child_ptr(parent, child, &coord);
5447+ if (result != NS_FOUND) {
5448+ warning("nikita-994", "Cannot find child pointer: %i", result);
5449+ print_coord_content("coord", &coord);
5450+ return result;
5451+ }
5452+
5453+ coord_dup(&coord2, &coord);
5454+ info.doing = doing;
5455+ info.todo = todo;
5456+ {
5457+ /*
5458+ * Actually kill internal item: prepare structure with
5459+ * arguments for ->cut_and_kill() method...
5460+ */
5461+
5462+ struct carry_kill_data kdata;
5463+ kdata.params.from = &coord;
5464+ kdata.params.to = &coord2;
5465+ kdata.params.from_key = NULL;
5466+ kdata.params.to_key = NULL;
5467+ kdata.params.smallest_removed = NULL;
5468+ kdata.params.truncate = 1;
5469+ kdata.flags = op->u.delete.flags;
5470+ kdata.inode = NULL;
5471+ kdata.left = NULL;
5472+ kdata.right = NULL;
5473+ kdata.buf = NULL;
5474+ /* ... and call it. */
5475+ result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5476+ &info);
5477+ }
5478+ doing->restartable = 0;
5479+
5480+ /* check whether root should be killed violently */
5481+ if (znode_is_root(parent) &&
5482+	    /* don't kill roots at or below the twig level */
5483+ znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5484+ node_num_items(parent) == 1) {
5485+ result = reiser4_kill_tree_root(coord.node);
5486+ }
5487+
5488+ return result < 0 ? : 0;
5489+}
5490+
5491+/* implements COP_CUT operation
5492+
5493+   Cuts part or the whole content of a node.
5494+
5495+*/
5496+static int carry_cut(carry_op * op /* operation to be performed */ ,
5497+ carry_level * doing /* current carry level */ ,
5498+ carry_level * todo /* next carry level */ )
5499+{
5500+ int result;
5501+ carry_plugin_info info;
5502+ node_plugin *nplug;
5503+
5504+ assert("nikita-896", op != NULL);
5505+ assert("nikita-897", todo != NULL);
5506+ assert("nikita-898", op->op == COP_CUT);
5507+
5508+ info.doing = doing;
5509+ info.todo = todo;
5510+
5511+ nplug = node_plugin_by_node(reiser4_carry_real(op->node));
5512+ if (op->u.cut_or_kill.is_cut)
5513+ result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5514+ else
5515+ result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5516+
5517+ doing->restartable = 0;
5518+ return result < 0 ? : 0;
5519+}
5520+
5521+/* helper function for carry_paste(): returns true if @op can be continued as
5522+ paste */
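+/* three cases are distinguished below: the insertion point is in the middle
+   of an item (always paste); it is before an item (try to glue to the item on
+   the left); or it is after an item (try to glue to the item on the right,
+   provided that item's plugin defines ->can_contain_key at all) */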
5523+static int
5524+can_paste(coord_t * icoord, const reiser4_key * key,
5525+ const reiser4_item_data * data)
5526+{
5527+ coord_t circa;
5528+ item_plugin *new_iplug;
5529+ item_plugin *old_iplug;
5530+ int result = 0; /* to keep gcc shut */
5531+
5532+ assert("", icoord->between != AT_UNIT);
5533+
5534+	/* obviously, one cannot paste when the node is empty---there is
5535+	   nothing to paste into. */
5536+ if (node_is_empty(icoord->node))
5537+ return 0;
5538+ /* if insertion point is at the middle of the item, then paste */
5539+ if (!coord_is_between_items(icoord))
5540+ return 1;
5541+ coord_dup(&circa, icoord);
5542+ circa.between = AT_UNIT;
5543+
5544+ old_iplug = item_plugin_by_coord(&circa);
5545+ new_iplug = data->iplug;
5546+
5547+ /* check whether we can paste to the item @icoord is "at" when we
5548+ ignore ->between field */
5549+ if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
5550+ result = 1;
5551+ } else if (icoord->between == BEFORE_UNIT
5552+ || icoord->between == BEFORE_ITEM) {
5553+ /* otherwise, try to glue to the item at the left, if any */
5554+ coord_dup(&circa, icoord);
5555+ if (coord_set_to_left(&circa)) {
5556+ result = 0;
5557+ coord_init_before_item(icoord);
5558+ } else {
5559+ old_iplug = item_plugin_by_coord(&circa);
5560+ result = (old_iplug == new_iplug)
5561+ && item_can_contain_key(icoord, key, data);
5562+ if (result) {
5563+ coord_dup(icoord, &circa);
5564+ icoord->between = AFTER_UNIT;
5565+ }
5566+ }
5567+ } else if (icoord->between == AFTER_UNIT
5568+ || icoord->between == AFTER_ITEM) {
5569+ coord_dup(&circa, icoord);
5570+ /* otherwise, try to glue to the item at the right, if any */
5571+ if (coord_set_to_right(&circa)) {
5572+ result = 0;
5573+ coord_init_after_item(icoord);
5574+ } else {
5575+ int (*cck) (const coord_t *, const reiser4_key *,
5576+ const reiser4_item_data *);
5577+
5578+ old_iplug = item_plugin_by_coord(&circa);
5579+
5580+ cck = old_iplug->b.can_contain_key;
5581+ if (cck == NULL)
5582+ /* item doesn't define ->can_contain_key
5583+ method? So it is not expandable. */
5584+ result = 0;
5585+ else {
5586+ result = (old_iplug == new_iplug)
5587+ && cck(&circa /*icoord */ , key, data);
5588+ if (result) {
5589+ coord_dup(icoord, &circa);
5590+ icoord->between = BEFORE_UNIT;
5591+ }
5592+ }
5593+ }
5594+ } else
5595+ impossible("nikita-2513", "Nothing works");
5596+ if (result) {
5597+ if (icoord->between == BEFORE_ITEM) {
5598+ assert("vs-912", icoord->unit_pos == 0);
5599+ icoord->between = BEFORE_UNIT;
5600+ } else if (icoord->between == AFTER_ITEM) {
5601+ coord_init_after_item_end(icoord);
5602+ }
5603+ }
5604+ return result;
5605+}
5606+
5607+/* implements COP_PASTE operation
5608+
5609+   Paste data into an existing item. This is complicated by the fact that
5610+   after we shifted something to the left or right neighbors trying to free
5611+   some space, the item we were supposed to paste into can be in a different
5612+   node than the insertion coord. If so, we are no longer doing a paste, but
5613+   an insert. See comments in insert_paste_common().
5614+
5615+*/
5616+static int carry_paste(carry_op * op /* operation to be performed */ ,
5617+ carry_level * doing UNUSED_ARG /* current carry
5618+ * level */ ,
5619+ carry_level * todo /* next carry level */ )
5620+{
5621+ znode *node;
5622+ carry_insert_data cdata;
5623+ coord_t dcoord;
5624+ reiser4_item_data data;
5625+ int result;
5626+ int real_size;
5627+ item_plugin *iplug;
5628+ carry_plugin_info info;
5629+ coord_t *coord;
5630+
5631+ assert("nikita-982", op != NULL);
5632+ assert("nikita-983", todo != NULL);
5633+ assert("nikita-984", op->op == COP_PASTE);
5634+
5635+ coord_init_zero(&dcoord);
5636+
5637+ result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
5638+ if (result != 0)
5639+ return result;
5640+
5641+ coord = op->u.insert.d->coord;
5642+
5643+	/* handle the case when op->u.insert.coord doesn't point to an item
5644+	   of the required type: restart as an insert. */
5645+ if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
5646+ op->op = COP_INSERT;
5647+ op->u.insert.type = COPT_PASTE_RESTARTED;
5648+ result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
5649+
5650+ return result;
5651+ }
5652+
5653+ node = coord->node;
5654+ iplug = item_plugin_by_coord(coord);
5655+ assert("nikita-992", iplug != NULL);
5656+
5657+ assert("nikita-985", node != NULL);
5658+ assert("nikita-986", node_plugin_by_node(node) != NULL);
5659+
5660+ assert("nikita-987",
5661+ space_needed_for_op(node, op) <= znode_free_space(node));
5662+
5663+ assert("nikita-1286", coord_is_existing_item(coord));
5664+
5665+ /*
5666+	 * if the item is expanded as a result of this operation, we should
5667+	 * first change the item size, then call the ->b.paste item method. If
5668+	 * the item is shrunk, it should be done the other way around: first
5669+	 * call the ->b.paste method, then reduce the item size.
5670+ */
5671+
5672+ real_size = space_needed_for_op(node, op);
5673+ if (real_size > 0)
5674+ node->nplug->change_item_size(coord, real_size);
5675+
5676+ doing->restartable = 0;
5677+ info.doing = doing;
5678+ info.todo = todo;
5679+
5680+ result = iplug->b.paste(coord, op->u.insert.d->data, &info);
5681+
5682+ if (real_size < 0)
5683+ node->nplug->change_item_size(coord, real_size);
5684+
5685+ /* if we pasted at the beginning of the item, update item's key. */
5686+ if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
5687+ node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
5688+
5689+ znode_make_dirty(node);
5690+ return result;
5691+}
5692+
5693+/* handle carry COP_EXTENT operation. */
5694+static int carry_extent(carry_op * op /* operation to perform */ ,
5695+ carry_level * doing /* queue of operations @op
5696+ * is part of */ ,
5697+ carry_level * todo /* queue where new operations
5698+ * are accumulated */ )
5699+{
5700+ znode *node;
5701+ carry_insert_data cdata;
5702+ coord_t coord;
5703+ reiser4_item_data data;
5704+ carry_op *delete_dummy;
5705+ carry_op *insert_extent;
5706+ int result;
5707+ carry_plugin_info info;
5708+
5709+ assert("nikita-1751", op != NULL);
5710+ assert("nikita-1752", todo != NULL);
5711+ assert("nikita-1753", op->op == COP_EXTENT);
5712+
5713+ /* extent insertion overview:
5714+
5715+     extents live on the TWIG LEVEL, which is the level one above the
5716+     leaf one. This complicates extent insertion logic somewhat: it may
5717+     happen (and is going to happen all the time) that in logical key
5718+     ordering an extent has to be placed between items I1 and I2, located
5719+     at the leaf level, but I1 and I2 are in the same formatted leaf
5720+     node N1. To insert the extent one has to
5721+
5722+     (1) reach node N1 and shift data between N1, its neighbors and
5723+     possibly newly allocated nodes until I1 and I2 fall into different
5724+     nodes. Since I1 and I2 are still neighboring items in logical key
5725+     order, they will necessarily be the utmost items in their respective
5726+     nodes.
5727+
5728+     (2) After this, the new extent item is inserted into a node on the
5729+     twig level.
5730+
5731+ Fortunately this process can reuse almost all code from standard
5732+ insertion procedure (viz. make_space() and insert_paste_common()),
5733+ due to the following observation: make_space() only shifts data up
5734+ to and excluding or including insertion point. It never
5735+ "over-moves" through insertion point. Thus, one can use
5736+     make_space() to perform step (1). All that is required is to
5737+     instruct free_space_shortage() to keep make_space() shifting data
5738+     until the insertion point is at the node border.
5739+
5740+ */
5741+
5742+ /* perform common functionality of insert and paste. */
5743+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5744+ if (result != 0)
5745+ return result;
5746+
5747+ node = op->u.extent.d->coord->node;
5748+ assert("nikita-1754", node != NULL);
5749+ assert("nikita-1755", node_plugin_by_node(node) != NULL);
5750+ assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
5751+
5752+ /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
5753+ extent fits between items. */
5754+
5755+ info.doing = doing;
5756+ info.todo = todo;
5757+
5758+	/* there is another complication due to placement of extents on the
5759+	   twig level: extents are "rigid" in the sense that the key-range
5760+	   occupied by an extent cannot grow indefinitely to the right as it
5761+	   can for formatted leaf nodes. Because of this, when search finds two
5762+	   adjacent extents on the twig level, it has to "drill" to the leaf
5763+	   level, creating a new node. Here we are removing this node.
5764+ */
5765+ if (node_is_empty(node)) {
5766+ delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
5767+ if (IS_ERR(delete_dummy))
5768+ return PTR_ERR(delete_dummy);
5769+ delete_dummy->u.delete.child = NULL;
5770+ delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
5771+ ZF_SET(node, JNODE_HEARD_BANSHEE);
5772+ }
5773+
5774+ /* proceed with inserting extent item into parent. We are definitely
5775+ inserting rather than pasting if we get that far. */
5776+ insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
5777+ if (IS_ERR(insert_extent))
5778+ /* @delete_dummy will be automatically destroyed on the level
5779+ exiting */
5780+ return PTR_ERR(insert_extent);
5781+ /* NOTE-NIKITA insertion by key is simplest option here. Another
5782+ possibility is to insert on the left or right of already existing
5783+ item.
5784+ */
5785+ insert_extent->u.insert.type = COPT_KEY;
5786+ insert_extent->u.insert.d = op->u.extent.d;
5787+ assert("nikita-1719", op->u.extent.d->key != NULL);
5788+ insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
5789+ insert_extent->u.insert.flags =
5790+ znode_get_tree(node)->carry.new_extent_flags;
5791+
5792+ /*
5793+ * if carry was asked to track lock handle we should actually track
5794+ * lock handle on the twig node rather than on the leaf where
5795+ * operation was started from. Transfer tracked lock handle.
5796+ */
5797+ if (doing->track_type) {
5798+ assert("nikita-3242", doing->tracked != NULL);
5799+ assert("nikita-3244", todo->tracked == NULL);
5800+ todo->tracked = doing->tracked;
5801+ todo->track_type = CARRY_TRACK_NODE;
5802+ doing->tracked = NULL;
5803+ doing->track_type = 0;
5804+ }
5805+
5806+ return 0;
5807+}
5808+
5809+/* update key in @parent between pointers to @left and @right.
5810+
5811+ Find coords of @left and @right and update delimiting key between them.
5812+ This is helper function called by carry_update(). Finds position of
5813+ internal item involved. Updates item key. Updates delimiting keys of child
5814+ nodes involved.
5815+*/
5816+static int update_delimiting_key(znode * parent /* node key is updated
5817+ * in */ ,
5818+ znode * left /* child of @parent */ ,
5819+ znode * right /* child of @parent */ ,
5820+ carry_level * doing /* current carry
5821+ * level */ ,
5822+ carry_level * todo /* parent carry
5823+ * level */ ,
5824+ const char **error_msg /* place to
5825+ * store error
5826+ * message */ )
5827+{
5828+ coord_t left_pos;
5829+ coord_t right_pos;
5830+ int result;
5831+ reiser4_key ldkey;
5832+ carry_plugin_info info;
5833+
5834+ assert("nikita-1177", right != NULL);
5835+	/* find position of the right child in the parent */
5836+ result = find_child_ptr(parent, right, &right_pos);
5837+ if (result != NS_FOUND) {
5838+ *error_msg = "Cannot find position of right child";
5839+ return result;
5840+ }
5841+
5842+ if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
5843+ /* find position of the left child in a parent */
5844+ result = find_child_ptr(parent, left, &left_pos);
5845+ if (result != NS_FOUND) {
5846+ *error_msg = "Cannot find position of left child";
5847+ return result;
5848+ }
5849+ assert("nikita-1355", left_pos.node != NULL);
5850+ } else
5851+ left_pos.node = NULL;
5852+
5853+ /* check that they are separated by exactly one key and are basically
5854+ sane */
5855+ if (REISER4_DEBUG) {
5856+ if ((left_pos.node != NULL)
5857+ && !coord_is_existing_unit(&left_pos)) {
5858+ *error_msg = "Left child is bastard";
5859+ return RETERR(-EIO);
5860+ }
5861+ if (!coord_is_existing_unit(&right_pos)) {
5862+ *error_msg = "Right child is bastard";
5863+ return RETERR(-EIO);
5864+ }
5865+ if (left_pos.node != NULL &&
5866+ !coord_are_neighbors(&left_pos, &right_pos)) {
5867+ *error_msg = "Children are not direct siblings";
5868+ return RETERR(-EIO);
5869+ }
5870+ }
5871+ *error_msg = NULL;
5872+
5873+ info.doing = doing;
5874+ info.todo = todo;
5875+
5876+ /*
5877+ * If child node is not empty, new key of internal item is a key of
5878+ * leftmost item in the child node. If the child is empty, take its
5879+ * right delimiting key as a new key of the internal item. Precise key
5880+ * in the latter case is not important per se, because the child (and
5881+ * the internal item) are going to be killed shortly anyway, but we
5882+ * have to preserve correct order of keys in the parent node.
5883+ */
5884+
5885+ if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
5886+ leftmost_key_in_node(right, &ldkey);
5887+ else {
5888+ read_lock_dk(znode_get_tree(parent));
5889+ ldkey = *znode_get_rd_key(right);
5890+ read_unlock_dk(znode_get_tree(parent));
5891+ }
5892+ node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
5893+ doing->restartable = 0;
5894+ znode_make_dirty(parent);
5895+ return 0;
5896+}
5897+
5898+/* implements COP_UPDATE operation
5899+
5900+ Update delimiting keys.
5901+
5902+*/
5903+static int carry_update(carry_op * op /* operation to be performed */ ,
5904+ carry_level * doing /* current carry level */ ,
5905+ carry_level * todo /* next carry level */ )
5906+{
5907+ int result;
5908+ carry_node *missing UNUSED_ARG;
5909+ znode *left;
5910+ znode *right;
5911+ carry_node *lchild;
5912+ carry_node *rchild;
5913+ const char *error_msg;
5914+ reiser4_tree *tree;
5915+
5916+ /*
5917+	 * This operation is called to update the key of an internal item. This
5918+	 * is necessary when carry shifted or cut data on the child
5919+	 * level. Arguments of this operation are:
5920+ *
5921+ * @right --- child node. Operation should update key of internal
5922+ * item pointing to @right.
5923+ *
5924+ * @left --- left neighbor of @right. This parameter is optional.
5925+ */
5926+
5927+ assert("nikita-902", op != NULL);
5928+ assert("nikita-903", todo != NULL);
5929+ assert("nikita-904", op->op == COP_UPDATE);
5930+
5931+ lchild = op->u.update.left;
5932+ rchild = op->node;
5933+
5934+ if (lchild != NULL) {
5935+ assert("nikita-1001", lchild->parent);
5936+ assert("nikita-1003", !lchild->left);
5937+ left = reiser4_carry_real(lchild);
5938+ } else
5939+ left = NULL;
5940+
5941+ tree = znode_get_tree(rchild->node);
5942+ read_lock_tree(tree);
5943+ right = znode_parent(rchild->node);
5944+ read_unlock_tree(tree);
5945+
5946+ if (right != NULL) {
5947+ result = update_delimiting_key(right,
5948+ lchild ? lchild->node : NULL,
5949+ rchild->node,
5950+ doing, todo, &error_msg);
5951+ } else {
5952+ error_msg = "Cannot find node to update key in";
5953+ result = RETERR(-EIO);
5954+ }
5955+ /* operation will be reposted to the next level by the
5956+ ->update_item_key() method of node plugin, if necessary. */
5957+
5958+ if (result != 0) {
5959+ warning("nikita-999", "Error updating delimiting key: %s (%i)",
5960+ error_msg ? : "", result);
5961+ }
5962+ return result;
5963+}
5964+
5965+/* move items from @node during carry */
5966+static int carry_shift_data(sideof side /* in what direction to move data */ ,
5967+ coord_t * insert_coord /* coord where new item
5968+ * is to be inserted */ ,
5969+ znode * node /* node which data are moved from */ ,
5970+ carry_level * doing /* active carry queue */ ,
5971+ carry_level * todo /* carry queue where new
5972+ * operations are to be put
5973+ * in */ ,
5974+ unsigned int including_insert_coord_p /* true if
5975+ * @insertion_coord
5976+ * can be moved */ )
5977+{
5978+ int result;
5979+ znode *source;
5980+ carry_plugin_info info;
5981+ node_plugin *nplug;
5982+
5983+ source = insert_coord->node;
5984+
5985+ info.doing = doing;
5986+ info.todo = todo;
5987+
5988+ nplug = node_plugin_by_node(node);
5989+ result = nplug->shift(insert_coord, node,
5990+ (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
5991+ (int)including_insert_coord_p, &info);
5992+ /* the only error ->shift() method of node plugin can return is
5993+ -ENOMEM due to carry node/operation allocation. */
5994+ assert("nikita-915", result >= 0 || result == -ENOMEM);
5995+ if (result > 0) {
5996+ /*
5997+ * if some number of bytes was actually shifted, mark nodes
5998+ * dirty, and carry level as non-restartable.
5999+ */
6000+ doing->restartable = 0;
6001+ znode_make_dirty(source);
6002+ znode_make_dirty(node);
6003+ }
6004+
6005+ assert("nikita-2077", coord_check(insert_coord));
6006+ return 0;
6007+}
6008+
6009+typedef carry_node *(*carry_iterator) (carry_node * node);
6010+static carry_node *find_dir_carry(carry_node * node, carry_level * level,
6011+ carry_iterator iterator);
6012+
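+/* step to the previous carry node in the doubly-linked list of nodes on this
+   carry level */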
6013+static carry_node *pool_level_list_prev(carry_node *node)
6014+{
6015+ return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
6016+}
6017+
6018+/* look for the left neighbor of given carry node in a carry queue.
6019+
6020+ This is used by find_left_neighbor(), but I am not sure that this
6021+ really gives any advantage. More statistics required.
6022+
6023+*/
6024+carry_node *find_left_carry(carry_node * node /* node to find left neighbor
6025+ * of */ ,
6026+ carry_level * level /* level to scan */ )
6027+{
6028+ return find_dir_carry(node, level,
6029+ (carry_iterator) pool_level_list_prev);
6030+}
6031+
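+/* step to the next carry node in the doubly-linked list of nodes on this
+   carry level */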
6032+static carry_node *pool_level_list_next(carry_node *node)
6033+{
6034+ return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
6035+}
6036+
6037+/* look for the right neighbor of given carry node in a
6038+ carry queue.
6039+
6040+ This is used by find_right_neighbor(), but I am not sure that this
6041+ really gives any advantage. More statistics required.
6042+
6043+*/
6044+carry_node *find_right_carry(carry_node * node /* node to find right neighbor
6045+ * of */ ,
6046+ carry_level * level /* level to scan */ )
6047+{
6048+ return find_dir_carry(node, level,
6049+ (carry_iterator) pool_level_list_next);
6050+}
6051+
6052+/* look for the left or right neighbor of given carry node in a carry
6053+ queue.
6054+
6055+ Helper function used by find_{left|right}_carry().
6056+*/
6057+static carry_node *find_dir_carry(carry_node * node /* node to start scanning
6058+ * from */ ,
6059+ carry_level * level /* level to scan */ ,
6060+ carry_iterator iterator /* operation to
6061+ * move to the next
6062+ * node */ )
6063+{
6064+ carry_node *neighbor;
6065+
6066+ assert("nikita-1059", node != NULL);
6067+ assert("nikita-1060", level != NULL);
6068+
6069+	/* scan the list of carry nodes on this level dir-ward, skipping all
6070+ carry nodes referencing the same znode. */
6071+ neighbor = node;
6072+ while (1) {
6073+ neighbor = iterator(neighbor);
6074+ if (carry_node_end(level, neighbor))
6075+ /* list head is reached */
6076+ return NULL;
6077+ if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
6078+ return neighbor;
6079+ }
6080+}
6081+
6082+/*
6083+ * Memory reservation estimation.
6084+ *
6085+ * Carry process proceeds through tree levels upwards. Carry assumes that it
6086+ * takes tree in consistent state (e.g., that search tree invariants hold),
6087+ * and leaves tree consistent after it finishes. This means that when some
6088+ * error occurs carry cannot simply return if there are pending carry
6089+ * operations. Generic solution for this problem is carry-undo either as
6090+ * transaction manager feature (requiring checkpoints and isolation), or
6091+ * through some carry specific mechanism.
6092+ *
6093+ * Our current approach is to panic if carry hits an error while tree is
6094+ * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6095+ * this "memory reservation" mechanism was added.
6096+ *
6097+ * Memory reservation is implemented by perthread-pages.diff patch from
6098+ * core-patches. Its API is defined in <linux/gfp.h>
6099+ *
6100+ * int perthread_pages_reserve(int nrpages, gfp_t gfp);
6101+ * void perthread_pages_release(int nrpages);
6102+ * int perthread_pages_count(void);
6103+ *
6104+ * carry estimates its worst case memory requirements at entry, reserves
6105+ * enough memory, and releases unused pages before returning.
6106+ *
6107+ * Code below estimates worst case memory requirements for a given carry
6108+ * queue. This is done by summing worst case memory requirements for each
6109+ * operation in the queue.
6110+ *
6111+ */
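+/*
+ * A rough usage sketch of that API (do_carry() stands in for the actual carry
+ * entry point, and a zero return from the reserve call is assumed to mean
+ * success):
+ *
+ *	if (perthread_pages_reserve(estimate, GFP_KERNEL) == 0) {
+ *		do_carry();
+ *		perthread_pages_release(perthread_pages_count());
+ *	}
+ */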
6112+
6113+/*
6114+ * Memory requirements of many operations depend on the tree height. For
6115+ * example, item insertion requires a new node to be inserted at each tree
6116+ * level in the worst case. What tree height should be used for estimation?
6117+ * The current tree height is wrong, because the tree height can change
6118+ * between the time when the estimation was done and the time when the
6119+ * operation is actually performed. The maximal possible tree height
6120+ * (REISER4_MAX_ZTREE_HEIGHT) is also undesirable, because it would lead to
6121+ * huge over-estimation all the time. A plausible solution is a "capped tree
6122+ * height": if the current tree height is less than some TREE_HEIGHT_CAP
6123+ * constant, the capped tree height is TREE_HEIGHT_CAP; otherwise it is the
6124+ * current tree height. The idea is that if the tree height is TREE_HEIGHT_CAP
6125+ * or larger, it is extremely unlikely to grow even more in a short interval.
6126+ */
6127+#define TREE_HEIGHT_CAP (5)
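+/* e.g., a tree of height 3 is estimated as if it were 5 levels tall, while a
+   tree of height 7 is estimated at its real height */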
6128+
6129+/* return capped tree height for the @tree. See comment above. */
6130+static int cap_tree_height(reiser4_tree * tree)
6131+{
6132+ return max_t(int, tree->height, TREE_HEIGHT_CAP);
6133+}
6134+
6135+/* return capped tree height for the current tree. */
6136+static int capped_height(void)
6137+{
6138+ return cap_tree_height(current_tree);
6139+}
6140+
6141+/* return number of pages required to store given number of bytes */
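+/* e.g., with 4KiB pages: bytes_to_pages(1) == 1, bytes_to_pages(4097) == 2 */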
6142+static int bytes_to_pages(int bytes)
6143+{
6144+ return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6145+}
6146+
6147+/* how many pages are required to allocate znodes during item insertion. */
6148+static int carry_estimate_znodes(void)
6149+{
6150+ /*
6151+	 * Note that we have a problem here: there is no way to
6152+	 * reserve pages specifically for a given slab. This means that
6153+ * these pages can be hijacked for some other end.
6154+ */
6155+
6156+	/* in the worst case we need 3 new znodes on each tree level */
6157+ return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6158+}
6159+
6160+/*
6161+ * how many pages are required to load bitmaps. One bitmap per level.
6162+ */
6163+static int carry_estimate_bitmaps(void)
6164+{
6165+ if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6166+ int bytes;
6167+
6168+		bytes = capped_height() * (0 +	/* bnode should be added, but it is private to
6169+ * bitmap.c, skip for now. */
6170+ 2 * sizeof(jnode)); /* working and commit jnodes */
6171+ return bytes_to_pages(bytes) + 2; /* and their contents */
6172+ } else
6173+ /* bitmaps were pre-loaded during mount */
6174+ return 0;
6175+}
6176+
6177+/* worst case item insertion memory requirements */
6178+static int carry_estimate_insert(carry_op * op, carry_level * level)
6179+{
6180+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6181+ capped_height() + /* new block on each level */
6182+ 1 + /* and possibly extra new block at the leaf level */
6183+ 3; /* loading of leaves into memory */
6184+}
6185+
6186+/* worst case item deletion memory requirements */
6187+static int carry_estimate_delete(carry_op * op, carry_level * level)
6188+{
6189+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6190+ 3; /* loading of leaves into memory */
6191+}
6192+
6193+/* worst case tree cut memory requirements */
6194+static int carry_estimate_cut(carry_op * op, carry_level * level)
6195+{
6196+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6197+ 3; /* loading of leaves into memory */
6198+}
6199+
6200+/* worst case memory requirements of pasting into item */
6201+static int carry_estimate_paste(carry_op * op, carry_level * level)
6202+{
6203+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6204+ capped_height() + /* new block on each level */
6205+ 1 + /* and possibly extra new block at the leaf level */
6206+ 3; /* loading of leaves into memory */
6207+}
6208+
6209+/* worst case memory requirements of extent insertion */
6210+static int carry_estimate_extent(carry_op * op, carry_level * level)
6211+{
6212+ return carry_estimate_insert(op, level) + /* insert extent */
6213+ carry_estimate_delete(op, level); /* kill leaf */
6214+}
6215+
6216+/* worst case memory requirements of key update */
6217+static int carry_estimate_update(carry_op * op, carry_level * level)
6218+{
6219+ return 0;
6220+}
6221+
6222+/* worst case memory requirements of flow insertion */
6223+static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6224+{
6225+ int newnodes;
6226+
6227+ newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6228+ CARRY_FLOW_NEW_NODES_LIMIT);
6229+ /*
6230+ * roughly estimate insert_flow as a sequence of insertions.
6231+ */
6232+ return newnodes * carry_estimate_insert(op, level);
6233+}
6234+
6235+/* This is dispatch table for carry operations. It can be trivially
6236+   abstracted into a useful plugin: a tunable balancing policy is a good
6237+ thing. */
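+/* carry dispatches through this table; see, e.g., the paste-to-insert restart
+   in carry_paste() above, which invokes
+   op_dispatch_table[COP_INSERT].handler(op, doing, todo) directly */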
6238+carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6239+ [COP_INSERT] = {
6240+ .handler = carry_insert,
6241+ .estimate = carry_estimate_insert}
6242+ ,
6243+ [COP_DELETE] = {
6244+ .handler = carry_delete,
6245+ .estimate = carry_estimate_delete}
6246+ ,
6247+ [COP_CUT] = {
6248+ .handler = carry_cut,
6249+ .estimate = carry_estimate_cut}
6250+ ,
6251+ [COP_PASTE] = {
6252+ .handler = carry_paste,
6253+ .estimate = carry_estimate_paste}
6254+ ,
6255+ [COP_EXTENT] = {
6256+ .handler = carry_extent,
6257+ .estimate = carry_estimate_extent}
6258+ ,
6259+ [COP_UPDATE] = {
6260+ .handler = carry_update,
6261+ .estimate = carry_estimate_update}
6262+ ,
6263+ [COP_INSERT_FLOW] = {
6264+ .handler = carry_insert_flow,
6265+ .estimate = carry_estimate_insert_flow}
6266+};
6267+
6268+/* Make Linus happy.
6269+ Local variables:
6270+ c-indentation-style: "K&R"
6271+ mode-name: "LC"
6272+ c-basic-offset: 8
6273+ tab-width: 8
6274+ fill-column: 120
6275+ scroll-step: 1
6276+ End:
6277+*/
6278diff -urN linux-2.6.23.orig/fs/reiser4/carry_ops.h linux-2.6.23/fs/reiser4/carry_ops.h
6279--- linux-2.6.23.orig/fs/reiser4/carry_ops.h 1970-01-01 03:00:00.000000000 +0300
6280+++ linux-2.6.23/fs/reiser4/carry_ops.h 2007-12-04 16:49:30.000000000 +0300
6281@@ -0,0 +1,42 @@
6282+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6283+
6284+/* implementation of carry operations. See carry_ops.c for details. */
6285+
6286+#if !defined( __CARRY_OPS_H__ )
6287+#define __CARRY_OPS_H__
6288+
6289+#include "forward.h"
6290+#include "znode.h"
6291+#include "carry.h"
6292+
6293+/* carry operation handlers */
6294+typedef struct carry_op_handler {
6295+ /* perform operation */
6296+ int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6297+ /* estimate memory requirements for @op */
6298+ int (*estimate) (carry_op * op, carry_level * level);
6299+} carry_op_handler;
6300+
6301+/* This is dispatch table for carry operations. It can be trivially
6302+   abstracted into a useful plugin: a tunable balancing policy is a good
6303+ thing. */
6304+extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6305+
6306+unsigned int space_needed(const znode * node, const coord_t * coord,
6307+ const reiser4_item_data * data, int inserting);
6308+extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6309+extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6310+
6311+/* __CARRY_OPS_H__ */
6312+#endif
6313+
6314+/* Make Linus happy.
6315+ Local variables:
6316+ c-indentation-style: "K&R"
6317+ mode-name: "LC"
6318+ c-basic-offset: 8
6319+ tab-width: 8
6320+ fill-column: 120
6321+ scroll-step: 1
6322+ End:
6323+*/
6324diff -urN linux-2.6.23.orig/fs/reiser4/context.c linux-2.6.23/fs/reiser4/context.c
6325--- linux-2.6.23.orig/fs/reiser4/context.c 1970-01-01 03:00:00.000000000 +0300
6326+++ linux-2.6.23/fs/reiser4/context.c 2007-12-04 16:49:30.000000000 +0300
6327@@ -0,0 +1,288 @@
6328+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6329+
6330+/* Manipulation of reiser4_context */
6331+
6332+/*
6333+ * global context used during system call. Variable of this type is allocated
6334+ * global context used during a system call. A variable of this type is
6335+ * allocated on the stack at the beginning of the reiser4 part of the system
6336+ * call, and a pointer to it is stored in current->fs_context. This allows us
6337+ * to avoid passing pointers to the current transaction and current lockstack
6338+ * (both in one-to-one mapping with threads) all over the call chain.
6339+ * It's kind of like those global variables the prof used to tell you not to
6340+ * use in CS1, except thread-specific. ;-) Nikita, this was a good idea.
6341+ *
6342+ * In some situations it is desirable to have ability to enter reiser4_context
6343+ * more than once for the same thread (nested contexts). For example, there
6344+ * are some functions that can be called either directly from VFS/VM or from
6345+ * already active reiser4 context (->writepage, for example).
6346+ *
6347+ * In such situations the "child" context acts like a dummy: all activity is
6348+ * actually performed in the top level context, and get_current_context()
6349+ * always returns the top level context.
6350+ * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
6351+ * nested anyway.
6352+ *
6353+ * Note that there is an important difference between the way reiser4 uses
6354+ * ->fs_context and the way other file systems use it. Other file systems
6355+ * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
6356+ * (this is why ->fs_context was initially called ->journal_info). This means
6357+ * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
6358+ * to the file system, they assume that some transaction is already underway,
6359+ * and usually bail out, because starting a nested transaction would most
6360+ * likely lead to deadlock. This gives false positives with reiser4, because we
6361+ * set ->fs_context before starting transaction.
6362+ */
6363+
6364+#include "debug.h"
6365+#include "super.h"
6366+#include "context.h"
6367+
6368+#include <linux/writeback.h> /* balance_dirty_pages() */
6369+#include <linux/hardirq.h>
6370+
6371+static void _reiser4_init_context(reiser4_context * context,
6372+ struct super_block *super)
6373+{
6374+ memset(context, 0, sizeof(*context));
6375+
6376+ context->super = super;
6377+ context->magic = context_magic;
6378+ context->outer = current->journal_info;
6379+ current->journal_info = (void *)context;
6380+ context->nr_children = 0;
6381+ context->gfp_mask = GFP_KERNEL;
6382+
6383+ init_lock_stack(&context->stack);
6384+
6385+ reiser4_txn_begin(context);
6386+
6387+ /* initialize head of tap list */
6388+ INIT_LIST_HEAD(&context->taps);
6389+#if REISER4_DEBUG
6390+ context->task = current;
6391+#endif
6392+ grab_space_enable();
6393+}
6394+
6395+/* initialize context and bind it to the current thread
6396+
6397+ This function should be called at the beginning of reiser4 part of
6398+ syscall.
6399+*/
6400+reiser4_context * reiser4_init_context(struct super_block * super)
6401+{
6402+ reiser4_context *context;
6403+
6404+ assert("nikita-2662", !in_interrupt() && !in_irq());
6405+ assert("nikita-3357", super != NULL);
6406+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6407+
6408+ context = get_current_context_check();
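+	/* nested entry for the same superblock: reuse the top level context
+	   as a dummy child instead of allocating a new one */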
6409+ if (context && context->super == super) {
6410+ context = (reiser4_context *) current->journal_info;
6411+ context->nr_children++;
6412+ return context;
6413+ }
6414+
6415+ context = kmalloc(sizeof(*context), GFP_KERNEL);
6416+ if (context == NULL)
6417+ return ERR_PTR(RETERR(-ENOMEM));
6418+
6419+ _reiser4_init_context(context, super);
6420+ return context;
6421+}
6422+
6423+/* this is used in scan_mgr, which is called with a spinlock held, and in
6424+ reiser4_fill_super magic */
6425+void init_stack_context(reiser4_context *context, struct super_block *super)
6426+{
6427+ assert("nikita-2662", !in_interrupt() && !in_irq());
6428+ assert("nikita-3357", super != NULL);
6429+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6430+ assert("vs-12", !is_in_reiser4_context());
6431+
6432+ _reiser4_init_context(context, super);
6433+ context->on_stack = 1;
6434+ return;
6435+}
6436+
6437+/* cast lock stack embedded into reiser4 context up to its container */
6438+reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6439+{
6440+ return container_of(owner, reiser4_context, stack);
6441+}
6442+
6443+/* true if there is already _any_ reiser4 context for the current thread */
6444+int is_in_reiser4_context(void)
6445+{
6446+ reiser4_context *ctx;
6447+
6448+ ctx = current->journal_info;
6449+ return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6450+}
6451+
6452+/*
6453+ * call balance dirty pages for the current context.
6454+ *
6455+ * File system is expected to call balance_dirty_pages_ratelimited() whenever
6456+ * it dirties a page. reiser4 does this for unformatted nodes (that is, during
6457+ * write---this covers the vast majority of all dirty traffic), but we cannot do
6458+ * this immediately when a formatted node is dirtied, because a long term lock is
6459+ * usually held at that time. To work around this, dirtying of formatted node
6460+ * simply increases ->nr_marked_dirty counter in the current reiser4
6461+ * context. When we are about to leave this context,
6462+ * balance_dirty_pages_ratelimited() is called, if necessary.
6463+ *
6464+ * This introduces another problem: sometimes we do not want to run
6465+ * balance_dirty_pages_ratelimited() when leaving a context, for example
6466+ * because some important lock (like ->i_mutex on the parent directory) is
6467+ * held. To achieve this, ->nobalance flag can be set in the current context.
6468+ */
6469+static void balance_dirty_pages_at(reiser4_context *context)
6470+{
6471+ reiser4_super_info_data *sbinfo = get_super_private(context->super);
6472+
6473+ /*
6474+ * call balance_dirty_pages_ratelimited() to process formatted nodes
6475+ * dirtied during this system call. Do that only if we are not in mount
6476+ * and there were nodes dirtied in this context and we are not in
6477+ * writepage (to avoid deadlock) and not in pdflush
6478+ */
6479+ if (sbinfo != NULL && sbinfo->fake != NULL &&
6480+ context->nr_marked_dirty != 0 &&
6481+ !(current->flags & PF_MEMALLOC) &&
6482+ !current_is_pdflush())
6483+ balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
6484+}
6485+
6486+/* release resources associated with context.
6487+
6488+ This function should be called at the end of "session" with reiser4,
6489+ typically just before leaving reiser4 driver back to VFS.
6490+
6491+   This is a good place to put some debugging consistency checks, such as
6492+   that the thread released all locks and closed its transcrash, etc.
6493+
6494+*/
6495+static void reiser4_done_context(reiser4_context * context /* context being released */ )
6496+{
6497+ assert("nikita-860", context != NULL);
6498+ assert("nikita-859", context->magic == context_magic);
6499+ assert("vs-646", (reiser4_context *) current->journal_info == context);
6500+ assert("zam-686", !in_interrupt() && !in_irq());
6501+
6502+ /* only do anything when leaving top-level reiser4 context. All nested
6503+ * contexts are just dummies. */
6504+ if (context->nr_children == 0) {
6505+ assert("jmacd-673", context->trans == NULL);
6506+ assert("jmacd-1002", lock_stack_isclean(&context->stack));
6507+ assert("nikita-1936", reiser4_no_counters_are_held());
6508+ assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
6509+ assert("zam-1004", ergo(get_super_private(context->super),
6510+ get_super_private(context->super)->delete_mutex_owner !=
6511+ current));
6512+
6513+ /* release all grabbed but as yet unused blocks */
6514+ if (context->grabbed_blocks != 0)
6515+ all_grabbed2free();
6516+
6517+ /*
6518+ * synchronize against longterm_unlock_znode():
6519+ * wake_up_requestor() wakes up requestors without holding
6520+ * zlock (otherwise they will immediately bump into that lock
6521+ * after wake up on another CPU). To work around the (rare)
6522+ * situation where a requestor has been woken up asynchronously
6523+ * and managed to run until completion (and destroy its
6524+ * context and lock stack) before wake_up_requestor() called
6525+ * wake_up() on it, wake_up_requestor() synchronizes on the lock
6526+ * stack spin lock. It has actually been observed that the spin
6527+ * lock _was_ locked at this point, because
6528+ * wake_up_requestor() took an interrupt.
6529+ */
6530+ spin_lock_stack(&context->stack);
6531+ spin_unlock_stack(&context->stack);
6532+
6533+ assert("zam-684", context->nr_children == 0);
6534+ /* restore original ->journal_info value */
6535+ current->journal_info = context->outer;
6536+ if (context->on_stack == 0)
6537+ kfree(context);
6538+ } else {
6539+ context->nr_children--;
6540+#if REISER4_DEBUG
6541+ assert("zam-685", context->nr_children >= 0);
6542+#endif
6543+ }
6544+}
6545+
6546+/*
6547+ * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
6548+ * transaction. Call done_context() to do context related book-keeping.
6549+ */
6550+void reiser4_exit_context(reiser4_context * context)
6551+{
6552+ assert("nikita-3021", reiser4_schedulable());
6553+
6554+ if (context->nr_children == 0) {
6555+ if (!context->nobalance) {
6556+ reiser4_txn_restart(context);
6557+ balance_dirty_pages_at(context);
6558+ }
6559+
6560+ /* if the filesystem is mounted with -o sync or -o dirsync, commit
6561+ the transaction. FIXME: TXNH_DONT_COMMIT is used to avoid
6562+ committing on exit_context when the inode semaphore is held,
6563+ and to have ktxnmgrd do the commit instead, for better
6564+ concurrent filesystem access. But whoever mounts with -o
6565+ sync cares more about reliability than about
6566+ performance. So, for now we have this simple mount -o sync
6567+ support. */
6568+ if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
6569+ txn_atom *atom;
6570+
6571+ atom = get_current_atom_locked_nocheck();
6572+ if (atom) {
6573+ atom->flags |= ATOM_FORCE_COMMIT;
6574+ context->trans->flags &= ~TXNH_DONT_COMMIT;
6575+ spin_unlock_atom(atom);
6576+ }
6577+ }
6578+ reiser4_txn_end(context);
6579+ }
6580+ reiser4_done_context(context);
6581+}
6582+
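+/* Editorial sketch, not part of the original patch: the typical "session"
+ * pattern around the two functions above. example_entry_point() is
+ * hypothetical; reiser4_init_context() is declared in context.h and is
+ * assumed to return an ERR_PTR value on failure.
+ */
+#if 0
+static int example_entry_point(struct super_block *sb)
+{
+	reiser4_context *ctx;
+
+	ctx = reiser4_init_context(sb);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+	/* ... tree operations; a transaction handle is embedded in *ctx ... */
+	reiser4_exit_context(ctx);	/* balances dirty pages, ends the txn */
+	return 0;
+}
+#endif
+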
6583+void reiser4_ctx_gfp_mask_set(void)
6584+{
6585+ reiser4_context *ctx;
6586+
6587+ ctx = get_current_context();
6588+ if (ctx->entd == 0 &&
6589+ list_empty(&ctx->stack.locks) &&
6590+ ctx->trans->atom == NULL)
6591+ ctx->gfp_mask = GFP_KERNEL;
6592+ else
6593+ ctx->gfp_mask = GFP_NOFS;
6594+}
6595+
6596+void reiser4_ctx_gfp_mask_force (gfp_t mask)
6597+{
6598+ reiser4_context *ctx;
6599+ ctx = get_current_context();
6600+
6601+ assert("edward-1454", ctx != NULL);
6602+
6603+ ctx->gfp_mask = mask;
6604+}
6605+
6606+/*
6607+ * Local variables:
6608+ * c-indentation-style: "K&R"
6609+ * mode-name: "LC"
6610+ * c-basic-offset: 8
6611+ * tab-width: 8
6612+ * fill-column: 120
6613+ * scroll-step: 1
6614+ * End:
6615+ */
6616diff -urN linux-2.6.23.orig/fs/reiser4/context.h linux-2.6.23/fs/reiser4/context.h
6617--- linux-2.6.23.orig/fs/reiser4/context.h 1970-01-01 03:00:00.000000000 +0300
6618+++ linux-2.6.23/fs/reiser4/context.h 2007-12-04 16:49:30.000000000 +0300
6619@@ -0,0 +1,228 @@
6620+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
6621+ * reiser4/README */
6622+
6623+/* Reiser4 context. See context.c for details. */
6624+
6625+#if !defined( __REISER4_CONTEXT_H__ )
6626+#define __REISER4_CONTEXT_H__
6627+
6628+#include "forward.h"
6629+#include "debug.h"
6630+#include "dformat.h"
6631+#include "tap.h"
6632+#include "lock.h"
6633+
6634+#include <linux/types.h> /* for __u?? */
6635+#include <linux/fs.h> /* for struct super_block */
6636+#include <linux/spinlock.h>
6637+#include <linux/sched.h> /* for struct task_struct */
6638+
6639+/* reiser4 per-thread context */
6640+struct reiser4_context {
6641+ /* magic constant. For identification of reiser4 contexts. */
6642+ __u32 magic;
6643+
6644+ /* current lock stack. See lock.[ch]. This is where the list of all
6645+ locks taken by the current thread is kept. This is also used in
6646+ deadlock detection. */
6647+ lock_stack stack;
6648+
6649+ /* current transcrash. */
6650+ txn_handle *trans;
6651+ /* transaction handle embedded into reiser4_context. ->trans points
6652+ * here by default. */
6653+ txn_handle trans_in_ctx;
6654+
6655+ /* super block we are working with. To get the current tree
6656+ use &get_super_private (reiser4_get_current_sb ())->tree. */
6657+ struct super_block *super;
6658+
6659+ /* parent fs activation */
6660+ struct fs_activation *outer;
6661+
6662+ /* per-thread grabbed (for further allocation) blocks counter */
6663+ reiser4_block_nr grabbed_blocks;
6664+
6665+ /* list of taps currently monitored. See tap.c */
6666+ struct list_head taps;
6667+
6668+ /* grabbing space is enabled */
6669+ unsigned int grab_enabled:1;
6670+ /* should be set when we are writing dirty nodes to disk in jnode_flush or
6671+ * reiser4_write_logs() */
6672+ unsigned int writeout_mode:1;
6673+ /* true, if current thread is an ent thread */
6674+ unsigned int entd:1;
6675+ /* true, if balance_dirty_pages() should not be run when leaving this
6676+ * context. This is used to avoid a lengthy balance_dirty_pages()
6677+ * operation when holding some important resource, like directory
6678+ * ->i_mutex */
6679+ unsigned int nobalance:1;
6680+
6681+ /* this bit is used in reiser4_done_context to decide whether the context
6682+ is kmalloc-ed and has to be kfree-ed */
6683+ unsigned int on_stack:1;
6684+
6685+ /* count non-trivial jnode_set_dirty() calls */
6686+ unsigned long nr_marked_dirty;
6687+
6688+ /* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
6689+ * reiser4_writepages for each dirty inode. reiser4_writepages
6690+ * captures pages. When the number of pages captured in one
6691+ * reiser4_sync_inodes call reaches some threshold, some atoms get
6692+ * flushed */
6693+ int nr_captured;
6694+ int nr_children; /* number of child contexts */
6695+#if REISER4_DEBUG
6696+ /* debugging information about reiser4 locks held by the current
6697+ * thread */
6698+ reiser4_lock_cnt_info locks;
6699+ struct task_struct *task; /* so we can easily find owner of the stack */
6700+
6701+ /*
6702+ * disk space grabbing debugging support
6703+ */
6704+ /* how many disk blocks were grabbed by the first call to
6705+ * reiser4_grab_space() in this context */
6706+ reiser4_block_nr grabbed_initially;
6707+
6708+ /* list of all threads doing flush currently */
6709+ struct list_head flushers_link;
6710+ /* information about last error encountered by reiser4 */
6711+ err_site err;
6712+#endif
6713+ void *vp;
6714+ gfp_t gfp_mask;
6715+};
6716+
6717+extern reiser4_context *get_context_by_lock_stack(lock_stack *);
6718+
6719+/* Debugging helps. */
6720+#if REISER4_DEBUG
6721+extern void print_contexts(void);
6722+#endif
6723+
6724+#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
6725+#define current_blocksize reiser4_get_current_sb()->s_blocksize
6726+#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
6727+
6728+extern reiser4_context *reiser4_init_context(struct super_block *);
6729+extern void init_stack_context(reiser4_context *, struct super_block *);
6730+extern void reiser4_exit_context(reiser4_context *);
6731+
6732+/* magic constant we store in reiser4_context allocated on the stack. Used to
6733+ catch accesses to stale or uninitialized contexts. */
6734+#define context_magic ((__u32) 0x4b1b5d0b)
6735+
6736+extern int is_in_reiser4_context(void);
6737+
6738+/*
6739+ * return reiser4_context for the thread @tsk
6740+ */
6741+static inline reiser4_context *get_context(const struct task_struct *tsk)
6742+{
6743+ assert("vs-1682",
6744+ ((reiser4_context *) tsk->journal_info)->magic == context_magic);
6745+ return (reiser4_context *) tsk->journal_info;
6746+}
6747+
6748+/*
6749+ * return reiser4 context of the current thread, or NULL if there is none.
6750+ */
6751+static inline reiser4_context *get_current_context_check(void)
6752+{
6753+ if (is_in_reiser4_context())
6754+ return get_context(current);
6755+ else
6756+ return NULL;
6757+}
6758+
6759+static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */
6760+
6761+/* return context associated with current thread */
6762+static inline reiser4_context *get_current_context(void)
6763+{
6764+ return get_context(current);
6765+}
6766+
6767+static inline gfp_t reiser4_ctx_gfp_mask_get(void)
6768+{
6769+ reiser4_context *ctx;
6770+
6771+ ctx = get_current_context_check();
6772+ return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
6773+}
6774+
6775+void reiser4_ctx_gfp_mask_set(void);
6776+void reiser4_ctx_gfp_mask_force (gfp_t mask);
6777+
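+/* Editorial sketch, not part of the original patch: allocations made while a
+ * reiser4 context is active should honor the per-context mask, so that code
+ * running with an atom captured (or in entd, or with locks held) does not
+ * recurse into the filesystem. The kmalloc() call is only an illustration.
+ */
+#if 0
+	buf = kmalloc(size, reiser4_ctx_gfp_mask_get());
+#endif
+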
6778+/*
6779+ * true if current thread is in the write-out mode. Thread enters write-out
6780+ * mode during jnode_flush and reiser4_write_logs().
6781+ */
6782+static inline int is_writeout_mode(void)
6783+{
6784+ return get_current_context()->writeout_mode;
6785+}
6786+
6787+/*
6788+ * enter write-out mode
6789+ */
6790+static inline void writeout_mode_enable(void)
6791+{
6792+ assert("zam-941", !get_current_context()->writeout_mode);
6793+ get_current_context()->writeout_mode = 1;
6794+}
6795+
6796+/*
6797+ * leave write-out mode
6798+ */
6799+static inline void writeout_mode_disable(void)
6800+{
6801+ assert("zam-942", get_current_context()->writeout_mode);
6802+ get_current_context()->writeout_mode = 0;
6803+}
6804+
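+/* Editorial sketch, not part of the original patch: how jnode_flush() and
+ * reiser4_write_logs() are expected to bracket their work, per the comment
+ * on is_writeout_mode() above.
+ */
+#if 0
+	writeout_mode_enable();
+	/* ... submit dirty nodes to disk ... */
+	writeout_mode_disable();
+#endif
+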
6805+static inline void grab_space_enable(void)
6806+{
6807+ get_current_context()->grab_enabled = 1;
6808+}
6809+
6810+static inline void grab_space_disable(void)
6811+{
6812+ get_current_context()->grab_enabled = 0;
6813+}
6814+
6815+static inline void grab_space_set_enabled(int enabled)
6816+{
6817+ get_current_context()->grab_enabled = enabled;
6818+}
6819+
6820+static inline int is_grab_enabled(reiser4_context * ctx)
6821+{
6822+ return ctx->grab_enabled;
6823+}
6824+
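+/* Editorial sketch, not part of the original patch: code that must not grab
+ * any more disk space (for example, because everything it needs has already
+ * been reserved) can bracket itself with the helpers above.
+ */
+#if 0
+	grab_space_disable();
+	/* ... operation satisfied from already grabbed blocks ... */
+	grab_space_enable();
+#endif
+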
6825+/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or
6826+ * flush would be performed when it is closed. This is necessary when handle
6827+ * has to be closed under some coarse semaphore, like i_mutex of
6828+ * directory. Commit will be performed by ktxnmgrd. */
6829+static inline void context_set_commit_async(reiser4_context * context)
6830+{
6831+ context->nobalance = 1;
6832+ context->trans->flags |= TXNH_DONT_COMMIT;
6833+}
6834+
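+/* Editorial sketch, not part of the original patch: closing a context while
+ * a directory ->i_mutex is held, as the comment above describes; the commit
+ * is left to ktxnmgrd. The surrounding code is hypothetical.
+ */
+#if 0
+	mutex_lock(&dir->i_mutex);
+	/* ... directory modification under a reiser4 context ... */
+	context_set_commit_async(ctx);	/* no commit, no balancing on exit */
+	reiser4_exit_context(ctx);
+	mutex_unlock(&dir->i_mutex);
+#endif
+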
6835+/* __REISER4_CONTEXT_H__ */
6836+#endif
6837+
6838+/* Make Linus happy.
6839+ Local variables:
6840+ c-indentation-style: "K&R"
6841+ mode-name: "LC"
6842+ c-basic-offset: 8
6843+ tab-width: 8
6844+ fill-column: 120
6845+ scroll-step: 1
6846+ End:
6847+*/
6848diff -urN linux-2.6.23.orig/fs/reiser4/coord.c linux-2.6.23/fs/reiser4/coord.c
6849--- linux-2.6.23.orig/fs/reiser4/coord.c 1970-01-01 03:00:00.000000000 +0300
6850+++ linux-2.6.23/fs/reiser4/coord.c 2007-12-04 16:49:30.000000000 +0300
6851@@ -0,0 +1,935 @@
6852+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6853+
6854+#include "forward.h"
6855+#include "debug.h"
6856+#include "dformat.h"
6857+#include "tree.h"
6858+#include "plugin/item/item.h"
6859+#include "znode.h"
6860+#include "coord.h"
6861+
6862+/* Internal constructor. */
6863+static inline void
6864+coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos,
6865+ pos_in_node_t unit_pos, between_enum between)
6866+{
6867+ coord->node = (znode *) node;
6868+ coord_set_item_pos(coord, item_pos);
6869+ coord->unit_pos = unit_pos;
6870+ coord->between = between;
6871+ ON_DEBUG(coord->plug_v = 0);
6872+ ON_DEBUG(coord->body_v = 0);
6873+
6874+ /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
6875+}
6876+
6877+/* after shifting of node content, a coord previously set properly may become
6878+ invalid; try to "normalize" it. */
6879+void coord_normalize(coord_t * coord)
6880+{
6881+ znode *node;
6882+
6883+ node = coord->node;
6884+ assert("vs-683", node);
6885+
6886+ coord_clear_iplug(coord);
6887+
6888+ if (node_is_empty(node)) {
6889+ coord_init_first_unit(coord, node);
6890+ } else if ((coord->between == AFTER_ITEM)
6891+ || (coord->between == AFTER_UNIT)) {
6892+ return;
6893+ } else if (coord->item_pos == coord_num_items(coord)
6894+ && coord->between == BEFORE_ITEM) {
6895+ coord_dec_item_pos(coord);
6896+ coord->between = AFTER_ITEM;
6897+ } else if (coord->unit_pos == coord_num_units(coord)
6898+ && coord->between == BEFORE_UNIT) {
6899+ coord->unit_pos--;
6900+ coord->between = AFTER_UNIT;
6901+ } else if (coord->item_pos == coord_num_items(coord)
6902+ && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
6903+ coord_dec_item_pos(coord);
6904+ coord->unit_pos = 0;
6905+ coord->between = AFTER_ITEM;
6906+ }
6907+}
6908+
6909+/* Copy a coordinate. */
6910+void coord_dup(coord_t * coord, const coord_t * old_coord)
6911+{
6912+ assert("jmacd-9800", coord_check(old_coord));
6913+ coord_dup_nocheck(coord, old_coord);
6914+}
6915+
6916+/* Copy a coordinate without check. Useful when old_coord->node is not
6917+ loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
6918+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
6919+{
6920+ coord->node = old_coord->node;
6921+ coord_set_item_pos(coord, old_coord->item_pos);
6922+ coord->unit_pos = old_coord->unit_pos;
6923+ coord->between = old_coord->between;
6924+ coord->iplugid = old_coord->iplugid;
6925+ ON_DEBUG(coord->plug_v = old_coord->plug_v);
6926+ ON_DEBUG(coord->body_v = old_coord->body_v);
6927+}
6928+
6929+/* Initialize an invalid coordinate. */
6930+void coord_init_invalid(coord_t * coord, const znode * node)
6931+{
6932+ coord_init_values(coord, node, 0, 0, INVALID_COORD);
6933+}
6934+
6935+void coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
6936+{
6937+ coord_init_values(coord, node, 0, 0, AT_UNIT);
6938+}
6939+
6940+/* Initialize a coordinate to point at the first unit of the first item. If the node is
6941+ empty, it is positioned at the EMPTY_NODE. */
6942+void coord_init_first_unit(coord_t * coord, const znode * node)
6943+{
6944+ int is_empty = node_is_empty(node);
6945+
6946+ coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
6947+
6948+ assert("jmacd-9801", coord_check(coord));
6949+}
6950+
6951+/* Initialize a coordinate to point at the last unit of the last item. If the node is
6952+ empty, it is positioned at the EMPTY_NODE. */
6953+void coord_init_last_unit(coord_t * coord, const znode * node)
6954+{
6955+ int is_empty = node_is_empty(node);
6956+
6957+ coord_init_values(coord, node,
6958+ (is_empty ? 0 : node_num_items(node) - 1), 0,
6959+ (is_empty ? EMPTY_NODE : AT_UNIT));
6960+ if (!is_empty)
6961+ coord->unit_pos = coord_last_unit_pos(coord);
6962+ assert("jmacd-9802", coord_check(coord));
6963+}
6964+
6965+/* Initialize a coordinate to before the first item. If the node is empty, it is
6966+ positioned at the EMPTY_NODE. */
6967+void coord_init_before_first_item(coord_t * coord, const znode * node)
6968+{
6969+ int is_empty = node_is_empty(node);
6970+
6971+ coord_init_values(coord, node, 0, 0,
6972+ (is_empty ? EMPTY_NODE : BEFORE_UNIT));
6973+
6974+ assert("jmacd-9803", coord_check(coord));
6975+}
6976+
6977+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
6978+ at the EMPTY_NODE. */
6979+void coord_init_after_last_item(coord_t * coord, const znode * node)
6980+{
6981+ int is_empty = node_is_empty(node);
6982+
6983+ coord_init_values(coord, node,
6984+ (is_empty ? 0 : node_num_items(node) - 1), 0,
6985+ (is_empty ? EMPTY_NODE : AFTER_ITEM));
6986+
6987+ assert("jmacd-9804", coord_check(coord));
6988+}
6989+
6990+/* Initialize a coordinate to after the last unit in the item. Coord must
6991+ already be set to an existing item */
6992+void coord_init_after_item_end(coord_t * coord)
6993+{
6994+ coord->between = AFTER_UNIT;
6995+ coord->unit_pos = coord_last_unit_pos(coord);
6996+}
6997+
6998+/* Initialize a coordinate to before the item. Coord must already be set to an existing item */
6999+void coord_init_before_item(coord_t * coord)
7000+{
7001+ coord->unit_pos = 0;
7002+ coord->between = BEFORE_ITEM;
7003+}
7004+
7005+/* Initialize a coordinate to after the item. Coord must already be set to an existing item */
7006+void coord_init_after_item(coord_t * coord)
7007+{
7008+ coord->unit_pos = 0;
7009+ coord->between = AFTER_ITEM;
7010+}
7011+
7012+/* Initialize a coordinate with 0s. Used in places where init_coord was used
7013+ and it was not clear how exactly the coord should be initialized */
7014+void coord_init_zero(coord_t * coord)
7015+{
7016+ memset(coord, 0, sizeof(*coord));
7017+}
7018+
7019+/* Return the number of units at the present item. Asserts coord_is_existing_item(). */
7020+unsigned coord_num_units(const coord_t * coord)
7021+{
7022+ assert("jmacd-9806", coord_is_existing_item(coord));
7023+
7024+ return item_plugin_by_coord(coord)->b.nr_units(coord);
7025+}
7026+
7027+/* Returns true if the coord was initialized by coord_init_invalid (). */
7028+/* Audited by: green(2002.06.15) */
7029+int coord_is_invalid(const coord_t * coord)
7030+{
7031+ return coord->between == INVALID_COORD;
7032+}
7033+
7034+/* Returns true if the coordinate is positioned at an existing item, not before or after
7035+ an item. It may be placed at, before, or after any unit within the item, whether
7036+ existing or not. */
7037+int coord_is_existing_item(const coord_t * coord)
7038+{
7039+ switch (coord->between) {
7040+ case EMPTY_NODE:
7041+ case BEFORE_ITEM:
7042+ case AFTER_ITEM:
7043+ case INVALID_COORD:
7044+ return 0;
7045+
7046+ case BEFORE_UNIT:
7047+ case AT_UNIT:
7048+ case AFTER_UNIT:
7049+ return coord->item_pos < coord_num_items(coord);
7050+ }
7051+
7052+ impossible("jmacd-9900", "unreachable coord: %p", coord);
7053+ return 0;
7054+}
7055+
7056+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7057+ unit. */
7058+/* Audited by: green(2002.06.15) */
7059+int coord_is_existing_unit(const coord_t * coord)
7060+{
7061+ switch (coord->between) {
7062+ case EMPTY_NODE:
7063+ case BEFORE_UNIT:
7064+ case AFTER_UNIT:
7065+ case BEFORE_ITEM:
7066+ case AFTER_ITEM:
7067+ case INVALID_COORD:
7068+ return 0;
7069+
7070+ case AT_UNIT:
7071+ return (coord->item_pos < coord_num_items(coord)
7072+ && coord->unit_pos < coord_num_units(coord));
7073+ }
7074+
7075+ impossible("jmacd-9902", "unreachable");
7076+ return 0;
7077+}
7078+
7079+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7080+ true for empty nodes nor coordinates positioned before the first item. */
7081+/* Audited by: green(2002.06.15) */
7082+int coord_is_leftmost_unit(const coord_t * coord)
7083+{
7084+ return (coord->between == AT_UNIT && coord->item_pos == 0
7085+ && coord->unit_pos == 0);
7086+}
7087+
7088+#if REISER4_DEBUG
7089+/* For assertions only, checks for a valid coordinate. */
7090+int coord_check(const coord_t * coord)
7091+{
7092+ if (coord->node == NULL) {
7093+ return 0;
7094+ }
7095+ if (znode_above_root(coord->node))
7096+ return 1;
7097+
7098+ switch (coord->between) {
7099+ default:
7100+ case INVALID_COORD:
7101+ return 0;
7102+ case EMPTY_NODE:
7103+ if (!node_is_empty(coord->node)) {
7104+ return 0;
7105+ }
7106+ return coord->item_pos == 0 && coord->unit_pos == 0;
7107+
7108+ case BEFORE_UNIT:
7109+ case AFTER_UNIT:
7110+ if (node_is_empty(coord->node) && (coord->item_pos == 0)
7111+ && (coord->unit_pos == 0))
7112+ return 1;
7113+ case AT_UNIT:
7114+ break;
7115+ case AFTER_ITEM:
7116+ case BEFORE_ITEM:
7117+ /* before/after item should not set unit_pos. */
7118+ if (coord->unit_pos != 0) {
7119+ return 0;
7120+ }
7121+ break;
7122+ }
7123+
7124+ if (coord->item_pos >= node_num_items(coord->node)) {
7125+ return 0;
7126+ }
7127+
7128+ /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7129+ between is set to either AFTER_ITEM or BEFORE_ITEM */
7130+ if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7131+ return 1;
7132+
7133+ if (coord_is_iplug_set(coord) &&
7134+ coord->unit_pos >
7135+ item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
7136+ return 0;
7137+ }
7138+ return 1;
7139+}
7140+#endif
7141+
7142+/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
7143+ Returns 1 if the new position does not exist. */
7144+static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
7145+{
7146+ /* If the node is invalid, leave it. */
7147+ if (coord->between == INVALID_COORD) {
7148+ return 1;
7149+ }
7150+
7151+ /* If the node is empty, set it appropriately. */
7152+ if (items == 0) {
7153+ coord->between = EMPTY_NODE;
7154+ coord_set_item_pos(coord, 0);
7155+ coord->unit_pos = 0;
7156+ return 1;
7157+ }
7158+
7159+ /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7160+ if (coord->between == EMPTY_NODE) {
7161+ coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7162+ coord_set_item_pos(coord, 0);
7163+ coord->unit_pos = 0;
7164+ return 0;
7165+ }
7166+
7167+ /* If the item_pos is out-of-range, set it appropriately. */
7168+ if (coord->item_pos >= items) {
7169+ coord->between = AFTER_ITEM;
7170+ coord_set_item_pos(coord, items - 1);
7171+ coord->unit_pos = 0;
7172+ /* If is_next, return 1 (can't go any further). */
7173+ return is_next;
7174+ }
7175+
7176+ return 0;
7177+}
7178+
7179+/* Advances the coordinate by one unit to the right. If empty, no change. If
7180+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an
7181+ existing unit. */
7182+int coord_next_unit(coord_t * coord)
7183+{
7184+ unsigned items = coord_num_items(coord);
7185+
7186+ if (coord_adjust_items(coord, items, 1) == 1) {
7187+ return 1;
7188+ }
7189+
7190+ switch (coord->between) {
7191+ case BEFORE_UNIT:
7192+ /* Now it is positioned at the same unit. */
7193+ coord->between = AT_UNIT;
7194+ return 0;
7195+
7196+ case AFTER_UNIT:
7197+ case AT_UNIT:
7198+ /* If it was at or after a unit and there are more units in this item,
7199+ advance to the next one. */
7200+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
7201+ coord->unit_pos += 1;
7202+ coord->between = AT_UNIT;
7203+ return 0;
7204+ }
7205+
7206+ /* Otherwise, it is crossing an item boundary and treated as if it was
7207+ after the current item. */
7208+ coord->between = AFTER_ITEM;
7209+ coord->unit_pos = 0;
7210+ /* FALLTHROUGH */
7211+
7212+ case AFTER_ITEM:
7213+ /* Check for end-of-node. */
7214+ if (coord->item_pos == items - 1) {
7215+ return 1;
7216+ }
7217+
7218+ coord_inc_item_pos(coord);
7219+ coord->unit_pos = 0;
7220+ coord->between = AT_UNIT;
7221+ return 0;
7222+
7223+ case BEFORE_ITEM:
7224+ /* The adjust_items checks ensure that we are valid here. */
7225+ coord->unit_pos = 0;
7226+ coord->between = AT_UNIT;
7227+ return 0;
7228+
7229+ case INVALID_COORD:
7230+ case EMPTY_NODE:
7231+ /* Handled in coord_adjust_items(). */
7232+ break;
7233+ }
7234+
7235+ impossible("jmacd-9902", "unreachable");
7236+ return 0;
7237+}
7238+
7239+/* Advances the coordinate by one item to the right. If empty, no change. If
7240+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
7241+ an existing item. */
7242+int coord_next_item(coord_t * coord)
7243+{
7244+ unsigned items = coord_num_items(coord);
7245+
7246+ if (coord_adjust_items(coord, items, 1) == 1) {
7247+ return 1;
7248+ }
7249+
7250+ switch (coord->between) {
7251+ case AFTER_UNIT:
7252+ case AT_UNIT:
7253+ case BEFORE_UNIT:
7254+ case AFTER_ITEM:
7255+ /* Check for end-of-node. */
7256+ if (coord->item_pos == items - 1) {
7257+ coord->between = AFTER_ITEM;
7258+ coord->unit_pos = 0;
7259+ coord_clear_iplug(coord);
7260+ return 1;
7261+ }
7262+
7263+ /* Anywhere in an item, go to the next one. */
7264+ coord->between = AT_UNIT;
7265+ coord_inc_item_pos(coord);
7266+ coord->unit_pos = 0;
7267+ return 0;
7268+
7269+ case BEFORE_ITEM:
7270+ /* The out-of-range check ensures that we are valid here. */
7271+ coord->unit_pos = 0;
7272+ coord->between = AT_UNIT;
7273+ return 0;
7274+ case INVALID_COORD:
7275+ case EMPTY_NODE:
7276+ /* Handled in coord_adjust_items(). */
7277+ break;
7278+ }
7279+
7280+ impossible("jmacd-9903", "unreachable");
7281+ return 0;
7282+}
7283+
7284+/* Advances the coordinate by one unit to the left. If empty, no change. If
7285+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7286+ is an existing unit. */
7287+int coord_prev_unit(coord_t * coord)
7288+{
7289+ unsigned items = coord_num_items(coord);
7290+
7291+ if (coord_adjust_items(coord, items, 0) == 1) {
7292+ return 1;
7293+ }
7294+
7295+ switch (coord->between) {
7296+ case AT_UNIT:
7297+ case BEFORE_UNIT:
7298+ if (coord->unit_pos > 0) {
7299+ coord->unit_pos -= 1;
7300+ coord->between = AT_UNIT;
7301+ return 0;
7302+ }
7303+
7304+ if (coord->item_pos == 0) {
7305+ coord->between = BEFORE_ITEM;
7306+ return 1;
7307+ }
7308+
7309+ coord_dec_item_pos(coord);
7310+ coord->unit_pos = coord_last_unit_pos(coord);
7311+ coord->between = AT_UNIT;
7312+ return 0;
7313+
7314+ case AFTER_UNIT:
7315+ /* What if unit_pos is out-of-range? */
7316+ assert("jmacd-5442",
7317+ coord->unit_pos <= coord_last_unit_pos(coord));
7318+ coord->between = AT_UNIT;
7319+ return 0;
7320+
7321+ case BEFORE_ITEM:
7322+ if (coord->item_pos == 0) {
7323+ return 1;
7324+ }
7325+
7326+ coord_dec_item_pos(coord);
7327+ /* FALLTHROUGH */
7328+
7329+ case AFTER_ITEM:
7330+ coord->between = AT_UNIT;
7331+ coord->unit_pos = coord_last_unit_pos(coord);
7332+ return 0;
7333+
7334+ case INVALID_COORD:
7335+ case EMPTY_NODE:
7336+ break;
7337+ }
7338+
7339+ impossible("jmacd-9904", "unreachable");
7340+ return 0;
7341+}
7342+
7343+/* Advances the coordinate by one item to the left. If empty, no change. If
7344+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7345+ is an existing item. */
7346+int coord_prev_item(coord_t * coord)
7347+{
7348+ unsigned items = coord_num_items(coord);
7349+
7350+ if (coord_adjust_items(coord, items, 0) == 1) {
7351+ return 1;
7352+ }
7353+
7354+ switch (coord->between) {
7355+ case AT_UNIT:
7356+ case AFTER_UNIT:
7357+ case BEFORE_UNIT:
7358+ case BEFORE_ITEM:
7359+
7360+ if (coord->item_pos == 0) {
7361+ coord->between = BEFORE_ITEM;
7362+ coord->unit_pos = 0;
7363+ return 1;
7364+ }
7365+
7366+ coord_dec_item_pos(coord);
7367+ coord->unit_pos = 0;
7368+ coord->between = AT_UNIT;
7369+ return 0;
7370+
7371+ case AFTER_ITEM:
7372+ coord->between = AT_UNIT;
7373+ coord->unit_pos = 0;
7374+ return 0;
7375+
7376+ case INVALID_COORD:
7377+ case EMPTY_NODE:
7378+ break;
7379+ }
7380+
7381+ impossible("jmacd-9905", "unreachable");
7382+ return 0;
7383+}
7384+
7385+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7386+void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
7387+{
7388+ assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7389+ if (dir == LEFT_SIDE) {
7390+ coord_init_first_unit(coord, node);
7391+ } else {
7392+ coord_init_last_unit(coord, node);
7393+ }
7394+}
7395+
7396+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
7397+ argument. */
7398+/* Audited by: green(2002.06.15) */
7399+int coord_is_after_sideof_unit(coord_t * coord, sideof dir)
7400+{
7401+ assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7402+ if (dir == LEFT_SIDE) {
7403+ return coord_is_before_leftmost(coord);
7404+ } else {
7405+ return coord_is_after_rightmost(coord);
7406+ }
7407+}
7408+
7409+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
7410+/* Audited by: green(2002.06.15) */
7411+int coord_sideof_unit(coord_t * coord, sideof dir)
7412+{
7413+ assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7414+ if (dir == LEFT_SIDE) {
7415+ return coord_prev_unit(coord);
7416+ } else {
7417+ return coord_next_unit(coord);
7418+ }
7419+}
7420+
7421+#if REISER4_DEBUG
7422+int coords_equal(const coord_t * c1, const coord_t * c2)
7423+{
7424+ assert("nikita-2840", c1 != NULL);
7425+ assert("nikita-2841", c2 != NULL);
7426+
7427+ return
7428+ c1->node == c2->node &&
7429+ c1->item_pos == c2->item_pos &&
7430+ c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7431+}
7432+#endif /* REISER4_DEBUG */
7433+
7434+/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
7435+ return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
7436+/* Audited by: green(2002.06.15) */
7437+coord_wrt_node coord_wrt(const coord_t * coord)
7438+{
7439+ if (coord_is_before_leftmost(coord)) {
7440+ return COORD_ON_THE_LEFT;
7441+ }
7442+
7443+ if (coord_is_after_rightmost(coord)) {
7444+ return COORD_ON_THE_RIGHT;
7445+ }
7446+
7447+ return COORD_INSIDE;
7448+}
7449+
7450+/* Returns true if the coordinate is positioned after the last item or after the last unit
7451+ of the last item or it is an empty node. */
7452+/* Audited by: green(2002.06.15) */
7453+int coord_is_after_rightmost(const coord_t * coord)
7454+{
7455+ assert("jmacd-7313", coord_check(coord));
7456+
7457+ switch (coord->between) {
7458+ case INVALID_COORD:
7459+ case AT_UNIT:
7460+ case BEFORE_UNIT:
7461+ case BEFORE_ITEM:
7462+ return 0;
7463+
7464+ case EMPTY_NODE:
7465+ return 1;
7466+
7467+ case AFTER_ITEM:
7468+ return (coord->item_pos == node_num_items(coord->node) - 1);
7469+
7470+ case AFTER_UNIT:
7471+ return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7472+ coord->unit_pos == coord_last_unit_pos(coord));
7473+ }
7474+
7475+ impossible("jmacd-9908", "unreachable");
7476+ return 0;
7477+}
7478+
7479+/* Returns true if the coordinate is positioned before the first item or it is an empty
7480+ node. */
7481+int coord_is_before_leftmost(const coord_t * coord)
7482+{
7483+ /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7484+ necessary to check if coord is set before leftmost
7485+ assert ("jmacd-7313", coord_check (coord)); */
7486+ switch (coord->between) {
7487+ case INVALID_COORD:
7488+ case AT_UNIT:
7489+ case AFTER_ITEM:
7490+ case AFTER_UNIT:
7491+ return 0;
7492+
7493+ case EMPTY_NODE:
7494+ return 1;
7495+
7496+ case BEFORE_ITEM:
7497+ case BEFORE_UNIT:
7498+ return (coord->item_pos == 0) && (coord->unit_pos == 0);
7499+ }
7500+
7501+ impossible("jmacd-9908", "unreachable");
7502+ return 0;
7503+}
7504+
7505+/* Returns true if the coordinate is positioned after an item, before an item, after the
7506+ last unit of an item, before the first unit of an item, or at an empty node. */
7507+/* Audited by: green(2002.06.15) */
7508+int coord_is_between_items(const coord_t * coord)
7509+{
7510+ assert("jmacd-7313", coord_check(coord));
7511+
7512+ switch (coord->between) {
7513+ case INVALID_COORD:
7514+ case AT_UNIT:
7515+ return 0;
7516+
7517+ case AFTER_ITEM:
7518+ case BEFORE_ITEM:
7519+ case EMPTY_NODE:
7520+ return 1;
7521+
7522+ case BEFORE_UNIT:
7523+ return coord->unit_pos == 0;
7524+
7525+ case AFTER_UNIT:
7526+ return coord->unit_pos == coord_last_unit_pos(coord);
7527+ }
7528+
7529+ impossible("jmacd-9908", "unreachable");
7530+ return 0;
7531+}
7532+
7533+#if REISER4_DEBUG
7534+/* Returns true if the coordinates are positioned at adjacent units, regardless of
7535+ before-after or item boundaries. */
7536+int coord_are_neighbors(coord_t * c1, coord_t * c2)
7537+{
7538+ coord_t *left;
7539+ coord_t *right;
7540+
7541+ assert("nikita-1241", c1 != NULL);
7542+ assert("nikita-1242", c2 != NULL);
7543+ assert("nikita-1243", c1->node == c2->node);
7544+ assert("nikita-1244", coord_is_existing_unit(c1));
7545+ assert("nikita-1245", coord_is_existing_unit(c2));
7546+
7547+ left = right = NULL;
7548+ switch (coord_compare(c1, c2)) {
7549+ case COORD_CMP_ON_LEFT:
7550+ left = c1;
7551+ right = c2;
7552+ break;
7553+ case COORD_CMP_ON_RIGHT:
7554+ left = c2;
7555+ right = c1;
7556+ break;
7557+ case COORD_CMP_SAME:
7558+ return 0;
7559+ default:
7560+ wrong_return_value("nikita-1246", "compare_coords()");
7561+ }
7562+ assert("vs-731", left && right);
7563+ if (left->item_pos == right->item_pos) {
7564+ return left->unit_pos + 1 == right->unit_pos;
7565+ } else if (left->item_pos + 1 == right->item_pos) {
7566+ return (left->unit_pos == coord_last_unit_pos(left))
7567+ && (right->unit_pos == 0);
7568+ } else {
7569+ return 0;
7570+ }
7571+}
7572+#endif /* REISER4_DEBUG */
7573+
7574+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
7575+ COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
7576+/* Audited by: green(2002.06.15) */
7577+coord_cmp coord_compare(coord_t * c1, coord_t * c2)
7578+{
7579+ assert("vs-209", c1->node == c2->node);
7580+ assert("vs-194", coord_is_existing_unit(c1)
7581+ && coord_is_existing_unit(c2));
7582+
7583+ if (c1->item_pos > c2->item_pos)
7584+ return COORD_CMP_ON_RIGHT;
7585+ if (c1->item_pos < c2->item_pos)
7586+ return COORD_CMP_ON_LEFT;
7587+ if (c1->unit_pos > c2->unit_pos)
7588+ return COORD_CMP_ON_RIGHT;
7589+ if (c1->unit_pos < c2->unit_pos)
7590+ return COORD_CMP_ON_LEFT;
7591+ return COORD_CMP_SAME;
7592+}
7593+
7594+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
7595+ non-zero if there is no position to the right. */
7596+int coord_set_to_right(coord_t * coord)
7597+{
7598+ unsigned items = coord_num_items(coord);
7599+
7600+ if (coord_adjust_items(coord, items, 1) == 1) {
7601+ return 1;
7602+ }
7603+
7604+ switch (coord->between) {
7605+ case AT_UNIT:
7606+ return 0;
7607+
7608+ case BEFORE_ITEM:
7609+ case BEFORE_UNIT:
7610+ coord->between = AT_UNIT;
7611+ return 0;
7612+
7613+ case AFTER_UNIT:
7614+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
7615+ coord->unit_pos += 1;
7616+ coord->between = AT_UNIT;
7617+ return 0;
7618+ } else {
7619+
7620+ coord->unit_pos = 0;
7621+
7622+ if (coord->item_pos == items - 1) {
7623+ coord->between = AFTER_ITEM;
7624+ return 1;
7625+ }
7626+
7627+ coord_inc_item_pos(coord);
7628+ coord->between = AT_UNIT;
7629+ return 0;
7630+ }
7631+
7632+ case AFTER_ITEM:
7633+ if (coord->item_pos == items - 1) {
7634+ return 1;
7635+ }
7636+
7637+ coord_inc_item_pos(coord);
7638+ coord->unit_pos = 0;
7639+ coord->between = AT_UNIT;
7640+ return 0;
7641+
7642+ case EMPTY_NODE:
7643+ return 1;
7644+
7645+ case INVALID_COORD:
7646+ break;
7647+ }
7648+
7649+ impossible("jmacd-9920", "unreachable");
7650+ return 0;
7651+}
7652+
7653+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
7654+ non-zero if there is no position to the left. */
7655+int coord_set_to_left(coord_t * coord)
7656+{
7657+ unsigned items = coord_num_items(coord);
7658+
7659+ if (coord_adjust_items(coord, items, 0) == 1) {
7660+ return 1;
7661+ }
7662+
7663+ switch (coord->between) {
7664+ case AT_UNIT:
7665+ return 0;
7666+
7667+ case AFTER_UNIT:
7668+ coord->between = AT_UNIT;
7669+ return 0;
7670+
7671+ case AFTER_ITEM:
7672+ coord->between = AT_UNIT;
7673+ coord->unit_pos = coord_last_unit_pos(coord);
7674+ return 0;
7675+
7676+ case BEFORE_UNIT:
7677+ if (coord->unit_pos > 0) {
7678+ coord->unit_pos -= 1;
7679+ coord->between = AT_UNIT;
7680+ return 0;
7681+ } else {
7682+
7683+ if (coord->item_pos == 0) {
7684+ coord->between = BEFORE_ITEM;
7685+ return 1;
7686+ }
7687+
7688+ coord->unit_pos = coord_last_unit_pos(coord);
7689+ coord_dec_item_pos(coord);
7690+ coord->between = AT_UNIT;
7691+ return 0;
7692+ }
7693+
7694+ case BEFORE_ITEM:
7695+ if (coord->item_pos == 0) {
7696+ return 1;
7697+ }
7698+
7699+ coord_dec_item_pos(coord);
7700+ coord->unit_pos = coord_last_unit_pos(coord);
7701+ coord->between = AT_UNIT;
7702+ return 0;
7703+
7704+ case EMPTY_NODE:
7705+ return 1;
7706+
7707+ case INVALID_COORD:
7708+ break;
7709+ }
7710+
7711+ impossible("jmacd-9920", "unreachable");
7712+ return 0;
7713+}
7714+
7715+static const char *coord_tween_tostring(between_enum n)
7716+{
7717+ switch (n) {
7718+ case BEFORE_UNIT:
7719+ return "before unit";
7720+ case BEFORE_ITEM:
7721+ return "before item";
7722+ case AT_UNIT:
7723+ return "at unit";
7724+ case AFTER_UNIT:
7725+ return "after unit";
7726+ case AFTER_ITEM:
7727+ return "after item";
7728+ case EMPTY_NODE:
7729+ return "empty node";
7730+ case INVALID_COORD:
7731+ return "invalid";
7732+ default:
7733+ {
7734+ static char buf[30];
7735+
7736+ sprintf(buf, "unknown: %i", n);
7737+ return buf;
7738+ }
7739+ }
7740+}
7741+
7742+void print_coord(const char *mes, const coord_t * coord, int node)
7743+{
7744+ if (coord == NULL) {
7745+ printk("%s: null\n", mes);
7746+ return;
7747+ }
7748+ printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
7749+ mes, coord->item_pos, coord->unit_pos,
7750+ coord_tween_tostring(coord->between), coord->iplugid);
7751+}
7752+
7753+int
7754+item_utmost_child_real_block(const coord_t * coord, sideof side,
7755+ reiser4_block_nr * blk)
7756+{
7757+ return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
7758+ side,
7759+ blk);
7760+}
7761+
7762+int item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
7763+{
7764+ return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
7765+}
7766+
7767+/* @count bytes of flow @f got written; update f->length, f->data and
7768+ f->key correspondingly */
7769+void move_flow_forward(flow_t * f, unsigned count)
7770+{
7771+ if (f->data)
7772+ f->data += count;
7773+ f->length -= count;
7774+ set_key_offset(&f->key, get_key_offset(&f->key) + count);
7775+}
7776+
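+/* Editorial sketch, not part of the original patch: how a write path might
+ * consume a flow with move_flow_forward(). write_chunk() is a hypothetical
+ * helper returning how many bytes it stored into the tree.
+ */
+#if 0
+	while (f->length > 0) {
+		int written = write_chunk(f->data, f->length);
+
+		if (written <= 0)
+			break;
+		/* advances ->data, shrinks ->length, shifts the key offset */
+		move_flow_forward(f, written);
+	}
+#endif
+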
7777+/*
7778+ Local variables:
7779+ c-indentation-style: "K&R"
7780+ mode-name: "LC"
7781+ c-basic-offset: 8
7782+ tab-width: 8
7783+ fill-column: 120
7784+ scroll-step: 1
7785+ End:
7786+*/
7787diff -urN linux-2.6.23.orig/fs/reiser4/coord.h linux-2.6.23/fs/reiser4/coord.h
7788--- linux-2.6.23.orig/fs/reiser4/coord.h 1970-01-01 03:00:00.000000000 +0300
7789+++ linux-2.6.23/fs/reiser4/coord.h 2007-12-04 16:49:30.000000000 +0300
7790@@ -0,0 +1,389 @@
7791+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7792+
7793+/* Coords */
7794+
7795+#if !defined( __REISER4_COORD_H__ )
7796+#define __REISER4_COORD_H__
7797+
7798+#include "forward.h"
7799+#include "debug.h"
7800+#include "dformat.h"
7801+#include "key.h"
7802+
7803+/* insertions happen between coords in the tree, so we need some means
7804+ of specifying the sense of betweenness. */
7805+typedef enum {
7806+ BEFORE_UNIT, /* Note: init_coord depends on this value being zero. */
7807+ AT_UNIT,
7808+ AFTER_UNIT,
7809+ BEFORE_ITEM,
7810+ AFTER_ITEM,
7811+ INVALID_COORD,
7812+ EMPTY_NODE,
7813+} between_enum;
7814+
7815+/* location of coord w.r.t. its node */
7816+typedef enum {
7817+ COORD_ON_THE_LEFT = -1,
7818+ COORD_ON_THE_RIGHT = +1,
7819+ COORD_INSIDE = 0
7820+} coord_wrt_node;
7821+
7822+typedef enum {
7823+ COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
7824+} coord_cmp;
7825+
7826+struct coord {
7827+ /* node in a tree */
7828+ /* 0 */ znode *node;
7829+
7830+ /* position of item within node */
7831+ /* 4 */ pos_in_node_t item_pos;
7832+ /* position of unit within item */
7833+ /* 6 */ pos_in_node_t unit_pos;
7834+ /* optimization: plugin of item is stored in coord_t. Until this was
7835+ implemented, item_plugin_by_coord() was a major CPU consumer. ->iplugid
7836+ is invalidated (set to 0xff) on each modification of ->item_pos,
7837+ and all such modifications are funneled through coord_*_item_pos()
7838+ functions below.
7839+ */
7840+ /* 8 */ char iplugid;
7841+ /* position of coord w.r.t. to neighboring items and/or units.
7842+ Values are taken from &between_enum above.
7843+ */
7844+ /* 9 */ char between;
7845+ /* padding. It will be added by the compiler anyway to conform to the
7846+ * C language alignment requirements. We keep it here to be on the
7847+ * safe side and to have a clear picture of the memory layout of this
7848+ * structure. */
7849+ /* 10 */ __u16 pad;
7850+ /* 12 */ int offset;
7851+#if REISER4_DEBUG
7852+ unsigned long plug_v;
7853+ unsigned long body_v;
7854+#endif
7855+};
7856+
7857+#define INVALID_PLUGID ((char)((1 << 8) - 1))
7858+#define INVALID_OFFSET -1
7859+
7860+static inline void coord_clear_iplug(coord_t * coord)
7861+{
7862+ assert("nikita-2835", coord != NULL);
7863+ coord->iplugid = INVALID_PLUGID;
7864+ coord->offset = INVALID_OFFSET;
7865+}
7866+
7867+static inline int coord_is_iplug_set(const coord_t * coord)
7868+{
7869+ assert("nikita-2836", coord != NULL);
7870+ return coord->iplugid != INVALID_PLUGID;
7871+}
7872+
7873+static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
7874+{
7875+ assert("nikita-2478", coord != NULL);
7876+ coord->item_pos = pos;
7877+ coord_clear_iplug(coord);
7878+}
7879+
7880+static inline void coord_dec_item_pos(coord_t * coord)
7881+{
7882+ assert("nikita-2480", coord != NULL);
7883+ --coord->item_pos;
7884+ coord_clear_iplug(coord);
7885+}
7886+
7887+static inline void coord_inc_item_pos(coord_t * coord)
7888+{
7889+ assert("nikita-2481", coord != NULL);
7890+ ++coord->item_pos;
7891+ coord_clear_iplug(coord);
7892+}
7893+
7894+static inline void coord_add_item_pos(coord_t * coord, int delta)
7895+{
7896+ assert("nikita-2482", coord != NULL);
7897+ coord->item_pos += delta;
7898+ coord_clear_iplug(coord);
7899+}
7900+
7901+static inline void coord_invalid_item_pos(coord_t * coord)
7902+{
7903+ assert("nikita-2832", coord != NULL);
7904+ coord->item_pos = (unsigned short)~0;
7905+ coord_clear_iplug(coord);
7906+}
7907+
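+/* Editorial sketch, not part of the original patch: ->item_pos must only be
+ * changed through the helpers above, so that the cached ->iplugid is
+ * invalidated consistently (see the comment in struct coord).
+ */
+#if 0
+	coord_inc_item_pos(&coord);	/* ok: clears the cached plugin id */
+	coord.item_pos++;		/* wrong: leaves a stale ->iplugid */
+#endif
+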
7908+/* Reverse a direction. */
7909+static inline sideof sideof_reverse(sideof side)
7910+{
7911+ return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
7912+}
7913+
7914+/* NOTE: There is a somewhat odd mixture of the following opposed terms:
7915+
7916+ "first" and "last"
7917+ "next" and "prev"
7918+ "before" and "after"
7919+ "leftmost" and "rightmost"
7920+
7921+ But I think the chosen names are decent the way they are.
7922+*/
7923+
7924+/* COORD INITIALIZERS */
7925+
7926+/* Initialize an invalid coordinate. */
7927+extern void coord_init_invalid(coord_t * coord, const znode * node);
7928+
7929+extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
7930+
7931+/* Initialize a coordinate to point at the first unit of the first item. If the node is
7932+ empty, it is positioned at the EMPTY_NODE. */
7933+extern void coord_init_first_unit(coord_t * coord, const znode * node);
7934+
7935+/* Initialize a coordinate to point at the last unit of the last item. If the node is
7936+ empty, it is positioned at the EMPTY_NODE. */
7937+extern void coord_init_last_unit(coord_t * coord, const znode * node);
7938+
7939+/* Initialize a coordinate to before the first item. If the node is empty, it is
7940+ positioned at the EMPTY_NODE. */
7941+extern void coord_init_before_first_item(coord_t * coord, const znode * node);
7942+
7943+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
7944+ at the EMPTY_NODE. */
7945+extern void coord_init_after_last_item(coord_t * coord, const znode * node);
7946+
7947+/* Initialize a coordinate to after the last unit in the item. Coord must
7948+ already be set to an existing item */
7949+void coord_init_after_item_end(coord_t * coord);
7950+
7951+/* Initialize a coordinate to before the item. Coord must already be set to an existing item */
7952+void coord_init_before_item(coord_t *);
7953+/* Initialize a coordinate to after the item. Coord must already be set to an existing item */
7954+void coord_init_after_item(coord_t *);
7955+
7956+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7957+extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
7958+ sideof dir);
7959+
7960+/* Initialize a coordinate with 0s. Used in places where init_coord was used
7961+ and it was not clear how exactly the coord should be initialized
7962+ FIXME-VS: added by vs (2002, june, 8) */
7963+extern void coord_init_zero(coord_t * coord);
7964+
7965+/* COORD METHODS */
7966+
7967+/* after shifting of node content, a coord previously set properly may become
7968+ invalid; try to "normalize" it. */
7969+void coord_normalize(coord_t * coord);
7970+
7971+/* Copy a coordinate. */
7972+extern void coord_dup(coord_t * coord, const coord_t * old_coord);
7973+
7974+/* Copy a coordinate without check. */
7975+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
7976+
7977+unsigned coord_num_units(const coord_t * coord);
7978+
7979+/* Return the last valid unit number at the present item (i.e.,
7980+ coord_num_units() - 1). */
7981+static inline unsigned coord_last_unit_pos(const coord_t * coord)
7982+{
7983+ return coord_num_units(coord) - 1;
7984+}
7985+
7986+#if REISER4_DEBUG
7987+/* For assertions only, checks for a valid coordinate. */
7988+extern int coord_check(const coord_t * coord);
7989+
7990+extern unsigned long znode_times_locked(const znode * z);
7991+
7992+static inline void coord_update_v(coord_t * coord)
7993+{
7994+ coord->plug_v = coord->body_v = znode_times_locked(coord->node);
7995+}
7996+#endif
7997+
7998+extern int coords_equal(const coord_t * c1, const coord_t * c2);
7999+
8000+extern void print_coord(const char *mes, const coord_t * coord, int print_node);
8001+
8002+/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
8003+ return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
8004+extern coord_wrt_node coord_wrt(const coord_t * coord);
8005+
8006+/* Returns true if the coordinates are positioned at adjacent units, regardless of
8007+ before-after or item boundaries. */
8008+extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
8009+
8010+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
8011+ COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
8012+extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
8013+
8014+/* COORD PREDICATES */
8015+
8016+/* Returns true if the coord was initialized by coord_init_invalid (). */
8017+extern int coord_is_invalid(const coord_t * coord);
8018+
8019+/* Returns true if the coordinate is positioned at an existing item, not before or after
8020+ an item. It may be placed at, before, or after any unit within the item, whether
8021+ existing or not. If this is true you can call methods of the item plugin. */
8022+extern int coord_is_existing_item(const coord_t * coord);
8023+
8024+/* Returns true if the coordinate is positioned after an item, before an item, after the
8025+ last unit of an item, before the first unit of an item, or at an empty node. */
8026+extern int coord_is_between_items(const coord_t * coord);
8027+
8028+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
8029+ unit. */
8030+extern int coord_is_existing_unit(const coord_t * coord);
8031+
8032+/* Returns true if the coordinate is positioned at an empty node. */
8033+extern int coord_is_empty(const coord_t * coord);
8034+
8035+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
8036+ true for empty nodes nor coordinates positioned before the first item. */
8037+extern int coord_is_leftmost_unit(const coord_t * coord);
8038+
8039+/* Returns true if the coordinate is positioned after the last item or after the last unit
8040+ of the last item or it is an empty node. */
8041+extern int coord_is_after_rightmost(const coord_t * coord);
8042+
8043+/* Returns true if the coordinate is positioned before the first item or it is an empty
8044+ node. */
8045+extern int coord_is_before_leftmost(const coord_t * coord);
8046+
8047+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
8048+ argument. */
8049+extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
8050+
8051+/* COORD MODIFIERS */
8052+
8053+/* Advances the coordinate by one unit to the right. If empty, no change. If
8054+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8055+ an existing unit. */
8056+extern int coord_next_unit(coord_t * coord);
8057+
8058+/* Advances the coordinate by one item to the right. If empty, no change. If
8059+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8060+ an existing item. */
8061+extern int coord_next_item(coord_t * coord);
8062+
8063+/* Advances the coordinate by one unit to the left. If empty, no change. If
8064+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8065+ is an existing unit. */
8066+extern int coord_prev_unit(coord_t * coord);
8067+
8068+/* Advances the coordinate by one item to the left. If empty, no change. If
8069+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8070+ is an existing item. */
8071+extern int coord_prev_item(coord_t * coord);
8072+
8073+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
8074+ non-zero if there is no position to the right. */
8075+extern int coord_set_to_right(coord_t * coord);
8076+
8077+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
8078+ non-zero if there is no position to the left. */
8079+extern int coord_set_to_left(coord_t * coord);
8080+
8081+/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success
8082+ and non-zero if the unit did not exist. */
8083+extern int coord_set_after_unit(coord_t * coord);
8084+
8085+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
8086+extern int coord_sideof_unit(coord_t * coord, sideof dir);
8087+
8088+/* iterate over all units in @node */
8089+#define for_all_units( coord, node ) \
8090+ for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8091+ coord_next_unit( coord ) == 0 ; )
8092+
8093+/* iterate over all items in @node */
8094+#define for_all_items( coord, node ) \
8095+ for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8096+ coord_next_item( coord ) == 0 ; )
8097+
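+/* Editorial sketch, not part of the original patch: using the iteration
+ * macros above. @node is assumed to be a loaded znode.
+ */
+#if 0
+	coord_t coord;
+
+	for_all_units(&coord, node) {
+		/* coord is positioned AT_UNIT on each existing unit in turn */
+	}
+#endif
+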
8098+/* COORD/ITEM METHODS */
8099+
8100+extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
8101+ reiser4_block_nr * blk);
8102+extern int item_utmost_child(const coord_t * coord, sideof side,
8103+ jnode ** child);
8104+
8105+/* a flow is a sequence of bytes being written to or read from the tree. The
8106+ tree will slice the flow into items while storing it into nodes, but all of
8107+ that is hidden from anything outside the tree. */
8108+
8109+struct flow {
8110+ reiser4_key key; /* key of start of flow's sequence of bytes */
8111+ loff_t length; /* length of flow's sequence of bytes */
8112+ char *data; /* start of flow's sequence of bytes */
8113+ int user; /* if 1 data is user space, 0 - kernel space */
8114+ rw_op op; /* NIKITA-FIXME-HANS: comment is where? */
8115+};
8116+
8117+void move_flow_forward(flow_t * f, unsigned count);
8118+
8119+/* &reiser4_item_data - description of data to be inserted or pasted
8120+
8121+ Q: articulate the reasons for the difference between this and flow.
8122+
8123+ A: Besides flows we insert other things into the tree: stat data, directory
8124+ entries, etc. To insert them into the tree one has to provide this structure. If
8125+ one is going to insert a flow, insert_flow can be used, and then this structure
8126+ does not have to be created
8127+*/
8128+struct reiser4_item_data {
8129+ /* actual data to be inserted. If NULL, ->create_item() will not
8130+ do xmemcpy itself, leaving this up to the caller. This can
8131+ save some amount of unnecessary memory copying, for example,
8132+ during insertion of stat data.
8133+
8134+ */
8135+ char *data;
8136+ /* 1 if 'char * data' contains pointer to user space and 0 if it is
8137+ kernel space */
8138+ int user;
8139+ /* amount of data we are going to insert or paste */
8140+ int length;
8141+ /* "Arg" is opaque data that is passed down to the
8142+ ->create_item() method of node layout, which in turn
8143+ hands it to the ->create_hook() of item being created. This
8144+ arg is currently used by:
8145+
8146+ . ->create_hook() of internal item
8147+ (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8148+ . ->paste() method of directory item.
8149+ . ->create_hook() of extent item
8150+
8151+ For an internal item, this is the left "brother" of the new node
8152+ being inserted; it is used to add the new node into the sibling
8153+ list after the pointer to it was just inserted into the parent.
8154+
8155+ While ->arg does look like a somewhat unnecessary complication,
8156+ it actually saves a lot of headache in many places, because
8157+ all data necessary to insert or paste new data into the tree are
8158+ collected in one place, and this eliminates a lot of extra
8159+ argument passing and storing everywhere.
8160+
8161+ */
8162+ void *arg;
8163+ /* plugin of item we are inserting */
8164+ item_plugin *iplug;
8165+};
8166+
8167+/* __REISER4_COORD_H__ */
8168+#endif
8169+
8170+/* Make Linus happy.
8171+ Local variables:
8172+ c-indentation-style: "K&R"
8173+ mode-name: "LC"
8174+ c-basic-offset: 8
8175+ tab-width: 8
8176+ fill-column: 120
8177+ scroll-step: 1
8178+ End:
8179+*/
8180diff -urN linux-2.6.23.orig/fs/reiser4/debug.c linux-2.6.23/fs/reiser4/debug.c
8181--- linux-2.6.23.orig/fs/reiser4/debug.c 1970-01-01 03:00:00.000000000 +0300
8182+++ linux-2.6.23/fs/reiser4/debug.c 2007-12-04 16:49:30.000000000 +0300
8183@@ -0,0 +1,308 @@
8184+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8185+ * reiser4/README */
8186+
8187+/* Debugging facilities. */
8188+
8189+/*
8190+ * This file contains generic debugging functions used by reiser4. Roughly
8191+ * the following:
8192+ *
8193+ * panicking: reiser4_do_panic(), reiser4_print_prefix().
8194+ *
8195+ * locking:
8196+ * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
8197+ * reiser4_no_counters_are_held(), reiser4_commit_check_locks()
8198+ *
8199+ * error code monitoring (see comment before RETERR macro):
8200+ * reiser4_return_err(), reiser4_report_err().
8201+ *
8202+ * stack back-tracing: fill_backtrace()
8203+ *
8204+ * miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
8205+ * reiser4_debugtrap().
8206+ *
8207+ */
8208+
8209+#include "reiser4.h"
8210+#include "context.h"
8211+#include "super.h"
8212+#include "txnmgr.h"
8213+#include "znode.h"
8214+
8215+#include <linux/sysfs.h>
8216+#include <linux/slab.h>
8217+#include <linux/types.h>
8218+#include <linux/fs.h>
8219+#include <linux/spinlock.h>
8220+#include <linux/kallsyms.h>
8221+#include <linux/vmalloc.h>
8222+#include <linux/ctype.h>
8223+#include <linux/sysctl.h>
8224+#include <linux/hardirq.h>
8225+
8226+#if 0
8227+#if REISER4_DEBUG
8228+static void reiser4_report_err(void);
8229+#else
8230+#define reiser4_report_err() noop
8231+#endif
8232+#endif /* 0 */
8233+
8234+/*
8235+ * global buffer where message given to reiser4_panic is formatted.
8236+ */
8237+static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8238+
8239+/*
8240+ * lock protecting consistency of panic_buf under concurrent panics
8241+ */
8242+static DEFINE_SPINLOCK(panic_guard);
8243+
8244+/* Your best friend. Call it on each occasion. This is called by
8245+ fs/reiser4/debug.h:reiser4_panic(). */
8246+void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
8247+{
8248+ static int in_panic = 0;
8249+ va_list args;
8250+
8251+ /*
8252+ * check for recursive panic.
8253+ */
8254+ if (in_panic == 0) {
8255+ in_panic = 1;
8256+
8257+ spin_lock(&panic_guard);
8258+ va_start(args, format);
8259+ vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8260+ va_end(args);
8261+ printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8262+ spin_unlock(&panic_guard);
8263+
8264+ /*
8265+ * if kernel debugger is configured---drop in. Early dropping
8266+ * into kgdb is not always convenient, because the panic message
8267+ * is not yet printed most of the time. But:
8268+ *
8269+ * (1) message can be extracted from printk_buf[]
8270+ * (declared static inside of printk()), and
8271+ *
8272+ * (2) sometimes serial/kgdb combo dies while printing
8273+ * long panic message, so it's more prudent to break into
8274+ * debugger earlier.
8275+ *
8276+ */
8277+ DEBUGON(1);
8278+ }
8279+ /* to make gcc happy about noreturn attribute */
8280+ panic("%s", panic_buf);
8281+}
8282+
8283+#if 0
8284+void
8285+reiser4_print_prefix(const char *level, int reperr, const char *mid,
8286+ const char *function, const char *file, int lineno)
8287+{
8288+ const char *comm;
8289+ int pid;
8290+
8291+ if (unlikely(in_interrupt() || in_irq())) {
8292+ comm = "interrupt";
8293+ pid = 0;
8294+ } else {
8295+ comm = current->comm;
8296+ pid = current->pid;
8297+ }
8298+ printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8299+ level, comm, pid, function, file, lineno, mid);
8300+ if (reperr)
8301+ reiser4_report_err();
8302+}
8303+#endif /* 0 */
8304+
8305+/* Preemption point: this should be called periodically during long running
8306+ operations (carry, allocate, and squeeze are best examples) */
8307+int reiser4_preempt_point(void)
8308+{
8309+ assert("nikita-3008", reiser4_schedulable());
8310+ cond_resched();
8311+ return signal_pending(current);
8312+}
8313+
8314+#if REISER4_DEBUG
8315+/* Debugging aid: return struct where information about locks taken by current
8316+ thread is accumulated. This can be used to formulate lock ordering
8317+ constraints and various assertions.
8318+
8319+*/
8320+reiser4_lock_cnt_info *reiser4_lock_counters(void)
8321+{
8322+ reiser4_context *ctx = get_current_context();
8323+ assert("jmacd-1123", ctx != NULL);
8324+ return &ctx->locks;
8325+}
8326+
8327+/*
8328+ * print human readable information about locks held by the reiser4 context.
8329+ */
8330+static void print_lock_counters(const char *prefix,
8331+ const reiser4_lock_cnt_info * info)
8332+{
8333+ printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8334+ "jload: %i, "
8335+ "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8336+ "ktxnmgrd: %i, fq: %i\n"
8337+ "inode: %i, "
8338+ "cbk_cache: %i (r:%i,w%i), "
8339+ "eflush: %i, "
8340+ "zlock: %i,\n"
8341+ "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8342+ "d: %i, x: %i, t: %i\n", prefix,
8343+ info->spin_locked_jnode,
8344+ info->rw_locked_tree, info->read_locked_tree,
8345+ info->write_locked_tree,
8346+ info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8347+ info->spin_locked_jload,
8348+ info->spin_locked_txnh,
8349+ info->spin_locked_atom, info->spin_locked_stack,
8350+ info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8351+ info->spin_locked_fq,
8352+ info->spin_locked_inode,
8353+ info->rw_locked_cbk_cache,
8354+ info->read_locked_cbk_cache,
8355+ info->write_locked_cbk_cache,
8356+ info->spin_locked_super_eflush,
8357+ info->spin_locked_zlock,
8358+ info->spin_locked,
8359+ info->long_term_locked_znode,
8360+ info->inode_sem_r, info->inode_sem_w,
8361+ info->d_refs, info->x_refs, info->t_refs);
8362+}
8363+
8364+/* check that no spinlocks are held */
8365+int reiser4_schedulable(void)
8366+{
8367+ if (get_current_context_check() != NULL) {
8368+ if (!LOCK_CNT_NIL(spin_locked)) {
8369+ print_lock_counters("in atomic", reiser4_lock_counters());
8370+ return 0;
8371+ }
8372+ }
8373+ might_sleep();
8374+ return 1;
8375+}
8376+/*
8377+ * return true, iff no locks are held.
8378+ */
8379+int reiser4_no_counters_are_held(void)
8380+{
8381+ reiser4_lock_cnt_info *counters;
8382+
8383+ counters = reiser4_lock_counters();
8384+ return
8385+ (counters->spin_locked_zlock == 0) &&
8386+ (counters->spin_locked_jnode == 0) &&
8387+ (counters->rw_locked_tree == 0) &&
8388+ (counters->read_locked_tree == 0) &&
8389+ (counters->write_locked_tree == 0) &&
8390+ (counters->rw_locked_dk == 0) &&
8391+ (counters->read_locked_dk == 0) &&
8392+ (counters->write_locked_dk == 0) &&
8393+ (counters->spin_locked_txnh == 0) &&
8394+ (counters->spin_locked_atom == 0) &&
8395+ (counters->spin_locked_stack == 0) &&
8396+ (counters->spin_locked_txnmgr == 0) &&
8397+ (counters->spin_locked_inode == 0) &&
8398+ (counters->spin_locked == 0) &&
8399+ (counters->long_term_locked_znode == 0) &&
8400+ (counters->inode_sem_r == 0) &&
8401+ (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8402+}
8403+
8404+/*
8405+ * return true, iff transaction commit can be done under locks held by the
8406+ * current thread.
8407+ */
8408+int reiser4_commit_check_locks(void)
8409+{
8410+ reiser4_lock_cnt_info *counters;
8411+ int inode_sem_r;
8412+ int inode_sem_w;
8413+ int result;
8414+
8415+ /*
8416+ * inode's read/write semaphore is the only reiser4 lock that can be
8417+ * held during commit.
8418+ */
8419+
8420+ counters = reiser4_lock_counters();
8421+ inode_sem_r = counters->inode_sem_r;
8422+ inode_sem_w = counters->inode_sem_w;
8423+
8424+ counters->inode_sem_r = counters->inode_sem_w = 0;
8425+ result = reiser4_no_counters_are_held();
8426+ counters->inode_sem_r = inode_sem_r;
8427+ counters->inode_sem_w = inode_sem_w;
8428+ return result;
8429+}
8430+
8431+/*
8432+ * fill "error site" in the current reiser4 context. See comment before RETERR
8433+ * macro for more details.
8434+ */
8435+void reiser4_return_err(int code, const char *file, int line)
8436+{
8437+ if (code < 0 && is_in_reiser4_context()) {
8438+ reiser4_context *ctx = get_current_context();
8439+
8440+ if (ctx != NULL) {
8441+ ctx->err.code = code;
8442+ ctx->err.file = file;
8443+ ctx->err.line = line;
8444+ }
8445+ }
8446+}
8447+
8448+#if 0
8449+/*
8450+ * report error information recorded by reiser4_return_err().
8451+ */
8452+static void reiser4_report_err(void)
8453+{
8454+ reiser4_context *ctx = get_current_context_check();
8455+
8456+ if (ctx != NULL) {
8457+ if (ctx->err.code != 0) {
8458+ printk("code: %i at %s:%i\n",
8459+ ctx->err.code, ctx->err.file, ctx->err.line);
8460+ }
8461+ }
8462+}
8463+#endif /* 0 */
8464+
8465+#endif /* REISER4_DEBUG */
8466+
8467+#if KERNEL_DEBUGGER
8468+
8469+/*
8470+ * this function just drops into the kernel debugger. It is a convenient
8471+ * place to put a breakpoint in.
8472+ */
8473+void reiser4_debugtrap(void)
8474+{
8475+ /* do nothing. Put break point here. */
8476+#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8477+ extern void breakpoint(void);
8478+ breakpoint();
8479+#endif
8480+}
8481+#endif
8482+
8483+/* Make Linus happy.
8484+ Local variables:
8485+ c-indentation-style: "K&R"
8486+ mode-name: "LC"
8487+ c-basic-offset: 8
8488+ tab-width: 8
8489+ fill-column: 120
8490+ End:
8491+*/
8492diff -urN linux-2.6.23.orig/fs/reiser4/debug.h linux-2.6.23/fs/reiser4/debug.h
8493--- linux-2.6.23.orig/fs/reiser4/debug.h 1970-01-01 03:00:00.000000000 +0300
8494+++ linux-2.6.23/fs/reiser4/debug.h 2007-12-04 20:42:06.138861845 +0300
8495@@ -0,0 +1,350 @@
8496+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8497+
8498+/* Declarations of debug macros. */
8499+
8500+#if !defined( __FS_REISER4_DEBUG_H__ )
8501+#define __FS_REISER4_DEBUG_H__
8502+
8503+#include "forward.h"
8504+#include "reiser4.h"
8505+
8506+/* generic function to produce formatted output, decorating it with
8507+ whatever standard prefixes/postfixes we want. "Fun" is a function
8508+ that will actually be called; it can be printk, panic etc.
8509+ This is for use by other debugging macros, not by users. */
8510+#define DCALL(lev, fun, reperr, label, format, ...) \
8511+({ \
8512+ fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \
8513+ current->comm, current->pid, __FUNCTION__, \
8514+ __FILE__, __LINE__, label, ## __VA_ARGS__); \
8515+})
8516+
8517+/*
8518+ * cause kernel to crash
8519+ */
8520+#define reiser4_panic(mid, format, ...) \
8521+ DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8522+
8523+/* print message with indication of current process, file, line and
8524+ function */
8525+#define reiser4_log(label, format, ...) \
8526+ DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8527+
8528+/* Assertion checked during compilation.
8529+ If "cond" is false (0) we get duplicate case label in switch.
8530+ Use this to check something like famous
8531+ cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8532+ in 3.x journal.c. If cassertion fails you get compiler error,
8533+ so no "maintainer-id".
8534+*/
8535+#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
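+
+/* A minimal illustration: cassert(sizeof(d32) == 4) (d32 is the on-disk
+   32-bit type from dformat.h) compiles only while that type really
+   occupies four bytes; otherwise the duplicate case label stops the
+   build. */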
8536+
8537+#define noop do {;} while(0)
8538+
8539+#if REISER4_DEBUG
8540+/* version of info that only actually prints anything when _d_ebugging
8541+ is on */
8542+#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
8543+/* macro to catch logical errors. Put it into `default' clause of
8544+ switch() statement. */
8545+#define impossible(label, format, ...) \
8546+ reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
8547+/* assert assures that @cond is true. If it is not, reiser4_panic() is
8548+ called. Use this for checking logical consistency and _never_ call
8549+ this to check correctness of external data: disk blocks and user input. */
8550+#define assert(label, cond) \
8551+({ \
8552+ /* call_on_each_assert(); */ \
8553+ if (cond) { \
8554+ /* put negated check to avoid using !(cond) that would lose \
8555+ * warnings for things like assert(a = b); */ \
8556+ ; \
8557+ } else { \
8558+ DEBUGON(1); \
8559+ reiser4_panic(label, "assertion failed: %s", #cond); \
8560+ } \
8561+})
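+
+/* By convention each assertion carries a unique "maintainer-id" label,
+ * e.g. assert("nikita-3008", reiser4_schedulable()) in debug.c, so a
+ * panic message identifies exactly which check fired. */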
8562+
8563+/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
8564+#define check_me( label, expr ) assert( label, ( expr ) )
8565+
8566+#define ON_DEBUG( exp ) exp
8567+
8568+extern int reiser4_schedulable(void);
8569+extern void call_on_each_assert(void);
8570+
8571+#else
8572+
8573+#define dinfo( format, args... ) noop
8574+#define impossible( label, format, args... ) noop
8575+#define assert( label, cond ) noop
8576+#define check_me( label, expr ) ( ( void ) ( expr ) )
8577+#define ON_DEBUG( exp )
8578+#define reiser4_schedulable() might_sleep()
8579+
8580+/* REISER4_DEBUG */
8581+#endif
8582+
8583+#if REISER4_DEBUG
8584+/* per-thread information about lock acquired by this thread. Used by lock
8585+ * ordering checking in spin_macros.h */
8586+typedef struct reiser4_lock_cnt_info {
8587+ int rw_locked_tree;
8588+ int read_locked_tree;
8589+ int write_locked_tree;
8590+
8591+ int rw_locked_dk;
8592+ int read_locked_dk;
8593+ int write_locked_dk;
8594+
8595+ int rw_locked_cbk_cache;
8596+ int read_locked_cbk_cache;
8597+ int write_locked_cbk_cache;
8598+
8599+ int spin_locked_zlock;
8600+ int spin_locked_jnode;
8601+ int spin_locked_jload;
8602+ int spin_locked_txnh;
8603+ int spin_locked_atom;
8604+ int spin_locked_stack;
8605+ int spin_locked_txnmgr;
8606+ int spin_locked_ktxnmgrd;
8607+ int spin_locked_fq;
8608+ int spin_locked_inode;
8609+ int spin_locked_super_eflush;
8610+ int spin_locked;
8611+ int long_term_locked_znode;
8612+
8613+ int inode_sem_r;
8614+ int inode_sem_w;
8615+
8616+ int d_refs;
8617+ int x_refs;
8618+ int t_refs;
8619+} reiser4_lock_cnt_info;
8620+
8621+extern struct reiser4_lock_cnt_info *reiser4_lock_counters(void);
8622+#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
8623+
8624+/* increment lock-counter @counter, if present */
8625+#define LOCK_CNT_INC(counter) \
8626+ IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
8627+
8628+/* decrement lock-counter @counter, if present */
8629+#define LOCK_CNT_DEC(counter) \
8630+ IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
8631+
8632+/* check that lock-counter is zero. This is for use in assertions */
8633+#define LOCK_CNT_NIL(counter) \
8634+ IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
8635+
8636+/* check that lock-counter is greater than zero. This is for use in
8637+ * assertions */
8638+#define LOCK_CNT_GTZ(counter) \
8639+ IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
8640+#define LOCK_CNT_LT(counter,n) \
8641+ IN_CONTEXT(reiser4_lock_counters()->counter < n, 1)
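+
+/*
+ * Sketch of the intended use (the real wrappers live in spin_macros.h):
+ * a lock-taking helper does LOCK_CNT_INC(spin_locked) right after
+ * acquiring a spinlock and LOCK_CNT_DEC(spin_locked) right before
+ * releasing it; assertions such as LOCK_CNT_NIL(spin_locked) (see
+ * reiser4_schedulable() in debug.c) then verify that the caller holds
+ * no spinlocks.
+ */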
8642+
8643+#else /* REISER4_DEBUG */
8644+
8645+/* no-op versions of the above */
8646+
8647+typedef struct reiser4_lock_cnt_info {
8648+} reiser4_lock_cnt_info;
8649+
8650+#define reiser4_lock_counters() ((reiser4_lock_cnt_info *)NULL)
8651+#define LOCK_CNT_INC(counter) noop
8652+#define LOCK_CNT_DEC(counter) noop
8653+#define LOCK_CNT_NIL(counter) (1)
8654+#define LOCK_CNT_GTZ(counter) (1)
8655+#define LOCK_CNT_LT(counter,n) (1)
8656+
8657+#endif /* REISER4_DEBUG */
8658+
8659+#define assert_spin_not_locked(lock) BUG_ON(0)
8660+#define assert_rw_write_locked(lock) BUG_ON(0)
8661+#define assert_rw_read_locked(lock) BUG_ON(0)
8662+#define assert_rw_locked(lock) BUG_ON(0)
8663+#define assert_rw_not_write_locked(lock) BUG_ON(0)
8664+#define assert_rw_not_read_locked(lock) BUG_ON(0)
8665+#define assert_rw_not_locked(lock) BUG_ON(0)
8666+
8667+/* flags controlling debugging behavior. Are set through debug_flags=N mount
8668+ option. */
8669+typedef enum {
8670+ /* print a lot of information during panic. When this is on all jnodes
8671+ * are listed. This can be *very* large output. Usually you don't want
8672+ * this. Especially over serial line. */
8673+ REISER4_VERBOSE_PANIC = 0x00000001,
8674+ /* print a lot of information during umount */
8675+ REISER4_VERBOSE_UMOUNT = 0x00000002,
8676+ /* print gathered statistics on umount */
8677+ REISER4_STATS_ON_UMOUNT = 0x00000004,
8678+ /* check node consistency */
8679+ REISER4_CHECK_NODE = 0x00000008
8680+} reiser4_debug_flags;
8681+
8682+extern int is_in_reiser4_context(void);
8683+
8684+/*
8685+ * evaluate expression @e only if within reiser4 context
8686+ */
8687+#define ON_CONTEXT(e) do { \
8688+ if(is_in_reiser4_context()) { \
8689+ e; \
8690+ } } while(0)
8691+
8692+/*
8693+ * evaluate expression @e only when within reiser4_context and debugging is
8694+ * on.
8695+ */
8696+#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
8697+
8698+/*
8699+ * complain about unexpected function result and crash. Used in "default"
8700+ * branches of switch statements and alike to assert that invalid results are
8701+ * not silently ignored.
8702+ */
8703+#define wrong_return_value( label, function ) \
8704+ impossible( label, "wrong return value from " function )
8705+
8706+/* Issue different types of reiser4 messages to the console */
8707+#define warning( label, format, ... ) \
8708+ DCALL( KERN_WARNING, \
8709+ printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
8710+#define notice( label, format, ... ) \
8711+ DCALL( KERN_NOTICE, \
8712+ printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ )
8713+
8714+/* mark not yet implemented functionality */
8715+#define not_yet( label, format, ... ) \
8716+ reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
8717+
8718+extern void reiser4_do_panic(const char *format, ...)
8719+ __attribute__ ((noreturn, format(printf, 1, 2)));
8720+
8721+extern int reiser4_preempt_point(void);
8722+extern void reiser4_print_stats(void);
8723+
8724+#if REISER4_DEBUG
8725+extern int reiser4_no_counters_are_held(void);
8726+extern int reiser4_commit_check_locks(void);
8727+#else
8728+#define reiser4_no_counters_are_held() (1)
8729+#define reiser4_commit_check_locks() (1)
8730+#endif
8731+
8732+/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
8733+#define IS_POW(i) \
8734+({ \
8735+ typeof(i) __i; \
8736+ \
8737+ __i = (i); \
8738+ !(__i & (__i - 1)); \
8739+})
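+
+/* For example, IS_POW(8) is 1 because 8 & 7 == 0, while IS_POW(6) is 0
+   because 6 & 5 == 4. Note that IS_POW(0) also evaluates to 1. */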
8740+
8741+#define KERNEL_DEBUGGER (1)
8742+
8743+#if KERNEL_DEBUGGER
8744+
8745+extern void reiser4_debugtrap(void);
8746+
8747+/*
8748+ * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
8749+ * kgdb is not compiled in, do nothing.
8750+ */
8751+#define DEBUGON(cond) \
8752+({ \
8753+ if (unlikely(cond)) \
8754+ reiser4_debugtrap(); \
8755+})
8756+#else
8757+#define DEBUGON(cond) noop
8758+#endif
8759+
8760+/*
8761+ * Error code tracing facility. (Idea is borrowed from XFS code.)
8762+ *
8763+ * Suppose some strange and/or unexpected code is returned from some function
8764+ * (for example, write(2) returns -EEXIST). It is possible to place a
8765+ * breakpoint in the reiser4_write(), but it is too late here. How to find out
8766+ * in what particular place -EEXIST was generated first?
8767+ *
8768+ * In reiser4 all places where actual error codes are produced (that is,
8769+ * statements of the form
8770+ *
8771+ * return -EFOO; // (1), or
8772+ *
8773+ * result = -EFOO; // (2)
8774+ *
8775+ * are replaced with
8776+ *
8777+ * return RETERR(-EFOO); // (1a), and
8778+ *
8779+ * result = RETERR(-EFOO); // (2a) respectively
8780+ *
8781+ * RETERR() macro fills an "error site" in reiser4_context. This information
8782+ * is printed in error and warning messages. Moreover, it's possible to put a
8783+ * conditional breakpoint in reiser4_return_err (low-level function called
8784+ * by RETERR() to do the actual work) to break into debugger immediately
8785+ * when particular error happens.
8786+ *
8787+ */
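+
+/*
+ * Illustrative before/after (the function body is hypothetical):
+ *
+ *	if (size > limit)
+ *		return -EFBIG;		// origin of -EFBIG is lost
+ *
+ * becomes
+ *
+ *	if (size > limit)
+ *		return RETERR(-EFBIG);	// records __FILE__/__LINE__
+ *
+ * so reiser4_return_err() stores the error site in the current context,
+ * and a breakpoint there can be made conditional on code == -EFBIG.
+ */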
8788+
8789+#if REISER4_DEBUG
8790+
8791+/*
8792+ * data-type to store information about where error happened ("error site").
8793+ */
8794+typedef struct err_site {
8795+ int code; /* error code */
8796+ const char *file; /* source file, filled by __FILE__ */
8797+ int line; /* source file line, filled by __LINE__ */
8798+} err_site;
8799+
8800+extern void reiser4_return_err(int code, const char *file, int line);
8801+
8802+/*
8803+ * fill &get_current_context()->err_site with error information.
8804+ */
8805+#define RETERR(code) \
8806+({ \
8807+ typeof(code) __code; \
8808+ \
8809+ __code = (code); \
8810+ reiser4_return_err(__code, __FILE__, __LINE__); \
8811+ __code; \
8812+})
8813+
8814+#else
8815+
8816+/*
8817+ * no-op versions of the above
8818+ */
8819+
8820+typedef struct err_site {
8821+} err_site;
8822+#define RETERR(code) code
8823+#endif
8824+
8825+#if REISER4_LARGE_KEY
8826+/*
8827+ * conditionally compile arguments only if REISER4_LARGE_KEY is on.
8828+ */
8829+#define ON_LARGE_KEY(...) __VA_ARGS__
8830+#else
8831+#define ON_LARGE_KEY(...)
8832+#endif
8833+
8834+/* __FS_REISER4_DEBUG_H__ */
8835+#endif
8836+
8837+/* Make Linus happy.
8838+ Local variables:
8839+ c-indentation-style: "K&R"
8840+ mode-name: "LC"
8841+ c-basic-offset: 8
8842+ tab-width: 8
8843+ fill-column: 120
8844+ End:
8845+*/
8846diff -urN linux-2.6.23.orig/fs/reiser4/dformat.h linux-2.6.23/fs/reiser4/dformat.h
8847--- linux-2.6.23.orig/fs/reiser4/dformat.h 1970-01-01 03:00:00.000000000 +0300
8848+++ linux-2.6.23/fs/reiser4/dformat.h 2007-12-04 16:49:30.000000000 +0300
8849@@ -0,0 +1,70 @@
8850+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8851+
8852+/* Formats of on-disk data and conversion functions. */
8853+
8854+/* put all item formats in the files describing the particular items,
8855+ our model is, everything you need to do to add an item to reiser4,
8856+ (excepting the changes to the plugin that uses the item which go
8857+ into the file defining that plugin), you put into one file. */
8858+/* Data on disk are stored in little-endian format.
8859+ To declare fields of on-disk structures, use d8, d16, d32 and d64.
8860+ d??tocpu() and cputod??() to convert. */
8861+
8862+#if !defined( __FS_REISER4_DFORMAT_H__ )
8863+#define __FS_REISER4_DFORMAT_H__
8864+
8865+#include <asm/byteorder.h>
8866+#include <asm/unaligned.h>
8867+#include <linux/types.h>
8868+
8869+typedef __u8 d8;
8870+typedef __le16 d16;
8871+typedef __le32 d32;
8872+typedef __le64 d64;
8873+
8874+#define PACKED __attribute__((packed))
8875+
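+/*
+ * Illustrative only: a hypothetical on-disk structure using these types
+ * would be declared and accessed like
+ *
+ *	struct demo_disk {
+ *		d16 flags;
+ *		d64 size;
+ *	} PACKED;
+ *
+ *	put_unaligned(cpu_to_le64(size), &d->size);
+ *	size = le64_to_cpu(get_unaligned(&d->size));
+ *
+ * i.e. the standard cpu_to_le??()/le??_to_cpu() helpers convert between
+ * host and on-disk byte order.
+ */
+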
8876+/* data-type for block number */
8877+typedef __u64 reiser4_block_nr;
8878+
8879+/* data-type for block number on disk, disk format */
8880+typedef __le64 reiser4_dblock_nr;
8881+
8882+/**
8883+ * disk_addr_eq - compare disk addresses
8884+ * @b1: pointer to block number to compare
8885+ * @b2: pointer to block number to compare
8886+ *
8887+ * Returns true if disk addresses are the same
8888+ */
8889+static inline int disk_addr_eq(const reiser4_block_nr *b1,
8890+ const reiser4_block_nr * b2)
8891+{
8892+ assert("nikita-1033", b1 != NULL);
8893+ assert("nikita-1266", b2 != NULL);
8894+
8895+ return !memcmp(b1, b2, sizeof *b1);
8896+}
8897+
8898+/* structure of master reiser4 super block */
8899+typedef struct reiser4_master_sb {
8900+ char magic[16]; /* "ReIsEr4" */
8901+ __le16 disk_plugin_id; /* id of disk layout plugin */
8902+ __le16 blocksize;
8903+ char uuid[16]; /* unique id */
8904+ char label[16]; /* filesystem label */
8905+ __le64 diskmap; /* location of the diskmap. 0 if not present */
8906+} reiser4_master_sb;
8907+
8908+/* __FS_REISER4_DFORMAT_H__ */
8909+#endif
8910+
8911+/*
8912+ * Local variables:
8913+ * c-indentation-style: "K&R"
8914+ * mode-name: "LC"
8915+ * c-basic-offset: 8
8916+ * tab-width: 8
8917+ * fill-column: 79
8918+ * End:
8919+ */
8920diff -urN linux-2.6.23.orig/fs/reiser4/dscale.c linux-2.6.23/fs/reiser4/dscale.c
8921--- linux-2.6.23.orig/fs/reiser4/dscale.c 1970-01-01 03:00:00.000000000 +0300
8922+++ linux-2.6.23/fs/reiser4/dscale.c 2007-12-04 22:59:05.786366833 +0300
8923@@ -0,0 +1,174 @@
8924+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8925+ * reiser4/README */
8926+
8927+/* Scalable on-disk integers */
8928+
8929+/*
8930+ * Various on-disk structures contain integer-like structures. Stat-data
8931+ * contain [yes, "data" is plural, check the dictionary] file size, link
8932+ * count; extent unit contains extent width etc. To accommodate the general
8933+ * case, enough space is reserved to keep the largest possible value: 64 bits
8934+ * in all cases above. But in the overwhelming majority of cases the numbers
8935+ * actually stored in these fields are comparatively small, and reserving 8 bytes is
8936+ * a waste of precious disk bandwidth.
8937+ *
8938+ * Scalable integers are one way to solve this problem. dscale_write()
8939+ * function stores __u64 value in the given area consuming from 1 to 9 bytes,
8940+ * depending on the magnitude of the value supplied. dscale_read() reads value
8941+ * previously stored by dscale_write().
8942+ *
8943+ * dscale_write() produces a format not completely unlike UTF-8: the two highest
8944+ * bits of the first byte are used to store "tag". One of 4 possible tag
8945+ * values is chosen depending on the number being encoded:
8946+ *
8947+ * 0 ... 0x3f => 0 [table 1]
8948+ * 0x40 ... 0x3fff => 1
8949+ * 0x4000 ... 0x3fffffff => 2
8950+ * 0x40000000 ... 0xffffffffffffffff => 3
8951+ *
8952+ * (see dscale_range() function)
8953+ *
8954+ * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
8955+ * to be stored, so in this case there is no place in the first byte to store
8956+ * tag. For such values tag is stored in an extra 9th byte.
8957+ *
8958+ * As _highest_ bits are used for the test (which is natural) scaled integers
8959+ * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
8960+ * uses LITTLE-ENDIAN.
8961+ *
8962+ */
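+
+/*
+ * A worked example of the encoding above: 1000 == 0x3e8 lies in the
+ * range 0x40 ... 0x3fff, so its tag is 1 and it takes 1 << 1 == 2
+ * bytes. dscale_write() stores 0x03 0xe8 (big-endian) and ORs tag 1
+ * into the two highest bits of the first byte, giving 0x43 0xe8.
+ * dscale_read() sees tag 1, decodes the two bytes as 0x43e8, and
+ * cleartag() clears the two bits at offset (2 * 8) - 2 == 14,
+ * recovering 0x3e8.
+ */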
8963+
8964+#include "debug.h"
8965+#include "dscale.h"
8966+
8967+/* return tag of scaled integer stored at @address */
8968+static int gettag(const unsigned char *address)
8969+{
8970+ /* tag is stored in two highest bits */
8971+ return (*address) >> 6;
8972+}
8973+
8974+/* clear the tag embedded into @value */
8975+static void cleartag(__u64 * value, int tag)
8976+{
8977+ /*
8978+ * W-w-what ?!
8979+ *
8980+ * Actually, this is rather simple: @value passed here was read by
8981+ * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
8982+ * zeroes. Tag is still stored in the highest (arithmetically)
8983+ * non-zero bits of @value, but relative position of tag within __u64
8984+ * depends on @tag.
8985+ *
8986+ * For example if @tag is 0, it's stored 2 highest bits of lowest
8987+ * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits.
8988+ *
8989+ * If tag is 1, it's stored in two highest bits of 2nd lowest byte,
8990+ * and its offset is (2 * 8) - 2 == 14 bits.
8991+ *
8992+ * See table 1 above for details.
8993+ *
8994+ * All these cases are captured by the formula:
8995+ */
8996+ *value &= ~((__u64)3 << (((1 << tag) << 3) - 2));
8997+ /*
8998+ * That is, clear the two (3 == binary 11) bits at the offset
8999+ *
9000+ * 8 * (2 ^ tag) - 2,
9001+ *
9002+ * that is, two highest bits of (2 ^ tag)-th byte of @value.
9003+ */
9004+}
9005+
9006+/* return tag for @value. See table 1 above for details. */
9007+static int dscale_range(__u64 value)
9008+{
9009+ if (value > 0x3fffffff)
9010+ return 3;
9011+ if (value > 0x3fff)
9012+ return 2;
9013+ if (value > 0x3f)
9014+ return 1;
9015+ return 0;
9016+}
9017+
9018+/* restore value stored at @address by dscale_write() and return number of
9019+ * bytes consumed */
9020+int dscale_read(unsigned char *address, __u64 * value)
9021+{
9022+ int tag;
9023+
9024+ /* read tag */
9025+ tag = gettag(address);
9026+ switch (tag) {
9027+ case 3:
9028+ /* In this case tag is stored in an extra byte, skip this byte
9029+ * and decode value stored in the next 8 bytes.*/
9030+ *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
9031+ /* worst case: 8 bytes for value itself plus one byte for
9032+ * tag. */
9033+ return 9;
9034+ case 0:
9035+ *value = get_unaligned(address);
9036+ break;
9037+ case 1:
9038+ *value = __be16_to_cpu(get_unaligned((__be16 *)address));
9039+ break;
9040+ case 2:
9041+ *value = __be32_to_cpu(get_unaligned((__be32 *)address));
9042+ break;
9043+ default:
9044+ return RETERR(-EIO);
9045+ }
9046+ /* clear tag embedded into @value */
9047+ cleartag(value, tag);
9048+ /* number of bytes consumed is (2 ^ tag)---see table 1. */
9049+ return 1 << tag;
9050+}
9051+
9052+/* store @value at @address and return number of bytes consumed */
9053+int dscale_write(unsigned char *address, __u64 value)
9054+{
9055+ int tag;
9056+ int shift;
9057+ __be64 v;
9058+ unsigned char *valarr;
9059+
9060+ tag = dscale_range(value);
9061+ v = __cpu_to_be64(value);
9062+ valarr = (unsigned char *)&v;
9063+ shift = (tag == 3) ? 1 : 0;
9064+ memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
9065+ *address |= (tag << 6);
9066+ return shift + (1 << tag);
9067+}
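+
+/*
+ * A minimal usage sketch (illustrative; "buf", "v" and "w" are
+ * hypothetical caller-side names):
+ *
+ *	unsigned char buf[9];	// 9 bytes covers the worst case (tag 3)
+ *	__u64 v = 1000, w;
+ *	int len = dscale_write(buf, v);	// len == 2 for this value
+ *	dscale_read(buf, &w);		// w == 1000 again
+ */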
9068+
9069+/* number of bytes required to store @value */
9070+int dscale_bytes(__u64 value)
9071+{
9072+ int bytes;
9073+
9074+ bytes = 1 << dscale_range(value);
9075+ if (bytes == 8)
9076+ ++bytes;
9077+ return bytes;
9078+}
9079+
9080+/* returns true if @value and @other require the same number of bytes to be
9081+ * stored. Used to detect when a data structure (like stat-data) has to be
9082+ * expanded or contracted. */
9083+int dscale_fit(__u64 value, __u64 other)
9084+{
9085+ return dscale_range(value) == dscale_range(other);
9086+}
9087+
9088+/* Make Linus happy.
9089+ Local variables:
9090+ c-indentation-style: "K&R"
9091+ mode-name: "LC"
9092+ c-basic-offset: 8
9093+ tab-width: 8
9094+ fill-column: 120
9095+ scroll-step: 1
9096+ End:
9097+*/
9098diff -urN linux-2.6.23.orig/fs/reiser4/dscale.h linux-2.6.23/fs/reiser4/dscale.h
9099--- linux-2.6.23.orig/fs/reiser4/dscale.h 1970-01-01 03:00:00.000000000 +0300
9100+++ linux-2.6.23/fs/reiser4/dscale.h 2007-12-04 22:59:05.790367863 +0300
9101@@ -0,0 +1,27 @@
9102+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9103+ * reiser4/README */
9104+
9105+/* Scalable on-disk integers. See dscale.c for details. */
9106+
9107+#if !defined( __FS_REISER4_DSCALE_H__ )
9108+#define __FS_REISER4_DSCALE_H__
9109+
9110+#include "dformat.h"
9111+
9112+extern int dscale_read(unsigned char *address, __u64 * value);
9113+extern int dscale_write(unsigned char *address, __u64 value);
9114+extern int dscale_bytes(__u64 value);
9115+extern int dscale_fit(__u64 value, __u64 other);
9116+
9117+/* __FS_REISER4_DSCALE_H__ */
9118+#endif
9119+
9120+/* Make Linus happy.
9121+ Local variables:
9122+ c-indentation-style: "K&R"
9123+ mode-name: "LC"
9124+ c-basic-offset: 8
9125+ tab-width: 8
9126+ fill-column: 120
9127+ End:
9128+*/
9129diff -urN linux-2.6.23.orig/fs/reiser4/entd.c linux-2.6.23/fs/reiser4/entd.c
9130--- linux-2.6.23.orig/fs/reiser4/entd.c 1970-01-01 03:00:00.000000000 +0300
9131+++ linux-2.6.23/fs/reiser4/entd.c 2007-12-04 16:49:30.000000000 +0300
9132@@ -0,0 +1,335 @@
9133+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9134+ * reiser4/README */
9135+
9136+/* Ent daemon. */
9137+
9138+#include "debug.h"
9139+#include "txnmgr.h"
9140+#include "tree.h"
9141+#include "entd.h"
9142+#include "super.h"
9143+#include "context.h"
9144+#include "reiser4.h"
9145+#include "vfs_ops.h"
9146+#include "page_cache.h"
9147+#include "inode.h"
9148+
9149+#include <linux/sched.h> /* struct task_struct */
9150+#include <linux/suspend.h>
9151+#include <linux/kernel.h>
9152+#include <linux/writeback.h>
9153+#include <linux/time.h> /* INITIAL_JIFFIES */
9154+#include <linux/backing-dev.h> /* bdi_write_congested */
9155+#include <linux/wait.h>
9156+#include <linux/kthread.h>
9157+#include <linux/freezer.h>
9158+
9159+#define DEF_PRIORITY 12
9160+#define MAX_ENTD_ITERS 10
9161+
9162+static void entd_flush(struct super_block *, struct wbq *);
9163+static int entd(void *arg);
9164+
9165+/*
9166+ * set ->comm field of the ent thread to make its state visible to the user level
9167+ */
9168+#define entd_set_comm(state) \
9169+ snprintf(current->comm, sizeof(current->comm), \
9170+ "ent:%s%s", super->s_id, (state))
9171+
9172+/**
9173+ * reiser4_init_entd - initialize entd context and start kernel daemon
9174+ * @super: super block to start ent thread for
9175+ *
9176+ * Creates entd contexts, starts kernel thread and waits until it
9177+ * initializes.
9178+ */
9179+int reiser4_init_entd(struct super_block *super)
9180+{
9181+ entd_context *ctx;
9182+
9183+ assert("nikita-3104", super != NULL);
9184+
9185+ ctx = get_entd_context(super);
9186+
9187+ memset(ctx, 0, sizeof *ctx);
9188+ spin_lock_init(&ctx->guard);
9189+ init_waitqueue_head(&ctx->wait);
9190+#if REISER4_DEBUG
9191+ INIT_LIST_HEAD(&ctx->flushers_list);
9192+#endif
9193+ /* lists of writepage requests */
9194+ INIT_LIST_HEAD(&ctx->todo_list);
9195+ INIT_LIST_HEAD(&ctx->done_list);
9196+ /* start entd */
9197+ ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9198+ if (IS_ERR(ctx->tsk))
9199+ return PTR_ERR(ctx->tsk);
9200+ return 0;
9201+}
9202+
9203+static void put_wbq(struct wbq *rq)
9204+{
9205+ iput(rq->mapping->host);
9206+ complete(&rq->completion);
9207+}
9208+
9209+/* ent should be locked */
9210+static struct wbq *__get_wbq(entd_context * ent)
9211+{
9212+ struct wbq *wbq;
9213+
9214+ if (list_empty(&ent->todo_list))
9215+ return NULL;
9216+
9217+ ent->nr_todo_reqs --;
9218+ wbq = list_entry(ent->todo_list.next, struct wbq, link);
9219+ list_del_init(&wbq->link);
9220+ return wbq;
9221+}
9222+
9223+/* ent thread function */
9224+static int entd(void *arg)
9225+{
9226+ struct super_block *super;
9227+ entd_context *ent;
9228+ int done = 0;
9229+
9230+ super = arg;
9231+ /* do_fork() just copies task_struct into the new
9232+ thread. ->journal_info shouldn't be copied of course. This shouldn't
9233+ be a problem for the rest of the code though.
9234+ */
9235+ current->journal_info = NULL;
9236+
9237+ ent = get_entd_context(super);
9238+
9239+ while (!done) {
9240+ try_to_freeze();
9241+
9242+ spin_lock(&ent->guard);
9243+ while (ent->nr_todo_reqs != 0) {
9244+ struct wbq *rq;
9245+
9246+ assert("", list_empty(&ent->done_list));
9247+
9248+ /* take request from the queue head */
9249+ rq = __get_wbq(ent);
9250+ assert("", rq != NULL);
9251+ ent->cur_request = rq;
9252+ spin_unlock(&ent->guard);
9253+
9254+ entd_set_comm("!");
9255+ entd_flush(super, rq);
9256+
9257+ put_wbq(rq);
9258+
9259+ /*
9260+ * wake up all requestors and iput their inodes
9261+ */
9262+ spin_lock(&ent->guard);
9263+ while (!list_empty(&ent->done_list)) {
9264+ rq = list_entry(ent->done_list.next, struct wbq, link);
9265+ list_del_init(&rq->link);
9266+ ent->nr_done_reqs --;
9267+ spin_unlock(&ent->guard);
9268+ assert("", rq->written == 1);
9269+ put_wbq(rq);
9270+ spin_lock(&ent->guard);
9271+ }
9272+ }
9273+ spin_unlock(&ent->guard);
9274+
9275+ entd_set_comm(".");
9276+
9277+ {
9278+ DEFINE_WAIT(__wait);
9279+
9280+ do {
9281+ prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9282+ if (kthread_should_stop()) {
9283+ done = 1;
9284+ break;
9285+ }
9286+ if (ent->nr_todo_reqs != 0)
9287+ break;
9288+ schedule();
9289+ } while (0);
9290+ finish_wait(&ent->wait, &__wait);
9291+ }
9292+ }
9293+ BUG_ON(ent->nr_todo_reqs != 0);
9294+ return 0;
9295+}
9296+
9297+/**
9298+ * reiser4_done_entd - stop entd kernel thread
9299+ * @super: super block to stop ent thread for
9300+ *
9301+ * It is called on umount. Sends a stop signal to entd and waits until it
9302+ * handles it.
9303+ */
9304+void reiser4_done_entd(struct super_block *super)
9305+{
9306+ entd_context *ent;
9307+
9308+ assert("nikita-3103", super != NULL);
9309+
9310+ ent = get_entd_context(super);
9311+ assert("zam-1055", ent->tsk != NULL);
9312+ kthread_stop(ent->tsk);
9313+}
9314+
9315+/* called at the beginning of jnode_flush to register flusher thread with ent
9316+ * daemon */
9317+void reiser4_enter_flush(struct super_block *super)
9318+{
9319+ entd_context *ent;
9320+
9321+ assert("zam-1029", super != NULL);
9322+ ent = get_entd_context(super);
9323+
9324+ assert("zam-1030", ent != NULL);
9325+
9326+ spin_lock(&ent->guard);
9327+ ent->flushers++;
9328+#if REISER4_DEBUG
9329+ list_add(&get_current_context()->flushers_link, &ent->flushers_list);
9330+#endif
9331+ spin_unlock(&ent->guard);
9332+}
9333+
9334+/* called at the end of jnode_flush */
9335+void reiser4_leave_flush(struct super_block *super)
9336+{
9337+ entd_context *ent;
9338+ int wake_up_ent;
9339+
9340+ assert("zam-1027", super != NULL);
9341+ ent = get_entd_context(super);
9342+
9343+ assert("zam-1028", ent != NULL);
9344+
9345+ spin_lock(&ent->guard);
9346+ ent->flushers--;
9347+ wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
9348+#if REISER4_DEBUG
9349+ list_del_init(&get_current_context()->flushers_link);
9350+#endif
9351+ spin_unlock(&ent->guard);
9352+ if (wake_up_ent)
9353+ wake_up(&ent->wait);
9354+}
9355+
9356+#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9357+
9358+static void entd_flush(struct super_block *super, struct wbq *rq)
9359+{
9360+ reiser4_context ctx;
9361+ int tmp;
9362+
9363+ init_stack_context(&ctx, super);
9364+ ctx.entd = 1;
9365+ ctx.gfp_mask = GFP_NOFS;
9366+
9367+ rq->wbc->range_start = page_offset(rq->page);
9368+ rq->wbc->range_end = rq->wbc->range_start +
9369+ (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
9370+ tmp = rq->wbc->nr_to_write;
9371+ rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9372+
9373+ if (rq->wbc->nr_to_write > 0) {
9374+ rq->wbc->range_start = 0;
9375+ rq->wbc->range_end = LLONG_MAX;
9376+ generic_sync_sb_inodes(super, rq->wbc);
9377+ }
9378+ rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9379+ reiser4_writeout(super, rq->wbc);
9380+
9381+ context_set_commit_async(&ctx);
9382+ reiser4_exit_context(&ctx);
9383+}
9384+
9385+/**
9386+ * write_page_by_ent - ask entd thread to flush this page as part of slum
9387+ * @page: page to be written
9388+ * @wbc: writeback control passed to reiser4_writepage
9389+ *
9390+ * Creates a request, puts it on the entd list of requests, wakes entd up if
9391+ * necessary, and waits until entd completes the request.
9392+ */
9393+int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9394+{
9395+ struct super_block *sb;
9396+ struct inode *inode;
9397+ entd_context *ent;
9398+ struct wbq rq;
9399+
9400+ assert("", PageLocked(page));
9401+ assert("", page->mapping != NULL);
9402+
9403+ sb = page->mapping->host->i_sb;
9404+ ent = get_entd_context(sb);
9405+ assert("", ent && ent->done == 0);
9406+
9407+ /*
9408+ * we are going to unlock page and ask ent thread to write the
9409+ * page. Re-dirty page before unlocking so that if ent thread fails to
9410+ * write it - it will remain dirty
9411+ */
9412+ reiser4_set_page_dirty_internal(page);
9413+
9414+ /*
9415+ * pin inode in memory, unlock page, entd_flush will iput. We cannot
9416+ * iput here because we cannot allow delete_inode to be called here
9417+ */
9418+ inode = igrab(page->mapping->host);
9419+ unlock_page(page);
9420+ if (inode == NULL)
9421+ /* inode is getting freed */
9422+ return 0;
9423+
9424+ /* init wbq */
9425+ INIT_LIST_HEAD(&rq.link);
9426+ rq.magic = WBQ_MAGIC;
9427+ rq.wbc = wbc;
9428+ rq.page = page;
9429+ rq.mapping = inode->i_mapping;
9430+ rq.node = NULL;
9431+ rq.written = 0;
9432+ init_completion(&rq.completion);
9433+
9434+ /* add request to entd's list of writepage requests */
9435+ spin_lock(&ent->guard);
9436+ ent->nr_todo_reqs++;
9437+ list_add_tail(&rq.link, &ent->todo_list);
9438+ if (ent->nr_todo_reqs == 1)
9439+ wake_up(&ent->wait);
9440+
9441+ spin_unlock(&ent->guard);
9442+
9443+ /* wait until entd finishes */
9444+ wait_for_completion(&rq.completion);
9445+
9446+ /* whether or not the ent thread eventually wrote the page to disk
9447+ * (rq.written), it was re-dirtied above, so report success */
9448+ return 0;
9450+}
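+
+/*
+ * To recap the request life cycle sketched above: reiser4_writepage
+ * hands a locked dirty page to write_page_by_ent(), which re-dirties
+ * it, queues a struct wbq on todo_list and blocks on rq.completion;
+ * the ent thread picks the request up in entd(), writes out the slum
+ * via entd_flush(), and put_wbq() then completes the waiter and drops
+ * the pinned inode.
+ */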
9451+
9452+int wbq_available(void)
9453+{
9454+ struct super_block *sb = reiser4_get_current_sb();
9455+ entd_context *ent = get_entd_context(sb);
9456+ return ent->nr_todo_reqs;
9457+}
9458+
9459+/*
9460+ * Local variables:
9461+ * c-indentation-style: "K&R"
9462+ * mode-name: "LC"
9463+ * c-basic-offset: 8
9464+ * tab-width: 8
9465+ * fill-column: 79
9466+ * End:
9467+ */
9468diff -urN linux-2.6.23.orig/fs/reiser4/entd.h linux-2.6.23/fs/reiser4/entd.h
9469--- linux-2.6.23.orig/fs/reiser4/entd.h 1970-01-01 03:00:00.000000000 +0300
9470+++ linux-2.6.23/fs/reiser4/entd.h 2007-12-04 16:49:30.000000000 +0300
9471@@ -0,0 +1,90 @@
9472+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9473+
9474+/* Ent daemon. */
9475+
9476+#ifndef __ENTD_H__
9477+#define __ENTD_H__
9478+
9479+#include "context.h"
9480+
9481+#include <linux/fs.h>
9482+#include <linux/completion.h>
9483+#include <linux/wait.h>
9484+#include <linux/spinlock.h>
9485+#include <linux/sched.h> /* for struct task_struct */
9486+
9487+#define WBQ_MAGIC 0x7876dc76
9488+
9489+/* write-back request. */
9490+struct wbq {
9491+ int magic;
9492+ struct list_head link; /* list head of this list is in entd context */
9493+ struct writeback_control *wbc;
9494+ struct page *page;
9495+ struct address_space *mapping;
9496+ struct completion completion;
9497+ jnode *node; /* set if ent thread captured requested page */
9498+ int written; /* set if ent thread wrote requested page */
9499+};
9500+
9501+/* ent-thread context. This is used to synchronize starting/stopping ent
9502+ * threads. */
9503+typedef struct entd_context {
9504+ /* wait queue that ent thread waits on for more work. It's
9505+ * signaled by write_page_by_ent(). */
9506+ wait_queue_head_t wait;
9507+ /* spinlock protecting other fields */
9508+ spinlock_t guard;
9509+ /* ent thread */
9510+ struct task_struct *tsk;
9511+ /* set to indicate that ent thread should leave. */
9512+ int done;
9513+ /* counter of active flushers */
9514+ int flushers;
9515+ /*
9516+ * when reiser4_writepage asks entd to write a page - it adds struct
9517+ * wbq to this list
9518+ */
9519+ struct list_head todo_list;
9520+ /* number of elements on the above list */
9521+ int nr_todo_reqs;
9522+
9523+ struct wbq *cur_request;
9524+ /*
9525+ * when entd writes a page it moves write-back request from todo_list
9526+ * to done_list. This list is used at the end of entd iteration to
9527+ * wakeup requestors and iput inodes.
9528+ */
9529+ struct list_head done_list;
9530+ /* number of elements on the above list */
9531+ int nr_done_reqs;
9532+
9533+#if REISER4_DEBUG
9534+ /* list of all active flushers */
9535+ struct list_head flushers_list;
9536+#endif
9537+} entd_context;
9538+
9539+extern int reiser4_init_entd(struct super_block *);
9540+extern void reiser4_done_entd(struct super_block *);
9541+
9542+extern void reiser4_enter_flush(struct super_block *);
9543+extern void reiser4_leave_flush(struct super_block *);
9544+
9545+extern int write_page_by_ent(struct page *, struct writeback_control *);
9546+extern int wbq_available(void);
9547+extern void ent_writes_page(struct super_block *, struct page *);
9548+
9549+extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
9550+/* __ENTD_H__ */
9551+#endif
9552+
9553+/* Make Linus happy.
9554+ Local variables:
9555+ c-indentation-style: "K&R"
9556+ mode-name: "LC"
9557+ c-basic-offset: 8
9558+ tab-width: 8
9559+ fill-column: 120
9560+ End:
9561+*/
9562diff -urN linux-2.6.23.orig/fs/reiser4/eottl.c linux-2.6.23/fs/reiser4/eottl.c
9563--- linux-2.6.23.orig/fs/reiser4/eottl.c 1970-01-01 03:00:00.000000000 +0300
9564+++ linux-2.6.23/fs/reiser4/eottl.c 2007-12-04 16:49:30.000000000 +0300
9565@@ -0,0 +1,509 @@
9566+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9567+
9568+#include "forward.h"
9569+#include "debug.h"
9570+#include "key.h"
9571+#include "coord.h"
9572+#include "plugin/item/item.h"
9573+#include "plugin/node/node.h"
9574+#include "znode.h"
9575+#include "block_alloc.h"
9576+#include "tree_walk.h"
9577+#include "tree_mod.h"
9578+#include "carry.h"
9579+#include "tree.h"
9580+#include "super.h"
9581+
9582+#include <linux/types.h> /* for __u?? */
9583+
9584+/*
9585+ * Extents on the twig level (EOTTL) handling.
9586+ *
9587+ * EOTTL poses some problems for tree traversal that are better explained
9588+ * by example.
9589+ *
9590+ * Suppose we have block B1 on the twig level with the following items:
9591+ *
9592+ * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
9593+ * offset)
9594+ * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
9595+ * 2. internal item I2 with key (10:0:0:0)
9596+ *
9597+ * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
9598+ * then intra-node lookup is done. This lookup finishes on E1, because the
9599+ * key we are looking for is larger than the key of E1 and is smaller than
9600+ * the key of I2.
9601+ *
9602+ * Here search is stuck.
9603+ *
9604+ * After some thought it is clear what is wrong here: extents on the twig level
9605+ * break a basic property of the *search* tree (on the pretext that they
9606+ * restore the property of a balanced tree).
9607+ *
9608+ * Said property is the following: if in the internal node of the search tree
9609+ * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
9610+ * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
9611+ * through the Pointer.
9612+ *
9613+ * This is not true, when Pointer is Extent-Pointer, simply because extent
9614+ * cannot expand indefinitely to the right to include any item with
9615+ *
9616+ * Key1 <= Key <= Key2.
9617+ *
9618+ * For example, our E1 extent is only responsible for the data with keys
9619+ *
9620+ * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
9621+ *
9622+ * so, key range
9623+ *
9624+ * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
9625+ *
9626+ * is orphaned: there is no way to get there from the tree root.
9627+ *
9628+ * In other words, extent pointers are different than normal child pointers as
9629+ * far as search tree is concerned, and this creates such problems.
9630+ *
9631+ * A possible solution for this problem is to insert our item into the node
9632+ * pointed to by I2. There are some problems though:
9633+ *
9634+ * (1) I2 can be in a different node.
9635+ * (2) E1 can be immediately followed by another extent E2.
9636+ *
9637+ * (1) is solved by calling reiser4_get_right_neighbor() and accounting
9638+ * for locks/coords as necessary.
9639+ *
9640+ * (2) is more complex. Solution here is to insert new empty leaf node and
9641+ * insert internal item between E1 and E2 pointing to said leaf node. This is
9642+ * further complicated by possibility that E2 is in a different node, etc.
9643+ *
9644+ * Problems:
9645+ *
9646+ * (1) if there was internal item I2 immediately on the right of an extent E1
9647+ * we and we decided to insert new item S1 into node N2 pointed to by I2, then
9648+ * key of S1 will be less than smallest key in the N2. Normally, search key
9649+ * checks that key we are looking for is in the range of keys covered by the
9650+ * node key is being looked in. To work around of this situation, while
9651+ * preserving useful consistency check new flag CBK_TRUST_DK was added to the
9652+ * cbk falgs bitmask. This flag is automatically set on entrance to the
9653+ * coord_by_key() and is only cleared when we are about to enter situation
9654+ * described above.
9655+ *
9656+ * (2) If extent E1 is immediately followed by another extent E2 and we are
9657+ * searching for the key that is between E1 and E2 we only have to insert new
9658+ * empty leaf node when coord_by_key was called for insertion, rather than just
9659+ * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to
9660+ * the cbk flags bitmask. This flag is automatically set by coord_by_key calls
9661+ * performed by insert_by_key() and friends.
9662+ *
9663+ * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
9664+ * case it requires modification of node content which is only possible under
9665+ * write lock. It may well happen that we only have read lock on the node where
9666+ * new internal pointer is to be inserted (common case: lookup of non-existent
9667+ * stat-data that falls between two extents). If only read lock is held, tree
9668+ * traversal is restarted with lock_level modified so that next time we hit
9669+ * this problem, write lock will be held. Once we have write lock, balancing
9670+ * will be performed.
9671+ */
9672+
9673+/**
9674+ * is_next_item_internal - check whether next item is internal
9675+ * @coord: coordinate of extent item in twig node
9676+ * @key: search key
9677+ * @lh: twig node lock handle
9678+ *
9679+ * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
9680+ * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
9681+ * to that node, @coord is set to its first unit. If next item is not internal
9682+ * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
9683+ * is returned if search restart has to be done.
9684+ */
9685+static int
9686+is_next_item_internal(coord_t *coord, const reiser4_key *key,
9687+ lock_handle *lh)
9688+{
9689+ coord_t next;
9690+ lock_handle rn;
9691+ int result;
9692+
9693+ coord_dup(&next, coord);
9694+ if (coord_next_unit(&next) == 0) {
9695+ /* next unit is in this node */
9696+ if (item_is_internal(&next)) {
9697+ coord_dup(coord, &next);
9698+ return 1;
9699+ }
9700+ assert("vs-3", item_is_extent(&next));
9701+ return 0;
9702+ }
9703+
9704+ /*
9705+ * next unit either does not exist or is in right neighbor. If it is in
9706+ * right neighbor we have to check right delimiting key because
9707+ * concurrent thread could get there first and insert an item with a key
9708+ * smaller than @key
9709+ */
9710+ read_lock_dk(current_tree);
9711+ result = keycmp(key, znode_get_rd_key(coord->node));
9712+ read_unlock_dk(current_tree);
9713+ assert("vs-6", result != EQUAL_TO);
9714+ if (result == GREATER_THAN)
9715+ return 2;
9716+
9717+ /* lock right neighbor */
9718+ init_lh(&rn);
9719+ result = reiser4_get_right_neighbor(&rn, coord->node,
9720+ znode_is_wlocked(coord->node) ?
9721+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
9722+ GN_CAN_USE_UPPER_LEVELS);
9723+ if (result == -E_NO_NEIGHBOR) {
9724+ /* we are on the rightmost edge of the tree */
9725+ done_lh(&rn);
9726+ return 0;
9727+ }
9728+
9729+ if (result) {
9730+ assert("vs-4", result < 0);
9731+ done_lh(&rn);
9732+ return result;
9733+ }
9734+
9735+ /*
9736+ * check whether concurrent thread managed to insert item with a key
9737+ * smaller than @key
9738+ */
9739+ read_lock_dk(current_tree);
9740+ result = keycmp(key, znode_get_ld_key(rn.node));
9741+ read_unlock_dk(current_tree);
9742+ assert("vs-6", result != EQUAL_TO);
9743+ if (result == GREATER_THAN) {
9744+ done_lh(&rn);
9745+ return 2;
9746+ }
9747+
9748+ result = zload(rn.node);
9749+ if (result) {
9750+ assert("vs-5", result < 0);
9751+ done_lh(&rn);
9752+ return result;
9753+ }
9754+
9755+ coord_init_first_unit(&next, rn.node);
9756+ if (item_is_internal(&next)) {
9757+ /*
9758+ * next unit is in right neighbor and it is an unit of internal
9759+ * item. Unlock coord->node. Move @lh to right neighbor. @coord
9760+ * is set to the first unit of right neighbor.
9761+ */
9762+ coord_dup(coord, &next);
9763+ zrelse(rn.node);
9764+ done_lh(lh);
9765+ move_lh(lh, &rn);
9766+ return 1;
9767+ }
9768+
9769+ /*
9770+ * next unit is a unit of an extent item. Return without changing @lh and
9771+ * @coord.
9772+ */
9773+ assert("vs-6", item_is_extent(&next));
9774+ zrelse(rn.node);
9775+ done_lh(&rn);
9776+ return 0;
9777+}
9778+
9779+/**
9780+ * rd_key - calculate key of an item next to the given one
9781+ * @coord: position in a node
9782+ * @key: storage for result key
9783+ *
9784+ * @coord is set between items or after the last item in a node. Calculate key
9785+ * of item to the right of @coord.
9786+ */
9787+static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
9788+{
9789+ coord_t dup;
9790+
9791+ assert("nikita-2281", coord_is_between_items(coord));
9792+ coord_dup(&dup, coord);
9793+
9794+ if (coord_set_to_right(&dup) == 0)
9795+ /* next item is in this node. Return its key. */
9796+ unit_key_by_coord(&dup, key);
9797+ else {
9798+ /*
9799+ * next item either does not exist or is in right
9800+ * neighbor. Return znode's right delimiting key.
9801+ */
9802+ read_lock_dk(current_tree);
9803+ *key = *znode_get_rd_key(coord->node);
9804+ read_unlock_dk(current_tree);
9805+ }
9806+ return key;
9807+}
9808+
9809+/**
9810+ * add_empty_leaf - insert empty leaf between two extents
9811+ * @insert_coord: position in twig node between two extents
9812+ * @lh: twig node lock handle
9813+ * @key: left delimiting key of new node
9814+ * @rdkey: right delimiting key of new node
9815+ *
9816+ * Inserts empty leaf node between two extent items. It is necessary when we
9817+ * have to insert an item on leaf level between two extents (items on the twig
9818+ * level).
9819+ */
9820+static int
9821+add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
9822+ const reiser4_key *key, const reiser4_key *rdkey)
9823+{
9824+ int result;
9825+ carry_pool *pool;
9826+ carry_level *todo;
9827+ reiser4_item_data *item;
9828+ carry_insert_data *cdata;
9829+ carry_op *op;
9830+ znode *node;
9831+ reiser4_tree *tree;
9832+
9833+ assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
9834+ tree = znode_get_tree(insert_coord->node);
9835+ node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
9836+ if (IS_ERR(node))
9837+ return PTR_ERR(node);
9838+
9839+ /* setup delimiting keys for node being inserted */
9840+ write_lock_dk(tree);
9841+ znode_set_ld_key(node, key);
9842+ znode_set_rd_key(node, rdkey);
9843+ ON_DEBUG(node->creator = current);
9844+ ON_DEBUG(node->first_key = *key);
9845+ write_unlock_dk(tree);
9846+
9847+ ZF_SET(node, JNODE_ORPHAN);
9848+
9849+ /*
9850+ * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
9851+ * carry_insert_data
9852+ */
9853+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
9854+ sizeof(*item) + sizeof(*cdata));
9855+ if (IS_ERR(pool))
9856+ return PTR_ERR(pool);
9857+ todo = (carry_level *) (pool + 1);
9858+ init_carry_level(todo, pool);
9859+
9860+ item = (reiser4_item_data *) (todo + 3);
9861+ cdata = (carry_insert_data *) (item + 1);
9862+
9863+ op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
9864+ if (!IS_ERR(op)) {
9865+ cdata->coord = insert_coord;
9866+ cdata->key = key;
9867+ cdata->data = item;
9868+ op->u.insert.d = cdata;
9869+ op->u.insert.type = COPT_ITEM_DATA;
9870+ build_child_ptr_data(node, item);
9871+ item->arg = NULL;
9872+ /* have @insert_coord to be set at inserted item after
9873+ insertion is done */
9874+ todo->track_type = CARRY_TRACK_CHANGE;
9875+ todo->tracked = lh;
9876+
9877+ result = reiser4_carry(todo, NULL);
9878+ if (result == 0) {
9879+ /*
9880+ * pin node in memory. This is necessary for
9881+ * znode_make_dirty() below.
9882+ */
9883+ result = zload(node);
9884+ if (result == 0) {
9885+ lock_handle local_lh;
9886+
9887+ /*
9888+ * if we inserted new child into tree we have
9889+ * to mark it dirty so that flush will be able
9890+ * to process it.
9891+ */
9892+ init_lh(&local_lh);
9893+ result = longterm_lock_znode(&local_lh, node,
9894+ ZNODE_WRITE_LOCK,
9895+ ZNODE_LOCK_LOPRI);
9896+ if (result == 0) {
9897+ znode_make_dirty(node);
9898+
9899+ /*
9900+ * when internal item pointing to @node
9901+ * was inserted into twig node
9902+ * create_hook_internal did not connect
9903+ * it properly because its right
9904+ * neighbor was not known. Do it
9905+ * here
9906+ */
9907+ write_lock_tree(tree);
9908+ assert("nikita-3312",
9909+ znode_is_right_connected(node));
9910+ assert("nikita-2984",
9911+ node->right == NULL);
9912+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
9913+ write_unlock_tree(tree);
9914+ result =
9915+ connect_znode(insert_coord, node);
9916+ ON_DEBUG(if (result == 0) check_dkeys(node););
9917+
9918+ done_lh(lh);
9919+ move_lh(lh, &local_lh);
9920+ assert("vs-1676", node_is_empty(node));
9921+ coord_init_first_unit(insert_coord,
9922+ node);
9923+ } else {
9924+ warning("nikita-3136",
9925+ "Cannot lock child");
9926+ }
9927+ done_lh(&local_lh);
9928+ zrelse(node);
9929+ }
9930+ }
9931+ } else
9932+ result = PTR_ERR(op);
9933+ zput(node);
9934+ done_carry_pool(pool);
9935+ return result;
9936+}
9937+
9938+/**
9939+ * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
9940+ * @h: search handle
9941+ * @outcome: flag saying whether search has to restart or is done
9942+ *
9943+ * Handles search on twig level. If this function completes search itself then
9944+ * it returns 1. If search has to go one level down then 0 is returned. If
9945+ * error happens then LOOKUP_DONE is returned via @outcome and error code is saved
9946+ * in @h->result.
9947+ */
9948+int handle_eottl(cbk_handle *h, int *outcome)
9949+{
9950+ int result;
9951+ reiser4_key key;
9952+ coord_t *coord;
9953+
9954+ coord = h->coord;
9955+
9956+ if (h->level != TWIG_LEVEL ||
9957+ (coord_is_existing_item(coord) && item_is_internal(coord))) {
9958+ /* Continue to traverse tree downward. */
9959+ return 0;
9960+ }
9961+
9962+ /*
9963+ * make sure that @h->coord is set to twig node and that it is either
9964+ * set to extent item or after extent item
9965+ */
9966+ assert("vs-356", h->level == TWIG_LEVEL);
9967+ assert("vs-357", ( {
9968+ coord_t lcoord;
9969+ coord_dup(&lcoord, coord);
9970+ check_me("vs-733", coord_set_to_left(&lcoord) == 0);
9971+ item_is_extent(&lcoord);
9972+ }
9973+ ));
9974+
9975+ if (*outcome == NS_FOUND) {
9976+ /* we have found desired key on twig level in extent item */
9977+ h->result = CBK_COORD_FOUND;
9978+ *outcome = LOOKUP_DONE;
9979+ return 1;
9980+ }
9981+
9982+ if (!(h->flags & CBK_FOR_INSERT)) {
9983+ /* tree traversal is not for insertion. Just return
9984+ CBK_COORD_NOTFOUND. */
9985+ h->result = CBK_COORD_NOTFOUND;
9986+ *outcome = LOOKUP_DONE;
9987+ return 1;
9988+ }
9989+
9990+ /* take a look at the item to the right of h -> coord */
9991+ result = is_next_item_internal(coord, h->key, h->active_lh);
9992+ if (unlikely(result < 0)) {
9993+ h->error = "get_right_neighbor failed";
9994+ h->result = result;
9995+ *outcome = LOOKUP_DONE;
9996+ return 1;
9997+ }
9998+ if (result == 0) {
9999+ /*
10000+ * item to the right is also an extent one. Allocate a new node
10001+ * and insert pointer to it after item h -> coord.
10002+ *
10003+ * This is a result of extents being located at the twig
10004+ * level. For explanation, see comment just above
10005+ * is_next_item_internal().
10006+ */
10007+ znode *loaded;
10008+
10009+ if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
10010+ /*
10011+ * we got node read locked, restart coord_by_key to
10012+ * have write lock on twig level
10013+ */
10014+ h->lock_level = TWIG_LEVEL;
10015+ h->lock_mode = ZNODE_WRITE_LOCK;
10016+ *outcome = LOOKUP_REST;
10017+ return 1;
10018+ }
10019+
10020+ loaded = coord->node;
10021+ result =
10022+ add_empty_leaf(coord, h->active_lh, h->key,
10023+ rd_key(coord, &key));
10024+ if (result) {
10025+ h->error = "could not add empty leaf";
10026+ h->result = result;
10027+ *outcome = LOOKUP_DONE;
10028+ return 1;
10029+ }
10030+ /* added empty leaf is locked (h->active_lh), its parent node
10031+ is unlocked, h->coord is set as EMPTY */
10032+ assert("vs-13", coord->between == EMPTY_NODE);
10033+ assert("vs-14", znode_is_write_locked(coord->node));
10034+ assert("vs-15",
10035+ WITH_DATA(coord->node, node_is_empty(coord->node)));
10036+ assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
10037+ assert("vs-17", coord->node == h->active_lh->node);
10038+ *outcome = LOOKUP_DONE;
10039+ h->result = CBK_COORD_NOTFOUND;
10040+ return 1;
10041+ } else if (result == 1) {
10042+ /*
10043+ * this is special case mentioned in the comment on
10044+ * tree.h:cbk_flags. We have found internal item immediately on
10045+ * the right of extent, and we are going to insert new item
10046+ * there. Key of item we are going to insert is smaller than
10047+ * leftmost key in the node pointed to by said internal item
10048+ * (otherwise search wouldn't come to the extent in the first
10049+ * place).
10050+ *
10051+ * This is a result of extents being located at the twig
10052+ * level. For explanation, see comment just above
10053+ * is_next_item_internal().
10054+ */
10055+ h->flags &= ~CBK_TRUST_DK;
10056+ } else {
10057+ assert("vs-8", result == 2);
10058+ *outcome = LOOKUP_REST;
10059+ return 1;
10060+ }
10061+ assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
10062+ return 0;
10063+}
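/*
 * Return convention of handle_eottl(), summarized from the code above: 0
 * means coord_by_key() should simply continue the downward traversal; 1
 * means the lookup outcome has been decided here, with *outcome set to
 * LOOKUP_DONE or LOOKUP_REST and h->result holding the final CBK status.
 */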
10064+
10065+/*
10066+ * Local variables:
10067+ * c-indentation-style: "K&R"
10068+ * mode-name: "LC"
10069+ * c-basic-offset: 8
10070+ * tab-width: 8
10071+ * fill-column: 120
10072+ * scroll-step: 1
10073+ * End:
10074+ */
10075diff -urN linux-2.6.23.orig/fs/reiser4/estimate.c linux-2.6.23/fs/reiser4/estimate.c
10076--- linux-2.6.23.orig/fs/reiser4/estimate.c 1970-01-01 03:00:00.000000000 +0300
10077+++ linux-2.6.23/fs/reiser4/estimate.c 2007-12-04 16:49:30.000000000 +0300
10078@@ -0,0 +1,120 @@
10079+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10080+
10081+#include "debug.h"
10082+#include "dformat.h"
10083+#include "tree.h"
10084+#include "carry.h"
10085+#include "inode.h"
10086+#include "plugin/cluster.h"
10087+#include "plugin/item/ctail.h"
10088+
10089+/* this returns how many nodes might get dirty, or get added, if @children nodes are dirtied
10090+
10091+ We estimate the number of internal nodes which will get dirty or get allocated as roughly 10% of the children
10092+ (103/1024 in the code below) plus 1 balancing. 1 balancing is 2 neighbours, 2 new blocks and the current block on
10093+ the leaf level, 2 neighbour nodes + the current (or 1 neighbour, 1 new and the current) on the twig level, 2
10094+ neighbour nodes on upper levels and 1 for a new root. So 5 for the leaf level, 3 for the twig level, 2 per upper level + 1 for the root.
10095+
10096+ Do not count the current node of the lowest level here - it is pure overhead.
10097+
10098+ @children is almost always 1 here; the exception is flow insertion.
10099+*/
10100+static reiser4_block_nr
10101+max_balance_overhead(reiser4_block_nr children, tree_level tree_height)
10102+{
10103+ reiser4_block_nr ten_percent;
10104+
10105+ ten_percent = ((103 * children) >> 10);
10106+
10107+ /* If too many balancings happen at the same time, the tree height can grow
10108+ by more than 1. Assume that if tree_height is 5, it can grow by 1 only. */
10109+ return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
10110+}
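/*
 * Worked example (a sketch, not normative): for the common case of one
 * child and a tree of height 10, ten_percent = (103 * 1) >> 10 = 0, so
 * max_balance_overhead() returns 10 * 2 + (4 + 0) = 24 blocks;
 * calc_estimate_one_insert() below adds 1 for the node itself, giving 25.
 */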
10111+
10112+/* this returns the maximal possible number of nodes which can be modified, plus the number of new nodes which may be
10113+ required, to perform insertion of one item into the tree */
10114+/* it is only called when the tree height changes or gets initialized */
10115+reiser4_block_nr calc_estimate_one_insert(tree_level height)
10116+{
10117+ return 1 + max_balance_overhead(1, height);
10118+}
10119+
10120+reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10121+{
10122+ return tree->estimate_one_insert;
10123+}
10124+
10125+/* this returns the maximal possible number of nodes which can be modified, plus the number of new nodes which may be
10126+ required, to perform insertion of one unit into an item in the tree */
10127+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10128+{
10129+ /* estimate insert into item just like item insertion */
10130+ return tree->estimate_one_insert;
10131+}
10132+
10133+reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10134+{
10135+	/* on item removal reiser4 does not try to pack nodes more compactly, so only one node may be dirtied on the leaf
10136+	   level */
10137+ return tree->estimate_one_insert;
10138+}
10139+
10140+/* on the leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (the insert
10141+ point and both of its neighbors). max_balance_overhead() estimates the number of blocks which may change or get added
10142+ on the internal levels */
10143+reiser4_block_nr estimate_insert_flow(tree_level height)
10144+{
10145+ return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10146+ CARRY_FLOW_NEW_NODES_LIMIT,
10147+ height);
10148+}
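/*
 * Purely illustrative arithmetic (CARRY_FLOW_NEW_NODES_LIMIT's actual
 * value is defined elsewhere; assume it were 20, with a tree of height
 * 10): 3 + 20 = 23 leaf-level blocks plus max_balance_overhead(23, 10)
 * = 10 * 2 + 4 + ((103 * 23) >> 10) = 26, i.e. 49 blocks in total.
 */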
10149+
10150+/* returns the max number of nodes that can be occupied by a disk cluster */
10151+static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
10152+{
10153+ int per_cluster;
10154+ per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10155+ return 3 + per_cluster +
10156+ max_balance_overhead(3 + per_cluster,
10157+ REISER4_MAX_ZTREE_HEIGHT);
10158+}
10159+
10160+/* how many nodes might get dirty and added
10161+ during insertion of a disk cluster */
10162+reiser4_block_nr estimate_insert_cluster(struct inode * inode)
10163+{
10164+ return estimate_cluster(inode, 1); /* 24 */
10165+}
10166+
10167+/* how many nodes might get dirty and added
10168+ during update of a (prepped or unprepped) disk cluster */
10169+reiser4_block_nr estimate_update_cluster(struct inode * inode)
10170+{
10171+ return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10172+}
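/*
 * Sanity check of the "44, for 64K-cluster" figure above (assuming 4K
 * pages and assuming REISER4_MAX_ZTREE_HEIGHT is 10): per_cluster =
 * 64K / 4K = 16, so estimate_cluster() yields 3 + 16 +
 * max_balance_overhead(19, 10) = 19 + (10 * 2 + 4 + ((103 * 19) >> 10))
 * = 19 + 25 = 44.
 */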
10173+
10174+/* How many nodes occupied by a disk cluster might get dirty.
10175+ Note that this estimation is not precise (i.e. disk cluster
10176+ can occupy more nodes).
10177+ Q: Why don't we use a precise estimation?
10178+ A: 1. Because a precise estimation is fairly bad: 65536 nodes
10179+ for a 64K logical cluster would mean 256M of dead space on
10180+ a partition.
10181+ 2. It is a very rare case when a disk cluster occupies more
10182+ nodes than this estimation returns.
10183+*/
10184+reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
10185+{
10186+ return cluster_nrpages(inode) + 4;
10187+}
10188+
10189+/* Make Linus happy.
10190+ Local variables:
10191+ c-indentation-style: "K&R"
10192+ mode-name: "LC"
10193+ c-basic-offset: 8
10194+ tab-width: 8
10195+ fill-column: 120
10196+ scroll-step: 1
10197+ End:
10198+*/
10199diff -urN linux-2.6.23.orig/fs/reiser4/export_ops.c linux-2.6.23/fs/reiser4/export_ops.c
10200--- linux-2.6.23.orig/fs/reiser4/export_ops.c 1970-01-01 03:00:00.000000000 +0300
10201+++ linux-2.6.23/fs/reiser4/export_ops.c 2007-12-04 22:59:05.774363742 +0300
10202@@ -0,0 +1,297 @@
10203+/* Copyright 2005 by Hans Reiser, licensing governed by
10204+ * reiser4/README */
10205+
10206+#include "inode.h"
10207+#include "plugin/plugin.h"
10208+
10209+/*
10210+ * Supported file-handle types
10211+ */
10212+typedef enum {
10213+ FH_WITH_PARENT = 0x10, /* file handle with parent */
10214+ FH_WITHOUT_PARENT = 0x11 /* file handle without parent */
10215+} reiser4_fhtype;
10216+
10217+#define NFSERROR (255)
10218+
10219+/* initialize place-holder for object */
10220+static void object_on_wire_init(reiser4_object_on_wire *o)
10221+{
10222+ o->plugin = NULL;
10223+}
10224+
10225+/* finish with @o */
10226+static void object_on_wire_done(reiser4_object_on_wire *o)
10227+{
10228+ if (o->plugin != NULL)
10229+ o->plugin->wire.done(o);
10230+}
10231+
10232+/*
10233+ * read serialized object identity from @addr and store information about
10234+ * object in @obj. This is dual to encode_inode().
10235+ */
10236+static char *decode_inode(struct super_block *s, char *addr,
10237+ reiser4_object_on_wire * obj)
10238+{
10239+ file_plugin *fplug;
10240+
10241+ /* identifier of object plugin is stored in the first two bytes,
10242+ * followed by... */
10243+ fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr);
10244+ if (fplug != NULL) {
10245+ addr += sizeof(d16);
10246+ obj->plugin = fplug;
10247+ assert("nikita-3520", fplug->wire.read != NULL);
10248+ /* plugin specific encoding of object identity. */
10249+ addr = fplug->wire.read(addr, obj);
10250+ } else
10251+ addr = ERR_PTR(RETERR(-EINVAL));
10252+ return addr;
10253+}
10254+
10255+/**
10256+ * reiser4_decode_fh - decode_fh of export operations
10257+ * @super: super block
10258+ * @fh: nfsd file handle
10259+ * @len: length of file handle
10260+ * @fhtype: type of file handle
10261+ * @acceptable: acceptability testing function
10262+ * @context: argument for @acceptable
10263+ *
10264+ * Returns dentry referring to the same file as @fh.
10265+ */
10266+static struct dentry *reiser4_decode_fh(struct super_block *super, __u32 *fh,
10267+ int len, int fhtype,
10268+ int (*acceptable) (void *context,
10269+ struct dentry *de),
10270+ void *context)
10271+{
10272+ reiser4_context *ctx;
10273+ reiser4_object_on_wire object;
10274+ reiser4_object_on_wire parent;
10275+ char *addr;
10276+ int with_parent;
10277+
10278+ ctx = reiser4_init_context(super);
10279+ if (IS_ERR(ctx))
10280+ return (struct dentry *)ctx;
10281+
10282+ assert("vs-1482",
10283+ fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT);
10284+
10285+ with_parent = (fhtype == FH_WITH_PARENT);
10286+
10287+ addr = (char *)fh;
10288+
10289+ object_on_wire_init(&object);
10290+ object_on_wire_init(&parent);
10291+#if 0
10292+ addr = decode_inode(super, addr, &object);
10293+ if (!IS_ERR(addr)) {
10294+ if (with_parent)
10295+ addr = decode_inode(super, addr, &parent);
10296+ if (!IS_ERR(addr)) {
10297+ struct dentry *d;
10298+ typeof(super->s_export_op->find_exported_dentry) fn;
10299+
10300+ fn = super->s_export_op->find_exported_dentry;
10301+ assert("nikita-3521", fn != NULL);
10302+ d = fn(super, &object, with_parent ? &parent : NULL,
10303+ acceptable, context);
10304+ if (d != NULL && !IS_ERR(d))
10305+ /* FIXME check for -ENOMEM */
10306+ reiser4_get_dentry_fsdata(d)->stateless = 1;
10307+ addr = (char *)d;
10308+ }
10309+ }
10310+ object_on_wire_done(&object);
10311+ object_on_wire_done(&parent);
10312+
10313+ reiser4_exit_context(ctx);
10314+ return (void *)addr;
10315+#else
10316+ return ERR_PTR(-EINVAL);
10317+#endif
10318+}
10319+
10320+/*
10321+ * Object serialization support.
10322+ *
10323+ * To support knfsd, the file system provides export_operations that are used to
10324+ * construct and interpret NFS file handles. As a generalization of this,
10325+ * reiser4 object plugins have serialization support: it provides methods to
10326+ * create on-wire representation of identity of reiser4 object, and
10327+ * re-create/locate object given its on-wire identity.
10328+ *
10329+ */
10330+
10331+/*
10332+ * return the number of bytes that the on-wire representation of @inode's identity
10333+ * consumes.
10334+ */
10335+static int encode_inode_size(struct inode *inode)
10336+{
10337+ assert("nikita-3514", inode != NULL);
10338+ assert("nikita-3515", inode_file_plugin(inode) != NULL);
10339+ assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10340+
10341+ return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10342+}
10343+
10344+/*
10345+ * store on-wire representation of @inode's identity at the area beginning at
10346+ * @start.
10347+ */
10348+static char *encode_inode(struct inode *inode, char *start)
10349+{
10350+ assert("nikita-3517", inode != NULL);
10351+ assert("nikita-3518", inode_file_plugin(inode) != NULL);
10352+ assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10353+
10354+ /*
10355+ * first, store two-byte identifier of object plugin, then
10356+ */
10357+ save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10358+ (d16 *) start);
10359+ start += sizeof(d16);
10360+ /*
10361+ * call plugin to serialize object's identity
10362+ */
10363+ return inode_file_plugin(inode)->wire.write(inode, start);
10364+}
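/*
 * On-wire layout produced by encode_inode() above (widths illustrative;
 * the payload format is plugin-specific):
 *
 *   +----------------+--------------------------------------+
 *   | d16 plugin id  | inode_file_plugin->wire.write() data |
 *   +----------------+--------------------------------------+
 */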
10365+
10366+/* the number of 32-bit words consumed by the file handle is returned via
10367+ * @lenp; 255 is returned if the file handle can not be stored */
10368+/**
10369+ * reiser4_encode_fh - encode_fh of export operations
10370+ * @dentry: dentry of the object to serialize
10371+ * @fh: buffer to store the file handle in
10372+ * @lenp: in: buffer capacity in 32-bit words; out: number of words used
10373+ * @need_parent: if nonzero, serialize the parent's identity as well
10374+ *
10375+ */
10376+static int
10377+reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10378+ int need_parent)
10379+{
10380+ struct inode *inode;
10381+ struct inode *parent;
10382+ char *addr;
10383+ int need;
10384+ int delta;
10385+ int result;
10386+ reiser4_context *ctx;
10387+
10388+ /*
10389+	 * knfsd asks us to serialize the object in @dentry, and, optionally, its
10390+	 * parent (if need_parent != 0).
10391+	 *
10392+	 * encode_inode() and encode_inode_size() are used to build the
10393+	 * representation of the object and its parent. All the hard work is done by
10394+ * object plugins.
10395+ */
10396+ inode = dentry->d_inode;
10397+ parent = dentry->d_parent->d_inode;
10398+
10399+ addr = (char *)fh;
10400+
10401+ need = encode_inode_size(inode);
10402+ if (need < 0)
10403+ return NFSERROR;
10404+ if (need_parent) {
10405+ delta = encode_inode_size(parent);
10406+ if (delta < 0)
10407+ return NFSERROR;
10408+ need += delta;
10409+ }
10410+
10411+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
10412+ if (IS_ERR(ctx))
10413+ return PTR_ERR(ctx);
10414+
10415+ if (need <= sizeof(__u32) * (*lenp)) {
10416+ addr = encode_inode(inode, addr);
10417+ if (need_parent)
10418+ addr = encode_inode(parent, addr);
10419+
10420+		/* store in lenp the number of 32-bit words required for the file
10421+		 * handle. */
10422+ *lenp = (need + sizeof(__u32) - 1) >> 2;
10423+ result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10424+ } else
10425+		/* not enough space in the file handle */
10426+ result = NFSERROR;
10427+ reiser4_exit_context(ctx);
10428+ return result;
10429+}
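/*
 * Example of the rounding above (illustrative numbers): if the serialized
 * identities need 10 bytes, *lenp becomes (10 + sizeof(__u32) - 1) >> 2
 * = 13 >> 2 = 3 thirty-two-bit words.
 */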
10430+
10431+/**
10432+ * reiser4_get_dentry_parent - get_parent of export operations
10433+ * @child: dentry whose parent directory is wanted
10434+ *
10435+ */
10436+static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10437+{
10438+ struct inode *dir;
10439+ dir_plugin *dplug;
10440+
10441+ assert("nikita-3527", child != NULL);
10442+ /* see comment in reiser4_get_dentry() about following assertion */
10443+ assert("nikita-3528", is_in_reiser4_context());
10444+
10445+ dir = child->d_inode;
10446+ assert("nikita-3529", dir != NULL);
10447+ dplug = inode_dir_plugin(dir);
10448+ assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10449+ if (dplug != NULL)
10450+ return dplug->get_parent(dir);
10451+ else
10452+ return ERR_PTR(RETERR(-ENOTDIR));
10453+}
10454+
10455+/**
10456+ * reiser4_get_dentry - get_dentry of export operations
10457+ * @super: super block of the file system
10458+ * @data: on-wire object identity (a reiser4_object_on_wire)
10459+ *
10460+ *
10461+ */
10462+static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10463+{
10464+ reiser4_object_on_wire *o;
10465+
10466+ assert("nikita-3522", super != NULL);
10467+ assert("nikita-3523", data != NULL);
10468+ /*
10469+ * this is only supposed to be called by
10470+ *
10471+ * reiser4_decode_fh->find_exported_dentry
10472+ *
10473+ * so, reiser4_context should be here already.
10474+ */
10475+ assert("nikita-3526", is_in_reiser4_context());
10476+
10477+ o = (reiser4_object_on_wire *)data;
10478+ assert("nikita-3524", o->plugin != NULL);
10479+ assert("nikita-3525", o->plugin->wire.get != NULL);
10480+
10481+ return o->plugin->wire.get(super, o);
10482+}
10483+
10484+struct export_operations reiser4_export_operations = {
10485+ .encode_fh = reiser4_encode_fh,
10486+// .decode_fh = reiser4_decode_fh,
10487+ .get_parent = reiser4_get_dentry_parent,
10488+// .get_dentry = reiser4_get_dentry
10489+};
10490+
10491+/*
10492+ * Local variables:
10493+ * c-indentation-style: "K&R"
10494+ * mode-name: "LC"
10495+ * c-basic-offset: 8
10496+ * tab-width: 8
10497+ * fill-column: 79
10498+ * End:
10499+ */
10500diff -urN linux-2.6.23.orig/fs/reiser4/flush.c linux-2.6.23/fs/reiser4/flush.c
10501--- linux-2.6.23.orig/fs/reiser4/flush.c 1970-01-01 03:00:00.000000000 +0300
10502+++ linux-2.6.23/fs/reiser4/flush.c 2007-12-04 16:49:30.000000000 +0300
10503@@ -0,0 +1,3625 @@
10504+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10505+
10506+/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
10507+
10508+#include "forward.h"
10509+#include "debug.h"
10510+#include "dformat.h"
10511+#include "key.h"
10512+#include "coord.h"
10513+#include "plugin/item/item.h"
10514+#include "plugin/plugin.h"
10515+#include "plugin/object.h"
10516+#include "txnmgr.h"
10517+#include "jnode.h"
10518+#include "znode.h"
10519+#include "block_alloc.h"
10520+#include "tree_walk.h"
10521+#include "carry.h"
10522+#include "tree.h"
10523+#include "vfs_ops.h"
10524+#include "inode.h"
10525+#include "page_cache.h"
10526+#include "wander.h"
10527+#include "super.h"
10528+#include "entd.h"
10529+#include "reiser4.h"
10530+#include "flush.h"
10531+#include "writeout.h"
10532+
10533+#include <asm/atomic.h>
10534+#include <linux/fs.h> /* for struct super_block */
10535+#include <linux/mm.h> /* for struct page */
10536+#include <linux/bio.h> /* for struct bio */
10537+#include <linux/pagemap.h>
10538+#include <linux/blkdev.h>
10539+
10540+/* IMPLEMENTATION NOTES */
10541+
10542+/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
10543+ order to the nodes of the tree in which the parent is placed before its children, which
10544+ are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it
10545+ describes the node that "came before in forward parent-first order". When we speak of a
10546+ "parent-first follower", it describes the node that "comes next in parent-first
10547+ order" (alternatively the node that "came before in reverse parent-first order").
10548+
10549+ The following pseudo-code prints the nodes of a tree in forward parent-first order:
10550+
10551+ void parent_first (node)
10552+ {
10553+ print_node (node);
10554+ if (node->level > leaf) {
10555+ for (i = 0; i < num_children; i += 1) {
10556+ parent_first (node->child[i]);
10557+ }
10558+ }
10559+ }
10560+*/
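/*
 * Toy example of the order above (not from the patch itself): for a root R
 * with children A and B, where A has leaves a1, a2 and B has leaf b1, the
 * forward parent-first order is R, A, a1, a2, B, b1.
 */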
10561+
10562+/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so
10563+ that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
10564+ can be accomplished with sequential reads, which results in reading nodes in their
10565+ parent-first order. This is a read-optimization aspect of the flush algorithm, and
10566+ there is also a write-optimization aspect, which is that we wish to make large
10567+ sequential writes to the disk by allocating or reallocating blocks so that they can be
10568+ written in sequence. Sometimes the read-optimization and write-optimization goals
10569+ conflict with each other, as we discuss in more detail below.
10570+*/
10571+
10572+/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are
10573+ the relevant jnode->state bits and their relevance to flush:
10574+
10575+ JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it
10576+ must be allocated first. In order to be considered allocated, the jnode must have
10577+ exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and
10578+ all dirtied jnodes eventually have one of these bits set during each transaction.
10579+
10580+ JNODE_CREATED: The node was freshly created in its transaction and has no previous
10581+ block address, so it is unconditionally assigned to be relocated, although this is
10582+ mainly for code-convenience. It is not being 'relocated' from anything, but in
10583+ almost every regard it is treated as part of the relocate set. The JNODE_CREATED bit
10584+ remains set even after JNODE_RELOC is set, so the actual relocate can be
10585+ distinguished from the created-and-allocated set easily: relocate-set members
10586+ (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
10587+ have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
10588+
10589+ JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
10590+ decision to maintain the pre-existing location for this node and it will be written
10591+ to the wandered-log.
10592+
10593+ JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
10594+ not created, see note above). A block with JNODE_RELOC set is eligible for
10595+ early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC
10596+ bit is set on a znode, the parent node's internal item is modified and the znode is
10597+ rehashed.
10598+
10599+ JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
10600+ and calls the plugin->f.squeeze() method for its items. This is how we update the disk
10601+ clusters of cryptcompress objects. Also, if the leftmost point found by the flush scan
10602+ has this flag set (it races with write(); a rare case), the flush algorithm decides
10603+ to pass it to squalloc() in spite of its flushprepped status, for squeezing rather than
10604+ repeated allocation.
10605+
10606+ JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
10607+ flush queue. This means the jnode is not on any clean or dirty list, instead it is
10608+ moved to one of the flush queue object's private lists (see flush_queue.h). This
10609+ prevents multiple concurrent flushes from attempting to start flushing from the
10610+ same node.
10611+
10612+ (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
10613+ squeeze-and-allocate on a node while its children are actively being squeezed and
10614+ allocated. This flag was created to avoid submitting a write request for a node
10615+ while its children are still being allocated and squeezed. Then flush queue was
10616+ re-implemented to allow an unlimited number of nodes to be queued. This flag support was
10617+ commented out in source code because we decided that there was no reason to submit
10618+ queued nodes before jnode_flush() finishes. However, current code calls fq_write()
10619+ during a slum traversal and may submit "busy nodes" to disk. Probably we can
10620+ re-enable the JNODE_FLUSH_BUSY bit support in future.
10621+
10622+ With these state bits, we describe a test used frequently in the code below,
10623+ jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). The
10624+ test for "flushprepped" returns true if any of the following are true:
10625+
10626+ - The node is not dirty
10627+ - The node has JNODE_RELOC set
10628+ - The node has JNODE_OVRWR set
10629+
10630+ If either the node is not dirty or it has already been processed by flush (and assigned
10631+ JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns
10632+ false, then flush still has work to do on that node.
10633+*/
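/*
 * A minimal sketch of the flushprepped test described above, written in
 * terms of the JF_ISSET() flag accessor used elsewhere in this file:
 *
 *	static int flushprepped_sketch(jnode * node)
 *	{
 *		return !JF_ISSET(node, JNODE_DIRTY) ||
 *		       JF_ISSET(node, JNODE_RELOC) ||
 *		       JF_ISSET(node, JNODE_OVRWR);
 *	}
 */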
10634+
10635+/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
10636+ flushprepped twice (unless an explicit call to flush_unprep is made as described in
10637+ detail below). For example a node is dirtied, allocated, and then early-flushed to
10638+ disk and set clean. Before the transaction commits, the page is dirtied again and, due
10639+ to memory pressure, the node is flushed again. The flush algorithm will not relocate
10640+ the node to a new disk location, it will simply write it to the same, previously
10641+ relocated position again.
10642+*/
10643+
10644+/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
10645+ start at a leaf node and allocate in parent-first order by iterating to the right. At
10646+ each step of the iteration, we check for the right neighbor. Before advancing to the
10647+ right neighbor, we check if the current position and the right neighbor share the same
10648+ parent. If they do not share the same parent, the parent is allocated before the right
10649+ neighbor.
10650+
10651+ This process goes recursively up the tree, squeezing nodes level by level as long as
10652+ the right neighbor and the current position have different parents, then it allocates
10653+ the right-neighbors-with-different-parents on the way back down. This process is
10654+ described in more detail in flush_squalloc_changed_ancestor and the recursive function
10655+ squalloc_one_changed_ancestor. But the purpose here is not so much to discuss the
10656+ specifics of the bottom-up approach as it is to contrast the bottom-up and top-down
10657+ approaches.
10658+
10659+ The top-down algorithm was implemented earlier (April-May 2002). In the top-down
10660+ approach, we find a starting point by scanning left along each level past dirty nodes,
10661+ then going up and repeating the process until the left node and the parent node are
10662+ clean. We then perform a parent-first traversal from the starting point, which makes
10663+ allocating in parent-first order trivial. After one subtree has been allocated in this
10664+ manner, we move to the right, try moving upward, then repeat the parent-first
10665+ traversal.
10666+
10667+ Both approaches have problems that need to be addressed. Both are approximately the
10668+ same amount of code, but the bottom-up approach has advantages in the order it acquires
10669+ locks which, at the very least, make it the better approach. At first glance each one
10670+ makes the other one look simpler, so it is important to remember a few of the problems
10671+ with each one.
10672+
10673+ Main problem with the top-down approach: When you encounter a clean child during the
10674+ parent-first traversal, what do you do? You would like to avoid searching through a
10675+ large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
10676+ obvious solution. One of the advantages of the top-down approach is that during the
10677+ parent-first traversal you check every child of a parent to see if it is dirty. In
10678+ this way, the top-down approach easily handles the main problem of the bottom-up
10679+ approach: unallocated children.
10680+
10681+ The unallocated children problem is that before writing a node to disk we must make
10682+ sure that all of its children are allocated. Otherwise, writing the node means
10683+ extra I/O because the node will have to be written again when the child is finally
10684+ allocated.
10685+
10686+ WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs, this
10687+ should not cause any file system corruption; it only degrades I/O performance because a
10688+ node may be written when it is sure to be written at least one more time in the same
10689+ transaction when the remaining children are allocated. What follows is a description
10690+ of how we will solve the problem.
10691+*/
10692+
10693+/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
10694+ proceeding in parent first order, allocate some of its left-children, then encounter a
10695+ clean child in the middle of the parent. We do not allocate the clean child, but there
10696+ may remain unallocated (dirty) children to the right of the clean child. If we were to
10697+ stop flushing at this moment and write everything to disk, the parent might still
10698+ contain unallocated children.
10699+
10700+ We could try to allocate all the descendants of every node that we allocate, but this
10701+ is not necessary. Doing so could result in allocating the entire tree: if the root
10702+ node is allocated then every unallocated node would have to be allocated before
10703+ flushing. Actually, we do not have to write a node just because we allocate it. It is
10704+ possible to allocate but not write a node during flush, when it still has unallocated
10705+ children. However, this approach is probably not optimal for the following reason.
10706+
10707+ The flush algorithm is designed to allocate nodes in parent-first order in an attempt
10708+ to optimize reads that occur in the same order. Thus we are read-optimizing for a
10709+ left-to-right scan through all the leaves in the system, and we are hoping to
10710+ write-optimize at the same time because those nodes will be written together in batch.
10711+ What happens, however, if we assign a block number to a node in its read-optimized
10712+ order but then avoid writing it because it has unallocated children? In that
10713+ situation, we lose out on the write-optimization aspect because a node will have to be
10714+ written again to its location on the device, later, which likely means seeking back
10715+ to that location.
10716+
10717+ So there are tradeoffs. We can choose either:
10718+
10719+ A. Allocate all unallocated children to preserve both write-optimization and
10720+ read-optimization, but this is not always desirable because it may mean having to
10721+ allocate and flush very many nodes at once.
10722+
10723+ B. Defer writing nodes with unallocated children, keep their read-optimized locations,
10724+ but sacrifice write-optimization because those nodes will be written again.
10725+
10726+ C. Defer writing nodes with unallocated children, but do not keep their read-optimized
10727+ locations. Instead, choose to write-optimize them later, when they are written. To
10728+ facilitate this, we "undo" the read-optimized allocation that was given to the node so
10729+ that later it can be write-optimized, thus "unpreparing" the flush decision. This is a
10730+ case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a
10731+ call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
10732+ if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
10733+ location, and set the JNODE_CREATED bit, effectively setting the node back to an
10734+ unallocated state.
10735+
10736+ We will take the following approach in v4.0: for twig nodes we will always finish
10737+ allocating unallocated children (A). For nodes with (level > TWIG) we will defer
10738+ writing and choose write-optimization (C).
10739+
10740+ To summarize, there are several parts to a solution that avoids the problem with
10741+ unallocated children:
10742+
10743+ FIXME-ZAM: Still, no approach has been implemented to eliminate the "UNALLOCATED CHILDREN"
10744+ problem, because an experiment showed that we get only 1-2 nodes
10745+ with unallocated children per thousands of written nodes. The experiment was simple,
10746+ like copying / deleting the Linux kernel sources. However, the problem can arise in more
10747+ complex tests. I think we could use jnode_io_hook to insert a check for unallocated
10748+ children and see what kind of problem we have.
10749+
10750+ 1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
10751+ squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to
10752+ implement: should be simple -- amounts to adding a while loop to jnode_flush, see
10753+ comments in that function.
10754+
10755+ 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
10756+ have unallocated children. If the twig level has unallocated children it is an
10757+ assertion failure. If a higher-level node has unallocated children, then it should be
10758+ explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement:
10759+ should be simple.
10760+
10761+ 3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
10762+ CPU cycles than we would like, and it is possible (but medium complexity) to optimize
10763+ this somewhat in the case where large sub-trees are flushed. The following observation
10764+ helps: if both the left- and right-neighbor of a node are processed by the flush
10765+ algorithm then the node itself is guaranteed to have all of its children allocated.
10766+ However, the cost of this check may not be so expensive after all: it is not needed for
10767+ leaves and flush can guarantee this property for twigs. That leaves only (level >
10768+ TWIG) nodes that have to be checked, so this optimization only helps if at least three
10769+ (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
10770+ there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes
10771+ then the number of blocks being written will be very large, so the savings may be
10772+ insignificant. That said, the idea is to maintain both the left and right edges of
10773+ nodes that are processed in flush. When flush_empty_queue() is called, a relatively
10774+ simple test will tell whether the (level > TWIG) node is on the edge. If it is on the
10775+ edge, the slow check is necessary, but if it is in the interior then it can be assumed
10776+ to have all of its children allocated. FIXME: medium complexity to implement, but
10777+ simple to verify given that we must have a slow check anyway.
10778+
10779+ 4. (Optional) This part is optional, not for v4.0--flush should work independently of
10780+ whether this option is used or not. Called RAPID_SCAN, the idea is to amend the
10781+ left-scan operation to take unallocated children into account. Normally, the left-scan
10782+ operation goes left as long as adjacent nodes are dirty up until some large maximum
10783+ value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left
10784+ may stop at a position where there are unallocated children to the left with the same
10785+ parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
10786+ FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
10787+ with a rapid scan. The rapid scan skips all the interior children of a node--if the
10788+ leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
10789+ twig to the left). If the left neighbor of the leftmost child is also dirty, then
10790+ continue the scan at the left twig and repeat. This option will cause flush to
10791+ allocate more twigs in a single pass, but it also has the potential to write many more
10792+ nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN
10793+ was partially implemented, code removed August 12, 2002 by JMACD.
10794+*/
10795+
10796+/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the
10797+ starting point for flush is a leaf node, but actually the flush code cares very little
10798+ about whether or not this is true. It is possible that all the leaf nodes are flushed
10799+ and dirty parent nodes still remain, in which case jnode_flush() is called on a
10800+ non-leaf argument. Flush doesn't care--it treats the argument node as if it were a
10801+ leaf, even when it is not. This is a simple approach, and there may be a more optimal
10802+ policy but until a problem with this approach is discovered, simplest is probably best.
10803+
10804+ NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
10805+ the leaves. This is done as a matter of simplicity and there is only one (shaky)
10806+ justification. When an atom commits, it flushes all leaf level nodes first, followed
10807+ by twigs, and so on. With flushing done in this order, if flush is eventually called
10808+ on a non-leaf node it means that (somehow) we reached a point where all leaves are
10809+ clean and only internal nodes need to be flushed. If that is the case, then it means
10810+ there were no leaves that were the parent-first preceder/follower of the parent. This
10811+ is expected to be a rare case, which is why we do nothing special about it. However,
10812+ memory pressure may pass an internal node to flush when there are still dirty leaf
10813+ nodes that need to be flushed, which could prove our original assumptions
10814+ "inoperative". If this needs to be fixed, then scan_left/right should have
10815+ special checks for the non-leaf levels. For example, instead of passing from a node to
10816+ the left neighbor, it should pass from the node to the left neighbor's rightmost
10817+ descendent (if dirty).
10818+
10819+*/
10820+
10821+/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting
10822+ it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the
10823+ logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
10824+ device if we are walking from right to left. We then make passes in alternating directions, and as we do this the
10825+ device becomes sorted such that tree order and block number order fully correlate.
10826+
10827+ Resizing is done by shifting everything either all the way to the left or all the way
10828+ to the right, and then reporting the last block.
10829+*/
10830+
10831+/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This
10832+ describes the policy from the highest level:
10833+
10834+ The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
10835+ leaf level during flush-scan (right, left), then we unconditionally decide to relocate
10836+ leaf nodes.
10837+
10838+ Otherwise, there are two contexts in which we make a decision to relocate:
10839+
10840+ 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
10841+ During the initial stages of flush, after scan-right completes, we want to ask the
10842+ question: should we relocate this leaf node and thus dirty the parent node? Then, if
10843+ the node is a leftmost child, its parent is its parent-first preceder, so we repeat
10844+ the question at the next level up, and so on. In these cases we are moving in the
10845+ reverse-parent first direction.
10846+
10847+ There is another case which is considered the reverse direction, which comes at the end
10848+ of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may
10849+ reach a point where there is a clean twig to the right with a dirty leftmost child. In
10850+ this case, we may wish to relocate the child by testing if it should be relocated
10851+ relative to its parent.
10852+
10853+ 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
10854+ allocate_znode. What distinguishes the forward parent-first case from the
10855+ reverse-parent first case is that the preceder has already been allocated in the
10856+ forward case, whereas in the reverse case we don't know what the preceder is until we
10857+ finish "going in reverse". That simplifies the forward case considerably, and there we
10858+ actually use the block allocator to determine whether, e.g., a block closer to the
10859+ preceder is available.
10860+*/
10861+
10862+/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we
10863+ finish scan-left and find a starting point, if the parent's left neighbor is dirty then
10864+ squeeze the parent's left neighbor and the parent. This may change the
10865+ flush-starting-node's parent. Repeat until the child's parent is stable. If the child
10866+ is a leftmost child, repeat this left-edge squeezing operation at the next level up.
10867+ Note that we cannot allocate extents during this or they will be out of parent-first
10868+ order. There are also some difficult coordinate maintenance issues. We can't do a tree
10869+ search to find coordinates again (because we hold locks), we have to determine them
10870+ from the two nodes being squeezed. Looks difficult, but has potential to increase
10871+ space utilization. */
10872+
10873+/* Flush-scan helper functions. */
10874+static void scan_init(flush_scan * scan);
10875+static void scan_done(flush_scan * scan);
10876+
10877+/* Flush-scan algorithm. */
10878+static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
10879+ unsigned limit);
10880+static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
10881+static int scan_common(flush_scan * scan, flush_scan * other);
10882+static int scan_formatted(flush_scan * scan);
10883+static int scan_unformatted(flush_scan * scan, flush_scan * other);
10884+static int scan_by_coord(flush_scan * scan);
10885+
10886+/* Initial flush-point ancestor allocation. */
10887+static int alloc_pos_and_ancestors(flush_pos_t * pos);
10888+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
10889+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
10890+
10891+/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */
10892+static int squalloc(flush_pos_t * pos);
10893+
10894+/* Flush squeeze implementation. */
10895+static int squeeze_right_non_twig(znode * left, znode * right);
10896+static int shift_one_internal_unit(znode * left, znode * right);
10897+
10898+/* Flush reverse parent-first relocation routines. */
10899+static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
10900+ const reiser4_block_nr * nblk);
10901+static int reverse_relocate_test(jnode * node, const coord_t * parent_coord,
10902+ flush_pos_t * pos);
10903+static int reverse_relocate_check_dirty_parent(jnode * node,
10904+ const coord_t * parent_coord,
10905+ flush_pos_t * pos);
10906+
10907+/* Flush allocate write-queueing functions: */
10908+static int allocate_znode(znode * node, const coord_t * parent_coord,
10909+ flush_pos_t * pos);
10910+static int allocate_znode_update(znode * node, const coord_t * parent_coord,
10911+ flush_pos_t * pos);
10912+static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
10913+
10914+/* Flush helper functions: */
10915+static int jnode_lock_parent_coord(jnode * node,
10916+ coord_t * coord,
10917+ lock_handle * parent_lh,
10918+ load_count * parent_zh,
10919+ znode_lock_mode mode, int try);
10920+static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
10921+ znode_lock_mode mode, int check_dirty, int expected);
10922+static int znode_same_parents(znode * a, znode * b);
10923+
10924+static int znode_check_flushprepped(znode * node)
10925+{
10926+ return jnode_check_flushprepped(ZJNODE(node));
10927+}
10928+
10929+/* Flush position functions */
10930+static void pos_init(flush_pos_t * pos);
10931+static int pos_valid(flush_pos_t * pos);
10932+static void pos_done(flush_pos_t * pos);
10933+static int pos_stop(flush_pos_t * pos);
10934+
10935+/* check that scan->node is the jnode of the first unit of the extent, if the extent is
10936+ * unallocated, because all jnodes of an unallocated extent are dirty and of the same atom. */
10937+#define checkchild(scan) \
10938+assert("nikita-3435", \
10939+ ergo(scan->direction == LEFT_SIDE && \
10940+ (scan->parent_coord.node->level == TWIG_LEVEL) && \
10941+ jnode_is_unformatted(scan->node) && \
10942+ extent_is_unallocated(&scan->parent_coord), \
10943+ extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
10944+
10945+/* This flush_cnt variable is used to track the number of concurrent flush operations,
10946+ useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has
10947+ no static initializer function...) */
10948+ON_DEBUG(atomic_t flush_cnt;
10949+ )
10950+
10951+/* check fs backing device for write congestion */
10952+static int check_write_congestion(void)
10953+{
10954+ struct super_block *sb;
10955+ struct backing_dev_info *bdi;
10956+
10957+ sb = reiser4_get_current_sb();
10958+ bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info;
10959+ return bdi_write_congested(bdi);
10960+}
10961+
10962+/* conditionally write flush queue */
10963+static int write_prepped_nodes(flush_pos_t * pos)
10964+{
10965+ int ret;
10966+
10967+ assert("zam-831", pos);
10968+ assert("zam-832", pos->fq);
10969+
10970+ if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
10971+ return 0;
10972+
10973+ if (check_write_congestion())
10974+ return 0;
10975+
10976+ ret = reiser4_write_fq(pos->fq, pos->nr_written,
10977+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
10978+ return ret;
10979+}
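/*
 * Note on the two early returns above: when the caller did not ask for
 * writeout (JNODE_FLUSH_WRITE_BLOCKS unset) or the backing device is
 * write-congested, prepped nodes simply stay in the flush queue for a
 * later writeout pass.
 */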
10980+
10981+/* Properly release all flush position resources, then move the flush position to the new
10982+   locked node */
10983+static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock,
10984+ load_count * new_load, const coord_t * new_coord)
10985+{
10986+ assert("zam-857", new_lock->node == new_load->node);
10987+
10988+ if (new_coord) {
10989+ assert("zam-858", new_coord->node == new_lock->node);
10990+ coord_dup(&pos->coord, new_coord);
10991+ } else {
10992+ coord_init_first_unit(&pos->coord, new_lock->node);
10993+ }
10994+
10995+ if (pos->child) {
10996+ jput(pos->child);
10997+ pos->child = NULL;
10998+ }
10999+
11000+ move_load_count(&pos->load, new_load);
11001+ done_lh(&pos->lock);
11002+ move_lh(&pos->lock, new_lock);
11003+}
11004+
11005+/* delete an empty node whose link from the parent still exists. */
11006+static int delete_empty_node(znode * node)
11007+{
11008+ reiser4_key smallest_removed;
11009+
11010+ assert("zam-1019", node != NULL);
11011+ assert("zam-1020", node_is_empty(node));
11012+ assert("zam-1023", znode_is_wlocked(node));
11013+
11014+ return reiser4_delete_node(node, &smallest_removed, NULL, 1);
11015+}
11016+
11017+/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
11018+static int prepare_flush_pos(flush_pos_t * pos, jnode * org)
11019+{
11020+ int ret;
11021+ load_count load;
11022+ lock_handle lock;
11023+
11024+ init_lh(&lock);
11025+ init_load_count(&load);
11026+
11027+ if (jnode_is_znode(org)) {
11028+ ret = longterm_lock_znode(&lock, JZNODE(org),
11029+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
11030+ if (ret)
11031+ return ret;
11032+
11033+ ret = incr_load_count_znode(&load, JZNODE(org));
11034+ if (ret)
11035+ return ret;
11036+
11037+ pos->state =
11038+ (jnode_get_level(org) ==
11039+ LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
11040+ move_flush_pos(pos, &lock, &load, NULL);
11041+ } else {
11042+ coord_t parent_coord;
11043+ ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
11044+ &load, ZNODE_WRITE_LOCK, 0);
11045+ if (ret)
11046+ goto done;
11047+ if (!item_is_extent(&parent_coord)) {
11048+			/* file was converted to tail format, org became HB (HEARD_BANSHEE),
11049+			   and we found an internal item */
11050+ ret = -EAGAIN;
11051+ goto done;
11052+ }
11053+
11054+ pos->state = POS_ON_EPOINT;
11055+ move_flush_pos(pos, &lock, &load, &parent_coord);
11056+ pos->child = jref(org);
11057+ if (extent_is_unallocated(&parent_coord)
11058+ && extent_unit_index(&parent_coord) != index_jnode(org)) {
11059+			/* @org is not the first child of its parent unit. This may happen
11060+			   because the long term lock of its parent node was released between
11061+			   scan_left and scan_right. For now, work around this by having flush repeat */
11062+ ret = -EAGAIN;
11063+ }
11064+ }
11065+
11066+ done:
11067+ done_load_count(&load);
11068+ done_lh(&lock);
11069+ return ret;
11070+}
11071+
11072+/* TODO LIST (no particular order): */
11073+/* I have labelled most of the legitimate FIXME comments in this file with letters to
11074+ indicate which issue they relate to. There are a few miscellaneous FIXMEs with
11075+ specific names mentioned instead that need to be inspected/resolved. */
11076+/* B. There is an issue described in reverse_relocate_test having to do with an
11077+ imprecise is_preceder? check in the presence of partially-dirty extents. The code that
11078+ sets preceder hints and computes the preceder is basically untested. Careful testing
11079+ needs to be done that preceder calculations are done correctly, since if it doesn't
11080+ affect correctness we will not catch this stuff during regular testing. */
11081+/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are
11082+ considered expected but unlikely conditions. Flush currently returns 0 (i.e., success
11083+ but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
11084+ Many of the calls that may produce one of these return values (i.e.,
11085+ longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
11086+ values themselves and, for instance, stop flushing instead of resulting in a restart.
11087+ If any of these results are true error conditions then flush will go into a busy-loop,
11088+ as we noticed during testing when a corrupt tree caused find_child_ptr to return
11089+ ENOENT. It needs careful thought and testing of corner conditions.
11090+*/
11091+/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created
11092+ block is assigned a block number then early-flushed to disk. It is dirtied again and
11093+ flush is called again. Concurrently, that block is deleted, and the de-allocation of
11094+ its block number does not need to be deferred, since it is not part of the preserve set
11095+ (i.e., it didn't exist before the transaction). I think there may be a race condition
11096+ where flush writes the dirty, created block after the non-deferred deallocated block
11097+ number is re-allocated, making it possible to write deleted data on top of non-deleted
11098+ data. It's just a theory, but it needs to be thought out. */
11099+/* F. bio_alloc() failure is not handled gracefully. */
11100+/* G. Unallocated children. */
11101+/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
11102+/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11103+
11104+/* JNODE_FLUSH: MAIN ENTRY POINT */
11105+/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
11106+ neighborhood is named "slum"). Jnode_flush() is called when reiser4 has to write dirty
11107+ blocks to disk; this happens when the Linux VM decides to reduce the number of dirty
11108+ pages or as a part of transaction commit.
11109+
11110+ Our objective here is to prep and flush the slum the jnode belongs to. We want to
11111+ squish the slum together, and allocate the nodes in it as we squish because allocation
11112+ of children affects squishing of parents.
11113+
11114+ The "argument" @node tells flush where to start. From there, flush finds the left edge
11115+ of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a
11116+ "better place" to start squalloc first we perform a flush_scan.
11117+
11118+ Flush-scanning may be performed in both left and right directions, but for different
11119+ purposes. When scanning to the left, we are searching for a node that precedes a
11120+ sequence of parent-first-ordered nodes which we will then flush in parent-first order.
11121+ During flush-scanning, we also take the opportunity to count the number of consecutive
11122+ leaf nodes. If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
11123+ make a decision to reallocate leaf nodes (thus favoring write-optimization).
11124+
11125+ Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
11126+ also be dirty nodes to the right of the argument. If the scan-left operation does not
11127+ count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
11128+ operation to see whether there are, in fact, enough nodes to meet the relocate
11129+ threshold. Each right- and left-scan operation uses a single flush_scan object.
11130+
11131+ After left-scan and possibly right-scan, we prepare a flush_position object with the
11132+ starting flush point or parent coordinate, which was determined using scan-left.
11133+
11134+ Next we call the main flush routine, squalloc, which iterates along the
11135+ leaf level, squeezing and allocating nodes (and placing them into the flush queue).
11136+
11137+ After squalloc returns we take extra steps to ensure that all the children
11138+ of the final twig node are allocated--this involves repeating squalloc
11139+ until we finish at a twig with no unallocated children.
11140+
11141+ Finally, we call flush_empty_queue to submit write-requests to disk. If we encounter
11142+ any above-twig nodes during flush_empty_queue that still have unallocated children, we
11143+ flush_unprep them.
11144+
11145+ Flush treats several "failure" cases as non-failures, essentially causing them to start
11146+ over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
11147+ probably be handled properly rather than restarting, but there are a bunch of cases to
11148+ audit.
11149+*/
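/*
 * The sequence described above, condensed into a sketch (read jnode_flush()
 * below for the real control flow):
 *
 *	scan_left(), possibly scan_right()  - find the slum edge, count nodes
 *	prepare_flush_pos()                 - lock the starting point
 *	alloc_pos_and_ancestors()           - set the preceder, allocate upward
 *	squalloc()                          - squeeze and allocate rightward
 *	flush queue writeout                - submit write requests to disk
 */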
11150+
11151+static int
11152+jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11153+ flush_queue_t * fq, int flags)
11154+{
11155+ long ret = 0;
11156+ flush_scan *right_scan;
11157+ flush_scan *left_scan;
11158+ flush_pos_t *flush_pos;
11159+ int todo;
11160+ struct super_block *sb;
11161+ reiser4_super_info_data *sbinfo;
11162+ jnode *leftmost_in_slum = NULL;
11163+
11164+ assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11165+ assert("nikita-3022", reiser4_schedulable());
11166+
11167+ assert("nikita-3185",
11168+ get_current_super_private()->delete_mutex_owner != current);
11169+
11170+ /* allocate right_scan, left_scan and flush_pos */
11171+ right_scan =
11172+ kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
11173+ reiser4_ctx_gfp_mask_get());
11174+ if (right_scan == NULL)
11175+ return RETERR(-ENOMEM);
11176+ left_scan = right_scan + 1;
11177+ flush_pos = (flush_pos_t *) (left_scan + 1);
11178+
11179+ sb = reiser4_get_current_sb();
11180+ sbinfo = get_super_private(sb);
11181+
11182+ /* Flush-concurrency debug code */
11183+#if REISER4_DEBUG
11184+ atomic_inc(&flush_cnt);
11185+#endif
11186+
11187+ reiser4_enter_flush(sb);
11188+
11189+ /* Initialize a flush position. */
11190+ pos_init(flush_pos);
11191+
11192+ flush_pos->nr_written = nr_written;
11193+ flush_pos->fq = fq;
11194+ flush_pos->flags = flags;
11195+ flush_pos->nr_to_write = nr_to_write;
11196+
11197+ scan_init(right_scan);
11198+ scan_init(left_scan);
11199+
11200+ /* First scan left and remember the leftmost scan position. If the leftmost
11201+	   position is unformatted, we remember its parent_coord. We scan until we have
11202+	   counted FLUSH_SCAN_MAXNODES nodes.
11203+
11204+ If starting @node is unformatted, at the beginning of left scan its
11205+ parent (twig level node, containing extent item) will be long term
11206+ locked and lock handle will be stored in the
11207+ @right_scan->parent_lock. This lock is used to start the rightward
11208+ scan without redoing the tree traversal (necessary to find parent)
11209+ and, hence, is kept during leftward scan. As a result, we have to
11210+ use try-lock when taking long term locks during the leftward scan.
11211+ */
11212+ ret = scan_left(left_scan, right_scan,
11213+ node, sbinfo->flush.scan_maxnodes);
11214+ if (ret != 0)
11215+ goto failed;
11216+
11217+ leftmost_in_slum = jref(left_scan->node);
11218+ scan_done(left_scan);
11219+
11220+ /* Then possibly go right to decide if we will use a policy of relocating leaves.
11221+ This is only done if we did not scan past (and count) enough nodes during the
11222+ leftward scan. If we do scan right, we only care to go far enough to establish
11223+ that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The
11224+ scan limit is the difference between left_scan.count and the threshold. */
11225+
11226+ todo = sbinfo->flush.relocate_threshold - left_scan->count;
11227+ /* scan right is inherently deadlock prone, because we are
11228+ * (potentially) holding a lock on the twig node at this moment.
11229+	 * FIXME: this comment is incorrect: the lock is not held */
11230+ if (todo > 0) {
11231+ ret = scan_right(right_scan, node, (unsigned)todo);
11232+ if (ret != 0)
11233+ goto failed;
11234+ }
11235+
11236+	/* Only the right-scan count is needed; release any rightward locks right away. */
11237+ scan_done(right_scan);
11238+
11239+ /* ... and the answer is: we should relocate leaf nodes if at least
11240+ FLUSH_RELOCATE_THRESHOLD nodes were found. */
11241+ flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11242+ (left_scan->count + right_scan->count >=
11243+ sbinfo->flush.relocate_threshold);
11244+
11245+	/* Funny business here. We set the 'point' in the flush_position prior to
11246+ starting squalloc regardless of whether the first point is
11247+ formatted or unformatted. Without this there would be an invariant, in the
11248+ rest of the code, that if the flush_position is unformatted then
11249+ flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
11250+ and if the flush_position is formatted then flush_position->point is non-NULL
11251+ and no parent info is set.
11252+
11253+ This seems lazy, but it makes the initial calls to reverse_relocate_test
11254+	   (which ask "is pos->point the leftmost child of its parent?") much easier
11255+ because we know the first child already. Nothing is broken by this, but the
11256+ reasoning is subtle. Holding an extra reference on a jnode during flush can
11257+ cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11258+ removed from sibling lists until they have zero reference count. Flush would
11259+ never observe a HEARD_BANSHEE node on the left-edge of flush, nodes are only
11260+ deleted to the right. So if nothing is broken, why fix it?
11261+
11262+ NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
11263+ point and in any moment, because of the concurrent file system
11264+ activity (for example, truncate). */
11265+
11266+ /* Check the jnode state after flush_scan has completed. Having a lock on this
11267+ node or its parent (in the case of an unformatted node) helps us in case of
11268+ concurrent flushing. */
11269+ if (jnode_check_flushprepped(leftmost_in_slum)
11270+ && !jnode_convertible(leftmost_in_slum)) {
11271+ ret = 0;
11272+ goto failed;
11273+ }
11274+
11275+ /* Now setup flush_pos using scan_left's endpoint. */
11276+ ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11277+ if (ret)
11278+ goto failed;
11279+
11280+ if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11281+ && node_is_empty(flush_pos->coord.node)) {
11282+ znode *empty = flush_pos->coord.node;
11283+
11284+ assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11285+ ret = delete_empty_node(empty);
11286+ goto failed;
11287+ }
11288+
11289+ if (jnode_check_flushprepped(leftmost_in_slum)
11290+ && !jnode_convertible(leftmost_in_slum)) {
11291+ ret = 0;
11292+ goto failed;
11293+ }
11294+
11295+ /* Set pos->preceder and (re)allocate pos and its ancestors if needed */
11296+ ret = alloc_pos_and_ancestors(flush_pos);
11297+ if (ret)
11298+ goto failed;
11299+
11300+ /* Do the main rightward-bottom-up squeeze and allocate loop. */
11301+ ret = squalloc(flush_pos);
11302+ pos_stop(flush_pos);
11303+ if (ret)
11304+ goto failed;
11305+
11306+ /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
11307+ First, the pos_stop() and pos_valid() routines should be modified
11308+ so that pos_stop() sets a flush_position->stop flag to 1 without
11309+ releasing the current position immediately--instead release it in
11310+ pos_done(). This is a better implementation than the current one anyway.
11311+
11312+ It is not clear that all fields of the flush_position should not be released,
11313+ but at the very least the parent_lock, parent_coord, and parent_load should
11314+ remain held because they hold the last twig when pos_stop() is
11315+ called.
11316+
11317+ When we reach this point in the code, if the parent_coord is set to after the
11318+ last item then we know that flush reached the end of a twig (and according to
11319+ the new flush queueing design, we will return now). If parent_coord is not
11320+ past the last item, we should check if the current twig has any unallocated
11321+ children to the right (we are not concerned with unallocated children to the
11322+ left--in that case the twig itself should not have been allocated). If the
11323+ twig has unallocated children to the right, set the parent_coord to that
11324+ position and then repeat the call to squalloc.
11325+
11326+ Testing for unallocated children may be defined in two ways: if any internal
11327+ item has a fake block number, it is unallocated; if any extent item is
11328+ unallocated then all of its children are unallocated. But there is a more
11329+ aggressive approach: if there are any dirty children of the twig to the right
11330+ of the current position, we may wish to relocate those nodes now. Checking for
11331+ potential relocation is more expensive as it requires knowing whether there are
11332+ any dirty children that are not unallocated. The extent_needs_allocation
11333+ should be used after setting the correct preceder.
11334+
11335+ When we reach the end of a twig at this point in the code, if the flush can
11336+ continue (when the queue is ready) it will need some information on the future
11337+ starting point. That should be stored away in the flush_handle using a seal, I
11338+ believe. Holding a jref() on the future starting point may break other code
11339+ that deletes that node.
11340+ */
11341+
11342+ /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
11343+ above the twig level. If the VM calls flush above the twig level, do nothing
11344+ and return (but figure out why this happens). The txnmgr should be modified to
11345+ only flush its leaf-level dirty list. This will do all the necessary squeeze
11346+ and allocate steps but leave unallocated branches and possibly unallocated
11347+ twigs (when the twig's leftmost child is not dirty). After flushing the leaf
11348+ level, the remaining unallocated nodes should be given write-optimized
11349+ locations. (Possibly, the remaining unallocated twigs should be allocated just
11350+ before their leftmost child.)
11351+ */
11352+
11353+ /* Any failure reaches this point. */
11354+ failed:
11355+
11356+ switch (ret) {
11357+ case -E_REPEAT:
11358+ case -EINVAL:
11359+ case -E_DEADLOCK:
11360+ case -E_NO_NEIGHBOR:
11361+ case -ENOENT:
11362+ /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
11363+ in each case. They already are handled in many cases. */
11364+ /* Something bad happened, but difficult to avoid... Try again! */
11365+ ret = 0;
11366+ }
11367+
11368+ if (leftmost_in_slum)
11369+ jput(leftmost_in_slum);
11370+
11371+ pos_done(flush_pos);
11372+ scan_done(left_scan);
11373+ scan_done(right_scan);
11374+ kfree(right_scan);
11375+
11376+ ON_DEBUG(atomic_dec(&flush_cnt));
11377+
11378+ reiser4_leave_flush(sb);
11379+
11380+ return ret;
11381+}
11382+
11383+/* The reiser4 flush subsystem can be switched into "rapid flush mode", which
11384+ * means that the flusher should submit all prepped nodes immediately, without
11385+ * keeping them in flush queues for a long time. The reason for rapid flush
11386+ * mode is to free memory as fast as possible. */
11387+
11388+#if REISER4_USE_RAPID_FLUSH
11389+
11390+/**
11391+ * submit all prepped nodes if rapid flush mode is set,
11392+ * turn rapid flush mode off.
11393+ */
11394+
11395+static int rapid_flush(flush_pos_t * pos)
11396+{
11397+ if (!wbq_available())
11398+ return 0;
11399+
11400+ return write_prepped_nodes(pos);
11401+}
11402+
11403+#else
11404+
11405+#define rapid_flush(pos) (0)
11406+
11407+#endif /* REISER4_USE_RAPID_FLUSH */
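/* Since rapid_flush() degenerates to a no-op returning 0 when the mode is
 * off (or compiled out), callers can invoke it unconditionally after each
 * position advance; the pattern used later in this file is:
 *
 *	ret = rapid_flush(pos);
 *	if (ret)
 *		break;
 */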
11408+
11409+static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom,
11410+ flush_queue_t *fq, int *nr_queued,
11411+ int flags)
11412+{
11413+ jnode * node;
11414+
11415+ if (start != NULL) {
11416+ spin_lock_jnode(start);
11417+ if (!jnode_is_flushprepped(start)) {
11418+ assert("zam-1056", start->atom == atom);
11419+ node = start;
11420+ goto enter;
11421+ }
11422+ spin_unlock_jnode(start);
11423+ }
11424+ /*
11425+ * In this loop we process all nodes that were already prepped (RELOC or
11426+ * OVRWR) and then dirtied again. The atom spin lock is not released until
11427+ * all dirty nodes are processed or a not-yet-prepped node is found in the
11428+ * atom's dirty lists.
11429+ */
11429+ while ((node = find_first_dirty_jnode(atom, flags))) {
11430+ spin_lock_jnode(node);
11431+ enter:
11432+ assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11433+ assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11434+
11435+ if (JF_ISSET(node, JNODE_WRITEBACK)) {
11436+ /* move node to the end of atom's writeback list */
11437+ list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11438+
11439+ /*
11440+ * the jnode is not necessarily on the dirty list: if it was dirtied while
11441+ * it was on a flush queue, it does not get moved to the dirty list
11442+ */
11443+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11444+ WB_LIST, 1));
11445+
11446+ } else if (jnode_is_znode(node)
11447+ && znode_above_root(JZNODE(node))) {
11448+ /*
11449+ * A special case for znode-above-root. The above-root (fake)
11450+ * znode is captured and dirtied when the tree height changes or
11451+ * when the root node is relocated. This causes atoms to fuse so
11452+ * that changes at the root are serialized. However, this node is
11453+ * never flushed. This special case used to be in lock.c to
11454+ * prevent the above-root node from ever being captured, but now
11455+ * that it is captured we simply prevent it from flushing. The
11456+ * log-writer code relies on this to properly log superblock
11457+ * modifications of the tree height.
11458+ */
11459+ jnode_make_wander_nolock(node);
11460+ } else if (JF_ISSET(node, JNODE_RELOC)) {
11461+ queue_jnode(fq, node);
11462+ ++(*nr_queued);
11463+ } else
11464+ break;
11465+
11466+ spin_unlock_jnode(node);
11467+ }
11468+ return node;
11469+}
11470+
11471+/* Flush some nodes of the current atom, usually a slum. Return -E_REPEAT if there are
11472+ * more nodes to flush; return 0 if the atom's dirty lists are empty, keeping the current
11473+ * atom locked; return other errors as they are. */
11474+int
11475+flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11476+ txn_atom ** atom, jnode *start)
11477+{
11478+ reiser4_super_info_data *sinfo = get_current_super_private();
11479+ flush_queue_t *fq = NULL;
11480+ jnode *node;
11481+ int nr_queued;
11482+ int ret;
11483+
11484+ assert("zam-889", atom != NULL && *atom != NULL);
11485+ assert_spin_locked(&((*atom)->alock));
11486+ assert("zam-892", get_current_context()->trans->atom == *atom);
11487+
11488+ nr_to_write = LONG_MAX;
11489+ while (1) {
11490+ ret = reiser4_fq_by_atom(*atom, &fq);
11491+ if (ret != -E_REPEAT)
11492+ break;
11493+ *atom = get_current_atom_locked();
11494+ }
11495+ if (ret)
11496+ return ret;
11497+
11498+ assert_spin_locked(&((*atom)->alock));
11499+
11500+ /* parallel flushers limit */
11501+ if (sinfo->tmgr.atom_max_flushers != 0) {
11502+ while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
11503+ /* A reiser4_atom_send_event() call is inside
11504+ reiser4_fq_put_nolock() which is called when flush is
11505+ finished and nr_flushers is decremented. */
11506+ reiser4_atom_wait_event(*atom);
11507+ *atom = get_current_atom_locked();
11508+ }
11509+ }
11510+
11511+ /* count ourselves as a flusher */
11512+ (*atom)->nr_flushers++;
11513+
11514+ writeout_mode_enable();
11515+
11516+ nr_queued = 0;
11517+ node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
11518+
11519+ if (node == NULL) {
11520+ if (nr_queued == 0) {
11521+ (*atom)->nr_flushers--;
11522+ reiser4_fq_put_nolock(fq);
11523+ reiser4_atom_send_event(*atom);
11524+ /* current atom remains locked */
11525+ writeout_mode_disable();
11526+ return 0;
11527+ }
11528+ spin_unlock_atom(*atom);
11529+ } else {
11530+ jref(node);
11531+ BUG_ON((*atom)->super != node->tree->super);
11532+ spin_unlock_atom(*atom);
11533+ spin_unlock_jnode(node);
11534+ BUG_ON(nr_to_write == 0);
11535+ ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
11536+ jput(node);
11537+ }
11538+
11539+ ret =
11540+ reiser4_write_fq(fq, nr_submitted,
11541+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11542+
11543+ *atom = get_current_atom_locked();
11544+ (*atom)->nr_flushers--;
11545+ reiser4_fq_put_nolock(fq);
11546+ reiser4_atom_send_event(*atom);
11547+ spin_unlock_atom(*atom);
11548+
11549+ writeout_mode_disable();
11550+
11551+ if (ret == 0)
11552+ ret = -E_REPEAT;
11553+
11554+ return ret;
11555+}
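/* A hypothetical caller sketch (not part of this patch) illustrating the
 * contract above; the function name and the 0 flags value are illustrative
 * only. -E_REPEAT asks for another pass; 0 means the dirty lists are
 * drained and the atom is handed back still spin-locked. */
static int flush_atom_until_clean_sketch(jnode *start)
{
	txn_atom *atom;
	long nr_submitted;
	int ret;

	do {
		nr_submitted = 0;
		/* flush_current_atom() expects *atom locked on entry */
		atom = get_current_atom_locked();
		ret = flush_current_atom(0 /* flags */, LONG_MAX,
					 &nr_submitted, &atom, start);
	} while (ret == -E_REPEAT);
	if (ret == 0)
		spin_unlock_atom(atom);	/* 0 keeps *atom locked */
	return ret;
}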
11556+
11557+/* REVERSE PARENT-FIRST RELOCATION POLICIES */
11558+
11559+/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
11560+ reverse parent-first relocate context. Here all we know is the preceder and the block
11561+ number. Since we are going in reverse, the preceder may still be relocated as well, so
11562+ we can't ask the block allocator "is there a closer block available to relocate?" here.
11563+ In the _forward_ parent-first relocate context (not here) we actually call the block
11564+ allocator to try and find a closer location. */
11565+static int
11566+reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11567+ const reiser4_block_nr * nblk)
11568+{
11569+ reiser4_block_nr dist;
11570+
11571+ assert("jmacd-7710", *pblk != 0 && *nblk != 0);
11572+ assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
11573+ assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
11574+
11575+ /* Distance is the absolute value. */
11576+ dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
11577+
11578+ /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder
11579+ block, do not relocate. */
11580+ if (dist <= get_current_super_private()->flush.relocate_distance) {
11581+ return 0;
11582+ }
11583+
11584+ return 1;
11585+}
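/* A minimal usage sketch with hypothetical block numbers, assuming the
 * relocate_distance setting is at its usual default of 64: a block 40
 * blocks past its preceder stays put, one 1000 blocks away is asked to
 * relocate. */
static void relocate_distance_sketch(void)
{
	reiser4_block_nr pblk = 1000;
	reiser4_block_nr near_blk = 1040;	/* dist 40 <= 64: stay put */
	reiser4_block_nr far_blk = 2000;	/* dist 1000 > 64: relocate */

	assert("sketch-1", reverse_relocate_if_close_enough(&pblk, &near_blk) == 0);
	assert("sketch-2", reverse_relocate_if_close_enough(&pblk, &far_blk) == 1);
}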
11586+
11587+/* This function is a predicate that tests for relocation. Always called in the
11588+ reverse-parent-first context, when we are asking whether the current node should be
11589+ relocated in order to expand the flush by dirtying the parent level (and thus
11590+ proceeding to flush that level). When traversing in the forward parent-first direction
11591+ (not here), relocation decisions are handled in two places: allocate_znode() and
11592+ extent_needs_allocation(). */
11593+static int
11594+reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11595+ flush_pos_t * pos)
11596+{
11597+ reiser4_block_nr pblk = 0;
11598+ reiser4_block_nr nblk = 0;
11599+
11600+ assert("jmacd-8989", !jnode_is_root(node));
11601+
11602+ /*
11603+ * This function is called only from the
11604+ * reverse_relocate_check_dirty_parent() and only if the parent
11605+ * node is clean. This implies that the parent has the real (i.e., not
11606+ * fake) block number, and so does the child, because otherwise the
11607+ * parent would be dirty.
11608+ */
11609+
11610+ /* New nodes are treated as if they are being relocated. */
11611+ if (JF_ISSET (node, JNODE_CREATED) ||
11612+ (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
11613+ return 1;
11614+ }
11615+
11616+ /* Find the preceder. FIXME(B): When the child is an unformatted, previously
11617+ existing node, the coord may be leftmost even though the child is not the
11618+ parent-first preceder of the parent. If the first dirty node appears somewhere
11619+ in the middle of the first extent unit, this preceder calculation is wrong.
11620+ Needs more logic in here. */
11621+ if (coord_is_leftmost_unit(parent_coord)) {
11622+ pblk = *znode_get_block(parent_coord->node);
11623+ } else {
11624+ pblk = pos->preceder.blk;
11625+ }
11626+ check_preceder(pblk);
11627+
11628+ /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
11629+ if (pblk == 0) {
11630+ return 1;
11631+ }
11632+
11633+ nblk = *jnode_get_block(node);
11634+
11635+ if (reiser4_blocknr_is_fake(&nblk))
11636+ /* child is unallocated, mark parent dirty */
11637+ return 1;
11638+
11639+ return reverse_relocate_if_close_enough(&pblk, &nblk);
11640+}
11641+
11642+/* This function calls reverse_relocate_test to make a reverse-parent-first
11643+ relocation decision and then, if the answer is yes, marks the parent dirty. */
11644+static int
11645+reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord,
11646+ flush_pos_t * pos)
11647+{
11648+ int ret;
11649+
11650+ if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
11651+
11652+ ret = reverse_relocate_test(node, parent_coord, pos);
11653+ if (ret < 0) {
11654+ return ret;
11655+ }
11656+
11657+ /* FIXME-ZAM
11658+ if parent is already relocated - we do not want to grab space, right? */
11659+ if (ret == 1) {
11660+ int grabbed;
11661+
11662+ grabbed = get_current_context()->grabbed_blocks;
11663+ if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
11664+ 0)
11665+ reiser4_panic("umka-1250",
11666+ "No space left during flush.");
11667+
11668+ assert("jmacd-18923",
11669+ znode_is_write_locked(parent_coord->node));
11670+ znode_make_dirty(parent_coord->node);
11671+ grabbed2free_mark(grabbed);
11672+ }
11673+ }
11674+
11675+ return 0;
11676+}
11677+
11678+/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
11679+ PARENT-FIRST LOOP BEGINS) */
11680+
11681+/* Get the leftmost child for given coord. */
11682+static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child)
11683+{
11684+ int ret;
11685+
11686+ ret = item_utmost_child(coord, LEFT_SIDE, child);
11687+
11688+ if (ret)
11689+ return ret;
11690+
11691+ if (IS_ERR(*child))
11692+ return PTR_ERR(*child);
11693+
11694+ return 0;
11695+}
11696+
11697+/* This step occurs after the left- and right-scans are completed, before starting the
11698+ forward parent-first traversal. Here we attempt to allocate ancestors of the starting
11699+ flush point, which means continuing in the reverse parent-first direction to the
11700+ parent, grandparent, and so on (as long as the child is a leftmost child). This
11701+ routine calls a recursive process, alloc_one_ancestor, which does the real work,
11702+ except there is special-case handling here for the first ancestor, which may be a twig.
11703+ At each level (here and alloc_one_ancestor), we check for relocation and then, if
11704+ the child is a leftmost child, repeat at the next level. On the way back down (the
11705+ recursion), we allocate the ancestors in parent-first order. */
11706+static int alloc_pos_and_ancestors(flush_pos_t * pos)
11707+{
11708+ int ret = 0;
11709+ lock_handle plock;
11710+ load_count pload;
11711+ coord_t pcoord;
11712+
11713+ if (znode_check_flushprepped(pos->lock.node))
11714+ return 0;
11715+
11716+ coord_init_invalid(&pcoord, NULL);
11717+ init_lh(&plock);
11718+ init_load_count(&pload);
11719+
11720+ if (pos->state == POS_ON_EPOINT) {
11721+ /* a special case for pos on twig level, where we already have
11722+ a lock on parent node. */
11723+ /* The parent may not be dirty, in which case we should decide
11724+ whether to relocate the child now. If decision is made to
11725+ relocate the child, the parent is marked dirty. */
11726+ ret =
11727+ reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
11728+ pos);
11729+ if (ret)
11730+ goto exit;
11731+
11732+ /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
11733+ is leftmost) and the leaf/child, so recursion is not needed.
11734+ Levels above the twig will be allocated for
11735+ write-optimization before the transaction commits. */
11736+
11737+ /* Do the recursive step, allocating zero or more of our
11738+ * ancestors. */
11739+ ret = alloc_one_ancestor(&pos->coord, pos);
11740+
11741+ } else {
11742+ if (!znode_is_root(pos->lock.node)) {
11743+ /* all formatted nodes except tree root */
11744+ ret =
11745+ reiser4_get_parent(&plock, pos->lock.node,
11746+ ZNODE_WRITE_LOCK);
11747+ if (ret)
11748+ goto exit;
11749+
11750+ ret = incr_load_count_znode(&pload, plock.node);
11751+ if (ret)
11752+ goto exit;
11753+
11754+ ret =
11755+ find_child_ptr(plock.node, pos->lock.node, &pcoord);
11756+ if (ret)
11757+ goto exit;
11758+
11759+ ret =
11760+ reverse_relocate_check_dirty_parent(ZJNODE
11761+ (pos->lock.
11762+ node), &pcoord,
11763+ pos);
11764+ if (ret)
11765+ goto exit;
11766+
11767+ ret = alloc_one_ancestor(&pcoord, pos);
11768+ if (ret)
11769+ goto exit;
11770+ }
11771+
11772+ ret = allocate_znode(pos->lock.node, &pcoord, pos);
11773+ }
11774+ exit:
11775+ done_load_count(&pload);
11776+ done_lh(&plock);
11777+ return ret;
11778+}
11779+
11780+/* This is the recursive step described in alloc_pos_and_ancestors, above. Ignoring the
11781+ call to set_preceder, which is the next function described, this checks if the
11782+ child is a leftmost child and returns if it is not. If the child is a leftmost child
11783+ it checks for relocation, possibly dirtying the parent. Then it performs the recursive
11784+ step. */
11785+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
11786+{
11787+ int ret = 0;
11788+ lock_handle alock;
11789+ load_count aload;
11790+ coord_t acoord;
11791+
11792+ /* As we ascend at the left-edge of the region to flush, take this opportunity at
11793+ the twig level to find our parent-first preceder unless we have already set
11794+ it. */
11795+ if (pos->preceder.blk == 0) {
11796+ ret = set_preceder(coord, pos);
11797+ if (ret != 0)
11798+ return ret;
11799+ }
11800+
11801+ /* If the ancestor is clean or already allocated, or if the child is not a
11802+ leftmost child, stop going up, even leaving coord->node not flushprepped. */
11803+ if (znode_check_flushprepped(coord->node)
11804+ || !coord_is_leftmost_unit(coord))
11805+ return 0;
11806+
11807+ init_lh(&alock);
11808+ init_load_count(&aload);
11809+ coord_init_invalid(&acoord, NULL);
11810+
11811+ /* Only ascend to the next level if it is a leftmost child, but write-lock the
11812+ parent in case we will relocate the child. */
11813+ if (!znode_is_root(coord->node)) {
11814+
11815+ ret =
11816+ jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
11817+ &alock, &aload, ZNODE_WRITE_LOCK,
11818+ 0);
11819+ if (ret != 0) {
11820+ /* FIXME(C): check EINVAL, E_DEADLOCK */
11821+ goto exit;
11822+ }
11823+
11824+ ret =
11825+ reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
11826+ &acoord, pos);
11827+ if (ret != 0) {
11828+ goto exit;
11829+ }
11830+
11831+ /* Recursive call. */
11832+ if (!znode_check_flushprepped(acoord.node)) {
11833+ ret = alloc_one_ancestor(&acoord, pos);
11834+ if (ret)
11835+ goto exit;
11836+ }
11837+ }
11838+
11839+ /* Note: we call allocate with the parent write-locked (except at the root) in
11840+ case we relocate the child, in which case it will modify the parent during this
11841+ call. */
11842+ ret = allocate_znode(coord->node, &acoord, pos);
11843+
11844+ exit:
11845+ done_load_count(&aload);
11846+ done_lh(&alock);
11847+ return ret;
11848+}
11849+
11850+/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
11851+ a call to this function at the twig level. During alloc_pos_and_ancestors we may ask:
11852+ should this node be relocated (in the reverse parent-first context)? We repeat this
11853+ process as long as the child is the leftmost child, eventually reaching an ancestor of
11854+ the flush point that is not a leftmost child. The parent-first preceder of that
11855+ ancestor is actually on the leaf level: it is the left neighbor of the flush point,
11856+ which in turn is the rightmost child of the twig on the left. So, when
11857+ alloc_pos_and_ancestors passes upward through the twig level, it stops momentarily
11858+ to remember the block of the rightmost child of the twig on the left and sets it
11859+ as the flush_position's preceder hint.
11860+
11861+ There is one other place where we may set the flush_position's preceder hint, which is
11862+ during scan-left.
11863+*/
11864+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos)
11865+{
11866+ int ret;
11867+ coord_t coord;
11868+ lock_handle left_lock;
11869+ load_count left_load;
11870+
11871+ coord_dup(&coord, coord_in);
11872+
11873+ init_lh(&left_lock);
11874+ init_load_count(&left_load);
11875+
11876+ /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
11877+ coord_is_leftmost_unit is not the right test if the unformatted child is in the
11878+ middle of the first extent unit. */
11879+ if (!coord_is_leftmost_unit(&coord)) {
11880+ coord_prev_unit(&coord);
11881+ } else {
11882+ ret =
11883+ reiser4_get_left_neighbor(&left_lock, coord.node,
11884+ ZNODE_READ_LOCK, GN_SAME_ATOM);
11885+ if (ret) {
11886+ /* If we fail for any reason it doesn't matter because the
11887+ preceder is only a hint. We are low-priority at this point, so
11888+ this must be the case. */
11889+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
11890+ ret == -ENOENT || ret == -EINVAL
11891+ || ret == -E_DEADLOCK) {
11892+ ret = 0;
11893+ }
11894+ goto exit;
11895+ }
11896+
11897+ ret = incr_load_count_znode(&left_load, left_lock.node);
11898+ if (ret)
11899+ goto exit;
11900+
11901+ coord_init_last_unit(&coord, left_lock.node);
11902+ }
11903+
11904+ ret =
11905+ item_utmost_child_real_block(&coord, RIGHT_SIDE,
11906+ &pos->preceder.blk);
11907+ exit:
11908+ check_preceder(pos->preceder.blk);
11909+ done_load_count(&left_load);
11910+ done_lh(&left_lock);
11911+ return ret;
11912+}
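/* A small picture of the preceder hunt described above (a sketch with a
 * hypothetical three-level tree; the flush point is leaf d):
 *
 *	             root
 *	            /    \
 *	       twigL      twigR
 *	      /  |  \     /   \
 *	     a   b   c   d     e	<- leaf level
 *
 * d is the leftmost child of twigR, so the reverse parent-first walk
 * ascends; at the twig level set_preceder() looks left, finds twigL, and
 * records the block number of its rightmost child c (the parent-first
 * preceder of d) as the preceder hint. */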
11913+
11914+/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
11915+
11916+/* This procedure implements the outer loop of the flush algorithm. To put this in
11917+ context, here is the general list of steps taken by the flush routine as a whole:
11918+
11919+ 1. Scan-left
11920+ 2. Scan-right (maybe)
11921+ 3. Allocate initial flush position and its ancestors
11922+ 4. <handle extents>
11923+ 5. <squeeze and allocate the next position and its ancestors to-the-right,
11924+ then update position to-the-right>
11925+ 6. <repeat from #4 until flush is stopped>
11926+
11927+ This procedure implements the loop in steps 4 through 6 in the above listing.
11928+
11929+ Step 4: if the current flush position is an extent item (position on the twig level),
11930+ it allocates the extent (allocate_extent_item_in_place) then shifts to the next
11931+ coordinate. If the next coordinate's leftmost child needs flushprep, we will continue.
11932+ If the next coordinate is an internal item, we descend back to the leaf level,
11933+ otherwise we repeat a step #4 (labeled ALLOC_EXTENTS below). If the "next coordinate"
11934+ brings us past the end of the twig level, then we call
11935+ reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
11936+ step #5 which moves to the right.
11937+
11938+ Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
11939+ tree to allocate any ancestors of the next-right flush position that are not also
11940+ ancestors of the current position. Those ancestors (in top-down order) are the next in
11941+ parent-first order. We squeeze adjacent nodes on the way up until the right node and
11942+ current node share the same parent, then allocate on the way back down. Finally, this
11943+ step sets the flush position to the next-right node. Then repeat steps 4 and 5.
11944+*/
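/* A rough mapping of the steps above onto the routines that follow (a
 * sketch; the real dispatch is the flush_pos_handlers table near the end
 * of this file):
 *
 *	step 4 -> handle_pos_on_twig(), one reiser4_alloc_extent() per unit
 *	step 4, at the end of a twig -> handle_pos_end_of_twig()
 *	step 5 -> check_parents_and_squalloc_upper_levels() followed by
 *	          lock_parent_and_allocate_znode()
 *	step 6 -> the while (pos_valid(pos)) loop in squalloc()
 */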
11945+
11946+/* SQUEEZE CODE */
11947+
11948+/* squalloc_right_twig helper function: cut a range of extent items from
11949+ node @to->node, from the beginning up to coord @to. */
11950+static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key,
11951+ znode * left)
11952+{
11953+ coord_t from;
11954+ reiser4_key from_key;
11955+
11956+ coord_init_first_unit(&from, to->node);
11957+ item_key_by_coord(&from, &from_key);
11958+
11959+ return cut_node_content(&from, to, &from_key, to_key, NULL);
11960+}
11961+
11962+/* Copy as many of the leading extents as possible from @right to @left,
11963+ allocating unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL
11964+ SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
11965+ internal item it calls shift_one_internal_unit and may then return
11966+ SUBTREE_MOVED. */
11967+static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
11968+{
11969+ int ret = SUBTREE_MOVED;
11970+ coord_t coord; /* used to iterate over items */
11971+ reiser4_key stop_key;
11972+
11973+ assert("jmacd-2008", !node_is_empty(right));
11974+ coord_init_first_unit(&coord, right);
11975+
11976+ /* FIXME: can be optimized to cut once */
11977+ while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
11978+ ON_DEBUG(void *vp);
11979+
11980+ assert("vs-1468", coord_is_leftmost_unit(&coord));
11981+ ON_DEBUG(vp = shift_check_prepare(left, coord.node));
11982+
11983+ /* stop_key is used to find what was copied and what to cut */
11984+ stop_key = *reiser4_min_key();
11985+ ret = squalloc_extent(left, &coord, pos, &stop_key);
11986+ if (ret != SQUEEZE_CONTINUE) {
11987+ ON_DEBUG(kfree(vp));
11988+ break;
11989+ }
11990+ assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
11991+
11992+ /* Helper function to do the cutting. */
11993+ set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
11994+ check_me("vs-1466",
11995+ squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
11996+
11997+ ON_DEBUG(shift_check(vp, left, coord.node));
11998+ }
11999+
12000+ if (node_is_empty(coord.node))
12001+ ret = SQUEEZE_SOURCE_EMPTY;
12002+
12003+ if (ret == SQUEEZE_TARGET_FULL) {
12004+ goto out;
12005+ }
12006+
12007+ if (node_is_empty(right)) {
12008+ /* The whole right node was copied into @left. */
12009+ assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
12010+ goto out;
12011+ }
12012+
12013+ coord_init_first_unit(&coord, right);
12014+
12015+ if (!item_is_internal(&coord)) {
12016+ /* we do not want to squeeze anything else into the left neighbor because
12017+ the "slum" is over */
12018+ ret = SQUEEZE_TARGET_FULL;
12019+ goto out;
12020+ }
12021+ assert("jmacd-433", item_is_internal(&coord));
12022+
12023+ /* Shift an internal unit. The child must be allocated before shifting any more
12024+ extents, so we stop here. */
12025+ ret = shift_one_internal_unit(left, right);
12026+
12027+ out:
12028+ assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
12029+ || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
12030+
12031+ if (ret == SQUEEZE_TARGET_FULL) {
12032+ /* We submit prepped nodes here and expect that this @left twig
12033+ * will not be modified again during this jnode_flush() call. */
12034+ int ret1;
12035+
12036+ /* NOTE: it seems that I/O is done under long-term locks. */
12037+ ret1 = write_prepped_nodes(pos);
12038+ if (ret1 < 0)
12039+ return ret1;
12040+ }
12041+
12042+ return ret;
12043+}
12044+
12045+#if REISER4_DEBUG
12046+static void item_convert_invariant(flush_pos_t * pos)
12047+{
12048+ assert("edward-1225", coord_is_existing_item(&pos->coord));
12049+ if (chaining_data_present(pos)) {
12050+ item_plugin *iplug = item_convert_plug(pos);
12051+
12052+ assert("edward-1000",
12053+ iplug == item_plugin_by_coord(&pos->coord));
12054+ assert("edward-1001", iplug->f.convert != NULL);
12055+ } else
12056+ assert("edward-1226", pos->child == NULL);
12057+}
12058+#else
12059+
12060+#define item_convert_invariant(pos) noop
12061+
12062+#endif
12063+
12064+/* Scan the node's items starting from the first one and apply to each
12065+ item its flush ->convert() method (if any). This method may resize or
12066+ kill the item, so the tree will be changed.
12067+*/
12068+static int convert_node(flush_pos_t * pos, znode * node)
12069+{
12070+ int ret = 0;
12071+ item_plugin *iplug;
12072+
12073+ assert("edward-304", pos != NULL);
12074+ assert("edward-305", pos->child == NULL);
12075+ assert("edward-475", znode_convertible(node));
12076+ assert("edward-669", znode_is_wlocked(node));
12077+ assert("edward-1210", !node_is_empty(node));
12078+
12079+ if (znode_get_level(node) != LEAF_LEVEL)
12080+ /* unsupported */
12081+ goto exit;
12082+
12083+ coord_init_first_unit(&pos->coord, node);
12084+
12085+ while (1) {
12086+ ret = 0;
12087+ coord_set_to_left(&pos->coord);
12088+ item_convert_invariant(pos);
12089+
12090+ iplug = item_plugin_by_coord(&pos->coord);
12091+ assert("edward-844", iplug != NULL);
12092+
12093+ if (iplug->f.convert) {
12094+ ret = iplug->f.convert(pos);
12095+ if (ret)
12096+ goto exit;
12097+ }
12098+ assert("edward-307", pos->child == NULL);
12099+
12100+ if (coord_next_item(&pos->coord)) {
12101+ /* node is over */
12102+
12103+ if (!chaining_data_present(pos))
12104+ /* finished this node */
12105+ break;
12106+ if (should_chain_next_node(pos)) {
12107+ /* go to next node */
12108+ move_chaining_data(pos, 0 /* to next node */ );
12109+ break;
12110+ }
12111+ /* repeat this node */
12112+ move_chaining_data(pos, 1 /* this node */ );
12113+ continue;
12114+ }
12115+ /* The node is not over.
12116+ Check if there is attached convert data.
12117+ If so, roll one item position back and repeat
12118+ on this node.
12119+ */
12120+ if (chaining_data_present(pos)) {
12121+
12122+ if (iplug != item_plugin_by_coord(&pos->coord))
12123+ set_item_convert_count(pos, 0);
12124+
12125+ ret = coord_prev_item(&pos->coord);
12126+ assert("edward-1003", !ret);
12127+
12128+ move_chaining_data(pos, 1 /* this node */ );
12129+ }
12130+ }
12131+ JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
12132+ znode_make_dirty(node);
12133+ exit:
12134+ assert("edward-1004", !ret);
12135+ return ret;
12136+}
12137+
12138+/* Squeeze and allocate the right neighbor. This is called after @left and
12139+ its current children have been squeezed and allocated already. This
12140+ procedure's job is to squeeze and allocate items from @right to @left.
12141+
12142+ If at the leaf level, use the shift_everything_left memcpy-optimized
12143+ version of shifting (squeeze_right_leaf).
12144+
12145+ If at the twig level, extents are allocated as they are shifted from @right
12146+ to @left (squalloc_right_twig).
12147+
12148+ At any other level, shift one internal item and return to the caller
12149+ (squalloc_parent_first) so that the shifted-subtree can be processed in
12150+ parent-first order.
12151+
12152+ When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is
12153+ returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
12154+ returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
12155+ is returned.
12156+*/
12157+
12158+static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
12159+ znode * right)
12160+{
12161+ int ret;
12162+
12163+ /* FIXME: it is possible to see an empty hasn't-heard-banshee node in a
12164+ * tree owing to an error (for example, ENOSPC) in write */
12165+ /* assert("jmacd-9321", !node_is_empty(left)); */
12166+ assert("jmacd-9322", !node_is_empty(right));
12167+ assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12168+
12169+ switch (znode_get_level(left)) {
12170+ case TWIG_LEVEL:
12171+ /* Shift with extent allocating until either an internal item
12172+ is encountered or everything is shifted or no free space
12173+ left in @left */
12174+ ret = squeeze_right_twig(left, right, pos);
12175+ break;
12176+
12177+ default:
12178+ /* All other levels can use shift_everything until we implement per-item
12179+ flush plugins. */
12180+ ret = squeeze_right_non_twig(left, right);
12181+ break;
12182+ }
12183+
12184+ assert("jmacd-2011", (ret < 0 ||
12185+ ret == SQUEEZE_SOURCE_EMPTY
12186+ || ret == SQUEEZE_TARGET_FULL
12187+ || ret == SUBTREE_MOVED));
12188+ return ret;
12189+}
12190+
12191+static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
12192+ znode * right)
12193+{
12194+ int ret;
12195+
12196+ ret = squeeze_right_twig(pos->lock.node, right, pos);
12197+ if (ret < 0)
12198+ return ret;
12199+ if (ret > 0) {
12200+ coord_init_after_last_item(&pos->coord, pos->lock.node);
12201+ return ret;
12202+ }
12203+
12204+ coord_init_last_unit(&pos->coord, pos->lock.node);
12205+ return 0;
12206+}
12207+
12208+/* forward declaration */
12209+static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12210+
12211+/* do a fast check for "same parents" condition before calling
12212+ * squalloc_upper_levels() */
12213+static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
12214+ znode * left,
12215+ znode * right)
12216+{
12217+ if (znode_same_parents(left, right))
12218+ return 0;
12219+
12220+ return squalloc_upper_levels(pos, left, right);
12221+}
12222+
12223+/* Check whether the parent of the given @right node needs to be processed
12224+ ((re)allocated) prior to processing of the child. If @left and @right do not
12225+ share the same parent, then the parent of @right comes after @left but before
12226+ @right in parent-first order, and we have to (re)allocate it before @right
12227+ gets (re)allocated. */
12228+static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
12229+{
12230+ int ret;
12231+
12232+ lock_handle left_parent_lock;
12233+ lock_handle right_parent_lock;
12234+
12235+ load_count left_parent_load;
12236+ load_count right_parent_load;
12237+
12238+ init_lh(&left_parent_lock);
12239+ init_lh(&right_parent_lock);
12240+
12241+ init_load_count(&left_parent_load);
12242+ init_load_count(&right_parent_load);
12243+
12244+ ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12245+ if (ret)
12246+ goto out;
12247+
12248+ ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12249+ if (ret)
12250+ goto out;
12251+
12252+ /* Check for same parents */
12253+ if (left_parent_lock.node == right_parent_lock.node)
12254+ goto out;
12255+
12256+ if (znode_check_flushprepped(right_parent_lock.node)) {
12257+ /* Keep parent-first order. In that order, the right parent node stands
12258+ before the @right node. If it is already allocated, we set the
12259+ preceder (the start point of the next block search) to its block number;
12260+ the @right node should be allocated after it.
12261+
12262+ However, the preceder is set only if the right parent is on the twig level.
12263+ The explanation is the following: new branch nodes are allocated over
12264+ already allocated children while the tree grows, so it is difficult to
12265+ keep the tree ordered; we assume that only leaves and twigs are correctly
12266+ allocated. So, only twigs are used as a preceder for allocating the
12267+ rest of the slum. */
12268+ if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12269+ pos->preceder.blk =
12270+ *znode_get_block(right_parent_lock.node);
12271+ check_preceder(pos->preceder.blk);
12272+ }
12273+ goto out;
12274+ }
12275+
12276+ ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12277+ if (ret)
12278+ goto out;
12279+
12280+ ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12281+ if (ret)
12282+ goto out;
12283+
12284+ ret =
12285+ squeeze_right_neighbor(pos, left_parent_lock.node,
12286+ right_parent_lock.node);
12287+ /* We stop on error. We also stop if some items/units were shifted (ret == 0):
12288+ * @right then changed its parent, which means we no longer have to process
12289+ * the right_parent node prior to processing @right. Positive return
12290+ * values say that no items were shifted, because of the "empty
12291+ * source" or "target full" conditions. */
12292+ if (ret <= 0)
12293+ goto out;
12294+
12295+ /* parent(@left) and parent(@right) may themselves have different parents.
12296+ * We do a recursive call to check for that. */
12297+ ret =
12298+ check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12299+ right_parent_lock.node);
12300+ if (ret)
12301+ goto out;
12302+
12303+ /* allocate znode when going down */
12304+ ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12305+
12306+ out:
12307+ done_load_count(&left_parent_load);
12308+ done_load_count(&right_parent_load);
12309+
12310+ done_lh(&left_parent_lock);
12311+ done_lh(&right_parent_lock);
12312+
12313+ return ret;
12314+}
12315+
12316+/* Check the leftmost child's "flushprepped" status; also return true if the
12317+ * child node was not found in cache. */
12318+static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
12319+{
12320+ int ret;
12321+ int prepped;
12322+
12323+ jnode *child;
12324+
12325+ ret = get_leftmost_child_of_unit(coord, &child);
12326+
12327+ if (ret)
12328+ return ret;
12329+
12330+ if (child) {
12331+ prepped = jnode_check_flushprepped(child);
12332+ jput(child);
12333+ } else {
12334+ /* We treat a non-existent child as a node to which slum
12335+ processing should not continue. A node that is not cached is
12336+ clean, so it is flushprepped. */
12337+ prepped = 1;
12338+ }
12339+
12340+ return prepped;
12341+}
12342+
12343+/* (re)allocate znode with automated getting parent node */
12344+static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
12345+{
12346+ int ret;
12347+ lock_handle parent_lock;
12348+ load_count parent_load;
12349+ coord_t pcoord;
12350+
12351+ assert("zam-851", znode_is_write_locked(node));
12352+
12353+ init_lh(&parent_lock);
12354+ init_load_count(&parent_load);
12355+
12356+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12357+ if (ret)
12358+ goto out;
12359+
12360+ ret = incr_load_count_znode(&parent_load, parent_lock.node);
12361+ if (ret)
12362+ goto out;
12363+
12364+ ret = find_child_ptr(parent_lock.node, node, &pcoord);
12365+ if (ret)
12366+ goto out;
12367+
12368+ ret = allocate_znode(node, &pcoord, pos);
12369+
12370+ out:
12371+ done_load_count(&parent_load);
12372+ done_lh(&parent_lock);
12373+ return ret;
12374+}
12375+
12376+/* Process nodes on the leaf level until an unformatted node or the rightmost
12377+ * node in the slum is reached. */
12378+static int handle_pos_on_formatted(flush_pos_t * pos)
12379+{
12380+ int ret;
12381+ lock_handle right_lock;
12382+ load_count right_load;
12383+
12384+ init_lh(&right_lock);
12385+ init_load_count(&right_load);
12386+
12387+ if (should_convert_node(pos, pos->lock.node)) {
12388+ ret = convert_node(pos, pos->lock.node);
12389+ if (ret)
12390+ return ret;
12391+ }
12392+
12393+ while (1) {
12394+ int expected;
12395+ expected = should_convert_next_node(pos);
12396+ ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12397+ ZNODE_WRITE_LOCK, !expected, expected);
12398+ if (ret) {
12399+ if (expected)
12400+ warning("edward-1495",
12401+ "Expected neighbor not found (ret = %d). Fsck?",
12402+ ret);
12403+ break;
12404+ }
12405+
12406+ /* we don't prep (allocate) nodes for flushing twice. This can be suboptimal, or it
12407+ * can be optimal. For now we choose to live with the risk that it will
12408+ * be suboptimal because it would be quite complex to code it to be
12409+ * smarter. */
12410+ if (znode_check_flushprepped(right_lock.node)
12411+ && !znode_convertible(right_lock.node)) {
12412+ assert("edward-1005", !should_convert_next_node(pos));
12413+ pos_stop(pos);
12414+ break;
12415+ }
12416+
12417+ ret = incr_load_count_znode(&right_load, right_lock.node);
12418+ if (ret)
12419+ break;
12420+ if (should_convert_node(pos, right_lock.node)) {
12421+ ret = convert_node(pos, right_lock.node);
12422+ if (ret)
12423+ break;
12424+ if (node_is_empty(right_lock.node)) {
12425+ /* node became empty after converting, repeat */
12426+ done_load_count(&right_load);
12427+ done_lh(&right_lock);
12428+ continue;
12429+ }
12430+ }
12431+
12432+ /* squeeze _before_ going upward. */
12433+ ret =
12434+ squeeze_right_neighbor(pos, pos->lock.node,
12435+ right_lock.node);
12436+ if (ret < 0)
12437+ break;
12438+
12439+ if (znode_check_flushprepped(right_lock.node)) {
12440+ if (should_convert_next_node(pos)) {
12441+ /* in spite of flushprepped status of the node,
12442+ its right slum neighbor should be converted */
12443+ assert("edward-953", convert_data(pos));
12444+ assert("edward-954", item_convert_data(pos));
12445+
12446+ if (node_is_empty(right_lock.node)) {
12447+ done_load_count(&right_load);
12448+ done_lh(&right_lock);
12449+ } else
12450+ move_flush_pos(pos, &right_lock,
12451+ &right_load, NULL);
12452+ continue;
12453+ }
12454+ pos_stop(pos);
12455+ break;
12456+ }
12457+
12458+ if (node_is_empty(right_lock.node)) {
12459+ /* repeat if right node was squeezed completely */
12460+ done_load_count(&right_load);
12461+ done_lh(&right_lock);
12462+ continue;
12463+ }
12464+
12465+ /* parent(right_lock.node) has to be processed before
12466+ * (right_lock.node) due to "parent-first" allocation order. */
12467+ ret =
12468+ check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12469+ right_lock.node);
12470+ if (ret)
12471+ break;
12472+ /* (re)allocate _after_ going upward */
12473+ ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12474+ if (ret)
12475+ break;
12476+ if (should_terminate_squalloc(pos)) {
12477+ set_item_convert_count(pos, 0);
12478+ break;
12479+ }
12480+
12481+ /* advance the flush position to the right neighbor */
12482+ move_flush_pos(pos, &right_lock, &right_load, NULL);
12483+
12484+ ret = rapid_flush(pos);
12485+ if (ret)
12486+ break;
12487+ }
12488+ check_convert_info(pos);
12489+ done_load_count(&right_load);
12490+ done_lh(&right_lock);
12491+
12492+ /* This function indicates via pos whether to stop or go to twig or continue on current
12493+ * level. */
12494+ return ret;
12495+
12496+}
12497+
12498+/* Process nodes on the leaf level until an unformatted node or the rightmost
12499+ * node in the slum is reached. */
12500+static int handle_pos_on_leaf(flush_pos_t * pos)
12501+{
12502+ int ret;
12503+
12504+ assert("zam-845", pos->state == POS_ON_LEAF);
12505+
12506+ ret = handle_pos_on_formatted(pos);
12507+
12508+ if (ret == -E_NO_NEIGHBOR) {
12509+ /* cannot get right neighbor, go process extents. */
12510+ pos->state = POS_TO_TWIG;
12511+ return 0;
12512+ }
12513+
12514+ return ret;
12515+}
12516+
12517+/* Process slum on level > 1 */
12518+static int handle_pos_on_internal(flush_pos_t * pos)
12519+{
12520+ assert("zam-850", pos->state == POS_ON_INTERNAL);
12521+ return handle_pos_on_formatted(pos);
12522+}
12523+
12524+/* check whether squalloc should stop before processing the given extent */
12525+static int squalloc_extent_should_stop(flush_pos_t * pos)
12526+{
12527+ assert("zam-869", item_is_extent(&pos->coord));
12528+
12529+ /* pos->child is the jnode that handle_pos_on_extent() should start with,
12530+ * instead of the first child of the first extent unit. */
12531+ if (pos->child) {
12532+ int prepped;
12533+
12534+ assert("vs-1383", jnode_is_unformatted(pos->child));
12535+ prepped = jnode_check_flushprepped(pos->child);
12536+ pos->pos_in_unit =
12537+ jnode_get_index(pos->child) -
12538+ extent_unit_index(&pos->coord);
12539+ assert("vs-1470",
12540+ pos->pos_in_unit < extent_unit_width(&pos->coord));
12541+ assert("nikita-3434",
12542+ ergo(extent_is_unallocated(&pos->coord),
12543+ pos->pos_in_unit == 0));
12544+ jput(pos->child);
12545+ pos->child = NULL;
12546+
12547+ return prepped;
12548+ }
12549+
12550+ pos->pos_in_unit = 0;
12551+ if (extent_is_unallocated(&pos->coord))
12552+ return 0;
12553+
12554+ return leftmost_child_of_unit_check_flushprepped(&pos->coord);
12555+}
12556+
12557+/* Handle the case when the regular reiser4 tree (znodes connected to their
12558+ * neighbors by sibling pointers) is interrupted on the leaf level by one or
12559+ * more unformatted nodes. By holding a lock on the twig level and using extent
12560+ * code routines to process unformatted nodes, we swim around an irregular part
12561+ * of the reiser4 tree. */
12562+static int handle_pos_on_twig(flush_pos_t * pos)
12563+{
12564+ int ret;
12565+
12566+ assert("zam-844", pos->state == POS_ON_EPOINT);
12567+ assert("zam-843", item_is_extent(&pos->coord));
12568+
12569+ /* We decide whether to continue slum processing with the current extent
12570+ unit: if the leftmost child of the current extent unit is flushprepped
12571+ (i.e. clean or already processed by flush) we stop squalloc(). There
12572+ is a fast check for unallocated extents, which we assume contain only
12573+ nodes that are not flushprepped. */
12574+ /* FIXME: here we implement a simple check; we only look at the
12575+ leftmost child. */
12576+ ret = squalloc_extent_should_stop(pos);
12577+ if (ret != 0) {
12578+ pos_stop(pos);
12579+ return ret;
12580+ }
12581+
12582+ while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
12583+ && item_is_extent(&pos->coord)) {
12584+ ret = reiser4_alloc_extent(pos);
12585+ if (ret) {
12586+ break;
12587+ }
12588+ coord_next_unit(&pos->coord);
12589+ }
12590+
12591+ if (coord_is_after_rightmost(&pos->coord)) {
12592+ pos->state = POS_END_OF_TWIG;
12593+ return 0;
12594+ }
12595+ if (item_is_internal(&pos->coord)) {
12596+ pos->state = POS_TO_LEAF;
12597+ return 0;
12598+ }
12599+
12600+ assert("zam-860", item_is_extent(&pos->coord));
12601+
12602+ /* "slum" is over */
12603+ pos->state = POS_INVALID;
12604+ return 0;
12605+}
12606+
12607+/* When we are about to return the flush position from the twig to the leaf level
12608+ * we can either process the right twig node or move the position to the leaf. This
12609+ * processes the right twig if possible and jumps to the leaf level if not. */
12610+static int handle_pos_end_of_twig(flush_pos_t * pos)
12611+{
12612+ int ret;
12613+ lock_handle right_lock;
12614+ load_count right_load;
12615+ coord_t at_right;
12616+ jnode *child = NULL;
12617+
12618+ assert("zam-848", pos->state == POS_END_OF_TWIG);
12619+ assert("zam-849", coord_is_after_rightmost(&pos->coord));
12620+
12621+ init_lh(&right_lock);
12622+ init_load_count(&right_load);
12623+
12624+ /* We take a lock on the right twig node even if it is not dirty, because the
12625+ * slum continues or discontinues on the leaf level, not on the next twig. This
12626+ * lock on the right twig is needed for getting its leftmost child. */
12627+ ret =
12628+ reiser4_get_right_neighbor(&right_lock, pos->lock.node,
12629+ ZNODE_WRITE_LOCK, GN_SAME_ATOM);
12630+ if (ret)
12631+ goto out;
12632+
12633+ ret = incr_load_count_znode(&right_load, right_lock.node);
12634+ if (ret)
12635+ goto out;
12636+
12637+ /* the right twig may not be dirty */
12638+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
12639+ /* If the right twig node is dirty we always attempt to squeeze its
12640+ * content to the left... */
12641+ became_dirty:
12642+ ret =
12643+ squeeze_right_twig_and_advance_coord(pos, right_lock.node);
12644+ if (ret <= 0) {
12645+ /* pos->coord is on internal item, go to leaf level, or
12646+ * we have an error which will be caught in squalloc() */
12647+ pos->state = POS_TO_LEAF;
12648+ goto out;
12649+ }
12650+
12651+ /* If the right twig was squeezed completely we have to re-lock the
12652+ * right twig; now this is done through the top-level squalloc
12653+ * routine. */
12654+ if (node_is_empty(right_lock.node))
12655+ goto out;
12656+
12657+ /* ... and prep it if it is not yet prepped */
12658+ if (!znode_check_flushprepped(right_lock.node)) {
12659+ /* As usual, process parent before ... */
12660+ ret =
12661+ check_parents_and_squalloc_upper_levels(pos,
12662+ pos->lock.
12663+ node,
12664+ right_lock.
12665+ node);
12666+ if (ret)
12667+ goto out;
12668+
12669+ /* ... processing the child */
12670+ ret =
12671+ lock_parent_and_allocate_znode(right_lock.node,
12672+ pos);
12673+ if (ret)
12674+ goto out;
12675+ }
12676+ } else {
12677+ coord_init_first_unit(&at_right, right_lock.node);
12678+
12679+ /* check the first child of the next twig: should we continue there? */
12680+ ret = get_leftmost_child_of_unit(&at_right, &child);
12681+ if (ret || child == NULL || jnode_check_flushprepped(child)) {
12682+ pos_stop(pos);
12683+ goto out;
12684+ }
12685+
12686+ /* check clean twig for possible relocation */
12687+ if (!znode_check_flushprepped(right_lock.node)) {
12688+ ret =
12689+ reverse_relocate_check_dirty_parent(child,
12690+ &at_right, pos);
12691+ if (ret)
12692+ goto out;
12693+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
12694+ goto became_dirty;
12695+ }
12696+ }
12697+
12698+ assert("zam-875", znode_check_flushprepped(right_lock.node));
12699+
12700+ /* Update the preceder by a block number of just processed right twig
12701+ * node. The code above could miss the preceder updating because
12702+ * allocate_znode() could not be called for this node. */
12703+ pos->preceder.blk = *znode_get_block(right_lock.node);
12704+ check_preceder(pos->preceder.blk);
12705+
12706+ coord_init_first_unit(&at_right, right_lock.node);
12707+ assert("zam-868", coord_is_existing_unit(&at_right));
12708+
12709+ pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
12710+ move_flush_pos(pos, &right_lock, &right_load, &at_right);
12711+
12712+ out:
12713+ done_load_count(&right_load);
12714+ done_lh(&right_lock);
12715+
12716+ if (child)
12717+ jput(child);
12718+
12719+ return ret;
12720+}
12721+
12722+/* Move pos->lock to the leaf node pointed to by pos->coord and check whether
12723+ * we should continue there. */
12724+static int handle_pos_to_leaf(flush_pos_t * pos)
12725+{
12726+ int ret;
12727+ lock_handle child_lock;
12728+ load_count child_load;
12729+ jnode *child;
12730+
12731+ assert("zam-846", pos->state == POS_TO_LEAF);
12732+ assert("zam-847", item_is_internal(&pos->coord));
12733+
12734+ init_lh(&child_lock);
12735+ init_load_count(&child_load);
12736+
12737+ ret = get_leftmost_child_of_unit(&pos->coord, &child);
12738+ if (ret)
12739+ return ret;
12740+ if (child == NULL) {
12741+ pos_stop(pos);
12742+ return 0;
12743+ }
12744+
12745+ if (jnode_check_flushprepped(child)) {
12746+ pos->state = POS_INVALID;
12747+ goto out;
12748+ }
12749+
12750+ ret =
12751+ longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
12752+ ZNODE_LOCK_LOPRI);
12753+ if (ret)
12754+ goto out;
12755+
12756+ ret = incr_load_count_znode(&child_load, JZNODE(child));
12757+ if (ret)
12758+ goto out;
12759+
12760+ ret = allocate_znode(JZNODE(child), &pos->coord, pos);
12761+ if (ret)
12762+ goto out;
12763+
12764+ /* move flush position to leaf level */
12765+ pos->state = POS_ON_LEAF;
12766+ move_flush_pos(pos, &child_lock, &child_load, NULL);
12767+
12768+ if (node_is_empty(JZNODE(child))) {
12769+ ret = delete_empty_node(JZNODE(child));
12770+ pos->state = POS_INVALID;
12771+ }
12772+ out:
12773+ done_load_count(&child_load);
12774+ done_lh(&child_lock);
12775+ jput(child);
12776+
12777+ return ret;
12778+}
12779+
12780+/* Move pos from the leaf to the twig level: move pos->lock to the upper (twig) level. */
12782+static int handle_pos_to_twig(flush_pos_t * pos)
12783+{
12784+ int ret;
12785+
12786+ lock_handle parent_lock;
12787+ load_count parent_load;
12788+ coord_t pcoord;
12789+
12790+ assert("zam-852", pos->state == POS_TO_TWIG);
12791+
12792+ init_lh(&parent_lock);
12793+ init_load_count(&parent_load);
12794+
12795+ ret =
12796+ reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
12797+ if (ret)
12798+ goto out;
12799+
12800+ ret = incr_load_count_znode(&parent_load, parent_lock.node);
12801+ if (ret)
12802+ goto out;
12803+
12804+ ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
12805+ if (ret)
12806+ goto out;
12807+
12808+ assert("zam-870", item_is_internal(&pcoord));
12809+ coord_next_item(&pcoord);
12810+
12811+ if (coord_is_after_rightmost(&pcoord))
12812+ pos->state = POS_END_OF_TWIG;
12813+ else if (item_is_extent(&pcoord))
12814+ pos->state = POS_ON_EPOINT;
12815+ else {
12816+ /* Here we understand that getting -E_NO_NEIGHBOR in
12817+ * handle_pos_on_leaf() was just because we reached the edge of
12818+ * the slum */
12819+ pos_stop(pos);
12820+ goto out;
12821+ }
12822+
12823+ move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
12824+
12825+ out:
12826+ done_load_count(&parent_load);
12827+ done_lh(&parent_lock);
12828+
12829+ return ret;
12830+}
12831+
12832+typedef int (*pos_state_handle_t) (flush_pos_t *);
12833+static pos_state_handle_t flush_pos_handlers[] = {
12834+ /* process formatted nodes on leaf level, keep lock on a leaf node */
12835+ [POS_ON_LEAF] = handle_pos_on_leaf,
12836+ /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently
12837+ * being processed */
12838+ [POS_ON_EPOINT] = handle_pos_on_twig,
12839+ /* move a lock from leaf node to its parent for further processing of unformatted nodes */
12840+ [POS_TO_TWIG] = handle_pos_to_twig,
12841+ /* move a lock from twig to leaf level when a processing of unformatted nodes finishes,
12842+ * pos->coord points to the leaf node we jump to */
12843+ [POS_TO_LEAF] = handle_pos_to_leaf,
12844+ /* after processing the last extent in the twig node, attempt to shift items from the
12845+ * twig's right neighbor and process them while shifting */
12846+ [POS_END_OF_TWIG] = handle_pos_end_of_twig,
12847+ /* process formatted nodes on internal level, keep lock on an internal node */
12848+ [POS_ON_INTERNAL] = handle_pos_on_internal
12849+};
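/* The resulting state machine, with transitions as implemented by the
 * handlers above ("stop" means pos_stop() makes the position invalid):
 *
 *	POS_ON_LEAF     -> POS_TO_TWIG (slum edge) | stop
 *	POS_TO_TWIG     -> POS_ON_EPOINT | POS_END_OF_TWIG | stop
 *	POS_ON_EPOINT   -> POS_END_OF_TWIG | POS_TO_LEAF | POS_INVALID | stop
 *	POS_END_OF_TWIG -> POS_ON_EPOINT | POS_TO_LEAF | stop
 *	POS_TO_LEAF     -> POS_ON_LEAF | POS_INVALID | stop
 *	POS_ON_INTERNAL -> stop (reaching the slum edge ends squalloc())
 */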
12850+
12851+/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze,
12852+ * encrypt) nodes and their ancestors in "parent-first" order */
12853+static int squalloc(flush_pos_t * pos)
12854+{
12855+ int ret = 0;
12856+
12857+ /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
12858+ * greater CPU efficiency? Measure and see.... -Hans */
12859+ while (pos_valid(pos)) {
12860+ ret = flush_pos_handlers[pos->state] (pos);
12861+ if (ret < 0)
12862+ break;
12863+
12864+ ret = rapid_flush(pos);
12865+ if (ret)
12866+ break;
12867+ }
12868+
12869+ /* Any positive value or -E_NO_NEIGHBOR is a legal return code for the handle_pos*
12870+ routines; -E_NO_NEIGHBOR means that the slum edge was reached */
12871+ if (ret > 0 || ret == -E_NO_NEIGHBOR)
12872+ ret = 0;
12873+
12874+ return ret;
12875+}
12876+
12877+static void update_ldkey(znode * node)
12878+{
12879+ reiser4_key ldkey;
12880+
12881+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
12882+ if (node_is_empty(node))
12883+ return;
12884+
12885+ znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
12886+}
12887+
12888+/* This is to be called after calling the node's shift method to shift data from @right
12889+ to @left. It sets the left delimiting keys of @left and @right to the keys of their
12890+ first items, and sets the right delimiting key of @left to the first key of @right */
12891+static void update_znode_dkeys(znode * left, znode * right)
12892+{
12893+ assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
12894+ assert("vs-1629", (znode_is_write_locked(left) &&
12895+ znode_is_write_locked(right)));
12896+
12897+ /* we need to update the left delimiting key of @left if it was empty before the shift */
12898+ update_ldkey(left);
12899+ update_ldkey(right);
12900+ if (node_is_empty(right))
12901+ znode_set_rd_key(left, znode_get_rd_key(right));
12902+ else
12903+ znode_set_rd_key(left, znode_get_ld_key(right));
12904+}
12905+
12906+/* try to shift everything from @right to @left. If everything was shifted,
12907+   @right is removed from the tree. The result is the number of bytes shifted. */
12908+static int
12909+shift_everything_left(znode * right, znode * left, carry_level * todo)
12910+{
12911+ coord_t from;
12912+ node_plugin *nplug;
12913+ carry_plugin_info info;
12914+
12915+ coord_init_after_last_item(&from, right);
12916+
12917+ nplug = node_plugin_by_node(right);
12918+ info.doing = NULL;
12919+ info.todo = todo;
12920+	return nplug->shift(&from, left, SHIFT_LEFT,
12921+			    1 /* delete @right if it becomes empty */,
12922+			    1 /* move coord @from to node @left if
12923+			       * everything will be shifted */,
12924+			    &info);
12926+}
12927+
12928+/* Shift as much as possible from @right to @left using the memcpy-optimized
12929+ shift_everything_left. @left and @right are formatted neighboring nodes on
12930+ leaf level. */
12931+static int squeeze_right_non_twig(znode * left, znode * right)
12932+{
12933+ int ret;
12934+ carry_pool *pool;
12935+ carry_level *todo;
12936+
12937+ assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
12938+
12939+ if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
12940+ !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
12941+ return SQUEEZE_TARGET_FULL;
12942+
12943+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
12944+ if (IS_ERR(pool))
12945+ return PTR_ERR(pool);
12946+ todo = (carry_level *) (pool + 1);
12947+ init_carry_level(todo, pool);
12948+
12949+ ret = shift_everything_left(right, left, todo);
12950+ if (ret > 0) {
12951+ /* something was shifted */
12952+ reiser4_tree *tree;
12953+ __u64 grabbed;
12954+
12955+ znode_make_dirty(left);
12956+ znode_make_dirty(right);
12957+
12958+		/* update delimiting keys of nodes which participated in
12959+		   the shift. FIXME: it would be better to have this in the
12960+		   node plugin's shift operation, but it cannot be done
12961+		   there. Nobody remembers why, though */
12962+ tree = znode_get_tree(left);
12963+ write_lock_dk(tree);
12964+ update_znode_dkeys(left, right);
12965+ write_unlock_dk(tree);
12966+
12967+ /* Carry is called to update delimiting key and, maybe, to remove empty
12968+ node. */
12969+ grabbed = get_current_context()->grabbed_blocks;
12970+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
12971+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
12972+ ret = reiser4_carry(todo, NULL /* previous level */ );
12973+ grabbed2free_mark(grabbed);
12974+ } else {
12975+		/* Shifting was impossible; return the appropriate result code */
12976+		ret = node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
12977+			SQUEEZE_TARGET_FULL;
12979+ }
12980+
12981+ done_carry_pool(pool);
12982+
12983+ return ret;
12984+}
12985+
12986+#if REISER4_DEBUG
12987+static int sibling_link_is_ok(const znode *left, const znode *right)
12988+{
12989+ int result;
12990+
12991+ read_lock_tree(znode_get_tree(left));
12992+ result = (left->right == right && left == right->left);
12993+ read_unlock_tree(znode_get_tree(left));
12994+ return result;
12995+}
12996+#endif
12997+
12998+/* Shift first unit of first item if it is an internal one. Return
12999+ SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
13000+ SUBTREE_MOVED. */
13001+static int shift_one_internal_unit(znode * left, znode * right)
13002+{
13003+ int ret;
13004+ carry_pool *pool;
13005+ carry_level *todo;
13006+ coord_t *coord;
13007+ carry_plugin_info *info;
13008+ int size, moved;
13009+
13010+ assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
13011+ assert("nikita-2435", znode_is_write_locked(left));
13012+ assert("nikita-2436", znode_is_write_locked(right));
13013+ assert("nikita-2434", sibling_link_is_ok(left, right));
13014+
13015+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
13016+ sizeof(*coord) + sizeof(*info)
13017+#if REISER4_DEBUG
13018+ + sizeof(*coord) + 2 * sizeof(reiser4_key)
13019+#endif
13020+ );
13021+ if (IS_ERR(pool))
13022+ return PTR_ERR(pool);
13023+ todo = (carry_level *) (pool + 1);
13024+ init_carry_level(todo, pool);
13025+
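+	/* the pool above was sized for three carry levels plus the coord and
+	 * carry_plugin_info used below; carve them out past the levels */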
13026+ coord = (coord_t *) (todo + 3);
13027+ coord_init_first_unit(coord, right);
13028+ info = (carry_plugin_info *) (coord + 1);
13029+
13030+#if REISER4_DEBUG
13031+ if (!node_is_empty(left)) {
13032+ coord_t *last;
13033+ reiser4_key *right_key;
13034+ reiser4_key *left_key;
13035+
13036+ last = (coord_t *) (info + 1);
13037+ right_key = (reiser4_key *) (last + 1);
13038+ left_key = right_key + 1;
13039+ coord_init_last_unit(last, left);
13040+
13041+ assert("nikita-2463",
13042+ keyle(item_key_by_coord(last, left_key),
13043+ item_key_by_coord(coord, right_key)));
13044+ }
13045+#endif
13046+
13047+ assert("jmacd-2007", item_is_internal(coord));
13048+
13049+ size = item_length_by_coord(coord);
13050+ info->todo = todo;
13051+ info->doing = NULL;
13052+
13053+	ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13054+					       1 /* delete @right if it
13055+						  * becomes empty */,
13056+					       0 /* do not move coord @coord
13057+						  * to node @left */,
13058+					       info);
13061+
13062+ /* If shift returns positive, then we shifted the item. */
13063+ assert("vs-423", ret <= 0 || size == ret);
13064+ moved = (ret > 0);
13065+
13066+ if (moved) {
13067+ /* something was moved */
13068+ reiser4_tree *tree;
13069+ int grabbed;
13070+
13071+ znode_make_dirty(left);
13072+ znode_make_dirty(right);
13073+ tree = znode_get_tree(left);
13074+ write_lock_dk(tree);
13075+ update_znode_dkeys(left, right);
13076+ write_unlock_dk(tree);
13077+
13078+ /* reserve space for delimiting keys after shifting */
13079+ grabbed = get_current_context()->grabbed_blocks;
13080+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13081+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13082+
13083+ ret = reiser4_carry(todo, NULL /* previous level */ );
13084+ grabbed2free_mark(grabbed);
13085+ }
13086+
13087+ done_carry_pool(pool);
13088+
13089+ if (ret != 0) {
13090+ /* Shift or carry operation failed. */
13091+ assert("jmacd-7325", ret < 0);
13092+ return ret;
13093+ }
13094+
13095+ return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13096+}
13097+
13098+/* Make the final relocate/wander decision during forward parent-first squalloc for a
13099+ znode. For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */
13100+static int
13101+allocate_znode_loaded(znode * node,
13102+ const coord_t * parent_coord, flush_pos_t * pos)
13103+{
13104+ int ret;
13105+ reiser4_super_info_data *sbinfo = get_current_super_private();
13106+	/* FIXME(D): We have the node write-locked and should have checked for
13107+	   !allocated() somewhere before reaching this point, but there can be a race, so
13108+	   this assertion is bogus. */
13109+ assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13110+ assert("jmacd-7988", znode_is_write_locked(node));
13111+ assert("jmacd-7989", coord_is_invalid(parent_coord)
13112+ || znode_is_write_locked(parent_coord->node));
13113+
13114+ if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13115+ znode_is_root(node) ||
13116+ /* We have enough nodes to relocate no matter what. */
13117+ (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13118+		/* No need to decide for new nodes; they are treated the same as
13119+		   relocate. If the root node is dirty, relocate. */
13120+ if (pos->preceder.blk == 0) {
13121+			/* the preceder is unknown and we have decided to relocate the
13122+			   node; using the default value for the search start is better
13123+			   than searching from block #0. */
13124+ get_blocknr_hint_default(&pos->preceder.blk);
13125+ check_preceder(pos->preceder.blk);
13126+ }
13127+
13128+ goto best_reloc;
13129+
13130+ } else if (pos->preceder.blk == 0) {
13131+ /* If we don't know the preceder, leave it where it is. */
13132+ jnode_make_wander(ZJNODE(node));
13133+ } else {
13134+ /* Make a decision based on block distance. */
13135+ reiser4_block_nr dist;
13136+ reiser4_block_nr nblk = *znode_get_block(node);
13137+
13138+ assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
13139+ assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13140+ assert("jmacd-6174", pos->preceder.blk != 0);
13141+
13142+ if (pos->preceder.blk == nblk - 1) {
13143+ /* Ideal. */
13144+ jnode_make_wander(ZJNODE(node));
13145+ } else {
13146+
13147+			dist = (nblk < pos->preceder.blk) ?
13148+				(pos->preceder.blk - nblk) :
13149+				(nblk - pos->preceder.blk);
13152+
13153+ /* See if we can find a closer block (forward direction only). */
13154+ pos->preceder.max_dist =
13155+ min((reiser4_block_nr) sbinfo->flush.
13156+ relocate_distance, dist);
13157+ pos->preceder.level = znode_get_level(node);
13158+
13159+ ret = allocate_znode_update(node, parent_coord, pos);
13160+
13161+ pos->preceder.max_dist = 0;
13162+
13163+ if (ret && (ret != -ENOSPC))
13164+ return ret;
13165+
13166+ if (ret == 0) {
13167+ /* Got a better allocation. */
13168+ znode_make_reloc(node, pos->fq);
13169+ } else if (dist < sbinfo->flush.relocate_distance) {
13170+ /* The present allocation is good enough. */
13171+ jnode_make_wander(ZJNODE(node));
13172+ } else {
13173+ /* Otherwise, try to relocate to the best position. */
13174+ best_reloc:
13175+				ret = allocate_znode_update(node,
13176+							    parent_coord, pos);
13178+ if (ret != 0)
13179+ return ret;
13180+
13181+ /* set JNODE_RELOC bit _after_ node gets allocated */
13182+ znode_make_reloc(node, pos->fq);
13183+ }
13184+ }
13185+ }
13186+
13187+ /* This is the new preceder. */
13188+ pos->preceder.blk = *znode_get_block(node);
13189+ check_preceder(pos->preceder.blk);
13190+ pos->alloc_cnt += 1;
13191+
13192+ assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13193+
13194+ return 0;
13195+}
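+
+/* To summarize the decision above: created, repacked and root nodes, and leaf
+ * nodes once pos->leaf_relocate is set, are always relocated; a node whose
+ * preceder is unknown, or which already sits right after its preceder, is
+ * wandered; otherwise a nearby allocation is attempted, and on -ENOSPC the
+ * node is wandered if it is already within flush.relocate_distance of the
+ * preceder, or relocated to the best available position if not. */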
13196+
13197+static int
13198+allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
13199+{
13200+ /*
13201+ * perform znode allocation with znode pinned in memory to avoid races
13202+ * with asynchronous emergency flush (which plays with
13203+ * JNODE_FLUSH_RESERVED bit).
13204+ */
13205+ return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13206+}
13207+
13208+/* A subroutine of allocate_znode. It is called first to see if there is a close
13209+   position to relocate to; if there is no close position it may return -ENOSPC
13210+   and not relocate. It also takes care of updating the parent node with the
13211+   relocated block address. */
13212+static int
13213+allocate_znode_update(znode * node, const coord_t * parent_coord,
13214+ flush_pos_t * pos)
13215+{
13216+ int ret;
13217+ reiser4_block_nr blk;
13218+ lock_handle uber_lock;
13219+ int flush_reserved_used = 0;
13220+ int grabbed;
13221+ reiser4_context *ctx;
13222+ reiser4_super_info_data *sbinfo;
13223+
13224+ init_lh(&uber_lock);
13225+
13226+ ctx = get_current_context();
13227+ sbinfo = get_super_private(ctx->super);
13228+
13229+ grabbed = ctx->grabbed_blocks;
13230+
13231+ /* discard e-flush allocation */
13232+ ret = zload(node);
13233+ if (ret)
13234+ return ret;
13235+
13236+ if (ZF_ISSET(node, JNODE_CREATED)) {
13237+ assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node)));
13238+ pos->preceder.block_stage = BLOCK_UNALLOCATED;
13239+ } else {
13240+ pos->preceder.block_stage = BLOCK_GRABBED;
13241+
13242+		/* The disk space for relocating @node is already reserved in the "flush
13243+		 * reserved" counter if @node is a leaf; otherwise we grab space using
13244+		 * BA_RESERVED (which means grabbing from the whole disk, not only 95% of it). */
13245+ if (znode_get_level(node) == LEAF_LEVEL) {
13246+ /*
13247+ * earlier (during do_jnode_make_dirty()) we decided
13248+ * that @node can possibly go into overwrite set and
13249+ * reserved block for its wandering location.
13250+ */
13251+ txn_atom *atom = get_current_atom_locked();
13252+ assert("nikita-3449",
13253+ ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13254+ flush_reserved2grabbed(atom, (__u64) 1);
13255+ spin_unlock_atom(atom);
13256+ /*
13257+ * we are trying to move node into relocate
13258+ * set. Allocation of relocated position "uses"
13259+ * reserved block.
13260+ */
13261+ ZF_CLR(node, JNODE_FLUSH_RESERVED);
13262+ flush_reserved_used = 1;
13263+ } else {
13264+ ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13265+ if (ret != 0)
13266+ goto exit;
13267+ }
13268+ }
13269+
13270+	/* We may fail to use the reserved 5% of disk space here, and then flush will not pack tightly. */
13271+ ret = reiser4_alloc_block(&pos->preceder, &blk,
13272+ BA_FORMATTED | BA_PERMANENT);
13273+ if (ret)
13274+ goto exit;
13275+
13276+ if (!ZF_ISSET(node, JNODE_CREATED) &&
13277+ (ret =
13278+ reiser4_dealloc_block(znode_get_block(node), 0,
13279+ BA_DEFER | BA_FORMATTED)))
13280+ goto exit;
13281+
13282+ if (likely(!znode_is_root(node))) {
13283+ item_plugin *iplug;
13284+
13285+ iplug = item_plugin_by_coord(parent_coord);
13286+ assert("nikita-2954", iplug->f.update != NULL);
13287+ iplug->f.update(parent_coord, &blk);
13288+
13289+ znode_make_dirty(parent_coord->node);
13290+
13291+ } else {
13292+ reiser4_tree *tree = znode_get_tree(node);
13293+ znode *uber;
13294+
13295+ /* We take a longterm lock on the fake node in order to change
13296+ the root block number. This may cause atom fusion. */
13297+ ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13298+ &uber_lock);
13299+		/* The fake node cannot be deleted, we must have priority
13300+		   here, and this call may not fail with ENOSPC. */
13301+ assert("jmacd-74412",
13302+ ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13303+
13304+ if (ret)
13305+ goto exit;
13306+
13307+ uber = uber_lock.node;
13308+
13309+ write_lock_tree(tree);
13310+ tree->root_block = blk;
13311+ write_unlock_tree(tree);
13312+
13313+ znode_make_dirty(uber);
13314+ }
13315+
13316+ ret = znode_rehash(node, &blk);
13317+ exit:
13318+ if (ret) {
13319+		/* Get the flush reserved block back if something fails, because
13320+		 * callers assume that on error the block wasn't relocated and its
13321+		 * flush reserved block wasn't used. */
13322+ if (flush_reserved_used) {
13323+ /*
13324+ * ok, we failed to move node into relocate
13325+ * set. Restore status quo.
13326+ */
13327+ grabbed2flush_reserved((__u64) 1);
13328+ ZF_SET(node, JNODE_FLUSH_RESERVED);
13329+ }
13330+ }
13331+ zrelse(node);
13332+ done_lh(&uber_lock);
13333+ grabbed2free_mark(grabbed);
13334+ return ret;
13335+}
13336+
13337+/* JNODE INTERFACE */
13338+
13339+/* Lock a node (if formatted) and then get its parent locked, set the child's
13340+ coordinate in the parent. If the child is the root node, the above_root
13341+ znode is returned but the coord is not set. This function may cause atom
13342+ fusion, but it is only used for read locks (at this point) and therefore
13343+ fusion only occurs when the parent is already dirty. */
13344+/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
13345+ pointer in jnodes. */
13346+static int
13347+jnode_lock_parent_coord(jnode * node,
13348+ coord_t * coord,
13349+ lock_handle * parent_lh,
13350+ load_count * parent_zh,
13351+ znode_lock_mode parent_mode, int try)
13352+{
13353+ int ret;
13354+
13355+ assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13356+ assert("edward-54", jnode_is_unformatted(node)
13357+ || znode_is_any_locked(JZNODE(node)));
13358+
13359+ if (!jnode_is_znode(node)) {
13360+ reiser4_key key;
13361+ tree_level stop_level = TWIG_LEVEL;
13362+ lookup_bias bias = FIND_EXACT;
13363+
13364+ assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13365+
13366+		/* The case when the node is not a znode, but can have a parent
13367+		   coord (an unformatted node, a node which represents a cluster
13368+		   page, etc.). Generate a key for the appropriate entry, search
13369+ in the tree using coord_by_key, which handles locking for
13370+ us. */
13371+
13372+ /*
13373+		 * nothing is locked at this moment, so nothing prevents a
13374+		 * concurrent truncate from removing the jnode from the inode.
13375+		 * To prevent this, spin-lock the jnode. The jnode can be
13376+		 * truncated just after the call to jnode_build_key(), but this
13377+		 * is ok, because coord_by_key() will just fail to find the
13378+		 * appropriate extent.
13379+ */
13380+ spin_lock_jnode(node);
13381+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13382+ jnode_build_key(node, &key);
13383+ ret = 0;
13384+ } else
13385+ ret = RETERR(-ENOENT);
13386+ spin_unlock_jnode(node);
13387+
13388+ if (ret != 0)
13389+ return ret;
13390+
13391+ if (jnode_is_cluster_page(node))
13392+ stop_level = LEAF_LEVEL;
13393+
13394+ assert("jmacd-1812", coord != NULL);
13395+
13396+ ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13397+ parent_mode, bias, stop_level, stop_level,
13398+ CBK_UNIQUE, NULL /*ra_info */ );
13399+ switch (ret) {
13400+ case CBK_COORD_NOTFOUND:
13401+ assert("edward-1038",
13402+ ergo(jnode_is_cluster_page(node),
13403+ JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13404+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13405+ warning("nikita-3177", "Parent not found");
13406+ return ret;
13407+ case CBK_COORD_FOUND:
13408+ if (coord->between != AT_UNIT) {
13409+ /* FIXME: comment needed */
13410+ done_lh(parent_lh);
13411+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13412+ warning("nikita-3178",
13413+ "Found but not happy: %i",
13414+ coord->between);
13415+ }
13416+ return RETERR(-ENOENT);
13417+ }
13418+ ret = incr_load_count_znode(parent_zh, parent_lh->node);
13419+ if (ret != 0)
13420+ return ret;
13421+ /* if (jnode_is_cluster_page(node)) {
13422+ races with write() are possible
13423+ check_child_cluster (parent_lh->node);
13424+ }
13425+ */
13426+ break;
13427+ default:
13428+ return ret;
13429+ }
13430+
13431+ } else {
13432+ int flags;
13433+ znode *z;
13434+
13435+ z = JZNODE(node);
13436+ /* Formatted node case: */
13437+ assert("jmacd-2061", !znode_is_root(z));
13438+
13439+ flags = GN_ALLOW_NOT_CONNECTED;
13440+ if (try)
13441+ flags |= GN_TRY_LOCK;
13442+
13443+		ret = reiser4_get_parent_flags(parent_lh, z, parent_mode,
13444+					       flags);
13445+ if (ret != 0)
13446+ /* -E_REPEAT is ok here, it is handled by the caller. */
13447+ return ret;
13448+
13449+ /* Make the child's position "hint" up-to-date. (Unless above
13450+ root, which caller must check.) */
13451+ if (coord != NULL) {
13452+
13453+ ret = incr_load_count_znode(parent_zh, parent_lh->node);
13454+ if (ret != 0) {
13455+ warning("jmacd-976812386",
13456+ "incr_load_count_znode failed: %d",
13457+ ret);
13458+ return ret;
13459+ }
13460+
13461+ ret = find_child_ptr(parent_lh->node, z, coord);
13462+ if (ret != 0) {
13463+ warning("jmacd-976812",
13464+ "find_child_ptr failed: %d", ret);
13465+ return ret;
13466+ }
13467+ }
13468+ }
13469+
13470+ return 0;
13471+}
13472+
13473+/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
13474+   If there is no next neighbor, or the neighbor is not in memory, or there is a
13475+   neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned.
13476+   In some cases the slum may include nodes which are not dirty; if so, @check_dirty should be 0. */
13477+static int neighbor_in_slum(znode * node, /* starting point */
13478+ lock_handle * lock, /* lock on starting point */
13479+ sideof side, /* left or right direction we seek the next node in */
13480+ znode_lock_mode mode, /* kind of lock we want */
13481+ int check_dirty, /* true if the neighbor should be dirty */
13482+			    int use_upper_levels /* get neighbor by going through
13483+						    upper levels */)
13484+{
13485+ int ret;
13486+ int flags;
13487+
13488+ assert("jmacd-6334", znode_is_connected(node));
13489+
13490+ flags = GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0);
13491+ if (use_upper_levels)
13492+ flags |= GN_CAN_USE_UPPER_LEVELS;
13493+
13494+ ret = reiser4_get_neighbor(lock, node, mode, flags);
13495+ if (ret) {
13496+ /* May return -ENOENT or -E_NO_NEIGHBOR. */
13497+ /* FIXME(C): check EINVAL, E_DEADLOCK */
13498+ if (ret == -ENOENT) {
13499+ ret = RETERR(-E_NO_NEIGHBOR);
13500+ }
13501+ return ret;
13502+ }
13503+ if (!check_dirty)
13504+ return 0;
13505+ /* Check dirty bit of locked znode, no races here */
13506+ if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
13507+ return 0;
13508+
13509+ done_lh(lock);
13510+ return RETERR(-E_NO_NEIGHBOR);
13511+}
13512+
13513+/* Return true if two znodes have the same parent. This is called with both nodes
13514+ write-locked (for squeezing) so no tree lock is needed. */
13515+static int znode_same_parents(znode * a, znode * b)
13516+{
13517+ int result;
13518+
13519+ assert("jmacd-7011", znode_is_write_locked(a));
13520+ assert("jmacd-7012", znode_is_write_locked(b));
13521+
13522+ /* We lock the whole tree for this check.... I really don't like whole tree
13523+ * locks... -Hans */
13524+ read_lock_tree(znode_get_tree(a));
13525+ result = (znode_parent(a) == znode_parent(b));
13526+ read_unlock_tree(znode_get_tree(a));
13527+ return result;
13528+}
13529+
13530+/* FLUSH SCAN */
13531+
13532+/* Initialize the flush_scan data structure. */
13533+static void scan_init(flush_scan * scan)
13534+{
13535+ memset(scan, 0, sizeof(*scan));
13536+ init_lh(&scan->node_lock);
13537+ init_lh(&scan->parent_lock);
13538+ init_load_count(&scan->parent_load);
13539+ init_load_count(&scan->node_load);
13540+ coord_init_invalid(&scan->parent_coord, NULL);
13541+}
13542+
13543+/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */
13544+static void scan_done(flush_scan * scan)
13545+{
13546+ done_load_count(&scan->node_load);
13547+ if (scan->node != NULL) {
13548+ jput(scan->node);
13549+ scan->node = NULL;
13550+ }
13551+ done_load_count(&scan->parent_load);
13552+ done_lh(&scan->parent_lock);
13553+ done_lh(&scan->node_lock);
13554+}
13555+
13556+/* Returns true if flush scanning is finished. */
13557+int reiser4_scan_finished(flush_scan * scan)
13558+{
13559+ return scan->stop || (scan->direction == RIGHT_SIDE &&
13560+ scan->count >= scan->max_count);
13561+}
13562+
13563+/* Return true if the scan should continue to @tonode, i.e. if the node meets the
13564+   same_slum_check condition. If not, deref @tonode and stop the scan. */
13565+int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
13566+{
13567+ int go = same_slum_check(scan->node, tonode, 1, 0);
13568+
13569+ if (!go) {
13570+ scan->stop = 1;
13571+ jput(tonode);
13572+ }
13573+
13574+ return go;
13575+}
13576+
13577+/* Set the current scan->node, refcount it, increment the count by @add_count (the number
13578+   to count, e.g., skipped unallocated nodes), deref the previous current, and copy the
13579+   current parent coordinate. */
13580+int
13581+scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
13582+ const coord_t * parent)
13583+{
13584+ /* Release the old references, take the new reference. */
13585+ done_load_count(&scan->node_load);
13586+
13587+ if (scan->node != NULL) {
13588+ jput(scan->node);
13589+ }
13590+ scan->node = node;
13591+ scan->count += add_count;
13592+
13593+ /* This next stmt is somewhat inefficient. The reiser4_scan_extent() code could
13594+ delay this update step until it finishes and update the parent_coord only once.
13595+ It did that before, but there was a bug and this was the easiest way to make it
13596+ correct. */
13597+ if (parent != NULL) {
13598+ coord_dup(&scan->parent_coord, parent);
13599+ }
13600+
13601+ /* Failure may happen at the incr_load_count call, but the caller can assume the reference
13602+ is safely taken. */
13603+ return incr_load_count_jnode(&scan->node_load, node);
13604+}
13605+
13606+/* Return true if scanning in the leftward direction. */
13607+int reiser4_scanning_left(flush_scan * scan)
13608+{
13609+ return scan->direction == LEFT_SIDE;
13610+}
13611+
13612+/* Performs leftward scanning starting from either kind of node. Counts the starting
13613+ node. The right-scan object is passed in for the left-scan in order to copy the parent
13614+ of an unformatted starting position. This way we avoid searching for the unformatted
13615+ node's parent when scanning in each direction. If we search for the parent once it is
13616+ set in both scan objects. The limit parameter tells flush-scan when to stop.
13617+
13618+ Rapid scanning is used only during scan_left, where we are interested in finding the
13619+ 'leftpoint' where we begin flushing. We are interested in stopping at the left child
13620+ of a twig that does not have a dirty left neighbor. THIS IS A SPECIAL CASE. The
13621+ problem is finding a way to flush only those nodes without unallocated children, and it
13622+ is difficult to solve in the bottom-up flushing algorithm we are currently using. The
13623+ problem can be solved by scanning left at every level as we go upward, but this would
13624+ basically bring us back to using a top-down allocation strategy, which we already tried
13625+ (see BK history from May 2002), and has a different set of problems. The top-down
13626+ strategy makes avoiding unallocated children easier, but makes it difficult to
13627+   properly flush dirty children with clean parents that would otherwise stop the
13628+ top-down flush, only later to dirty the parent once the children are flushed. So we
13629+ solve the problem in the bottom-up algorithm with a special case for twigs and leaves
13630+ only.
13631+
13632+ The first step in solving the problem is this rapid leftward scan. After we determine
13633+   that enough nodes have been counted to qualify for FLUSH_RELOCATE_THRESHOLD, we
13634+   are no longer interested in the exact count; we are only interested in finding the
13635+   best place to start the flush.  We could choose one of two possibilities:
13636+
13637+ 1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
13638+   This requires checking one leaf per rapid-scan twig.
13639+
13640+ 2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
13641+ to the left. This requires checking possibly all of the in-memory children of each
13642+ twig during the rapid scan.
13643+
13644+ For now we implement the first policy.
13645+*/
13646+static int
13647+scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
13648+{
13649+ int ret = 0;
13650+
13651+ scan->max_count = limit;
13652+ scan->direction = LEFT_SIDE;
13653+
13654+ ret = scan_set_current(scan, jref(node), 1, NULL);
13655+ if (ret != 0) {
13656+ return ret;
13657+ }
13658+
13659+ ret = scan_common(scan, right);
13660+ if (ret != 0) {
13661+ return ret;
13662+ }
13663+
13664+	/* Before rapid scanning, we need a lock on scan->node so that we can get its
13665+	   parent; this applies only if the node is formatted. */
13666+ if (jnode_is_znode(scan->node)) {
13667+ ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
13668+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
13669+ }
13670+
13671+ /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
13672+ return ret;
13673+}
13674+
13675+/* Performs rightward scanning... Does not count the starting node. The limit parameter
13676+ is described in scan_left. If the starting node is unformatted then the
13677+   parent_coord was already set during scan_left. Rapid scanning is not used
13678+   during right-scanning.
13679+
13680+ scan_right is only called if the scan_left operation does not count at least
13681+ FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit parameter is set to
13682+ the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
13683+ scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */
13684+static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
13685+{
13686+ int ret;
13687+
13688+ scan->max_count = limit;
13689+ scan->direction = RIGHT_SIDE;
13690+
13691+ ret = scan_set_current(scan, jref(node), 0, NULL);
13692+ if (ret != 0) {
13693+ return ret;
13694+ }
13695+
13696+ return scan_common(scan, NULL);
13697+}
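+
+/* A minimal illustrative sketch (kept as a comment, not compiled) of how the
+ * two scans combine, mirroring the contract described above; "node", the
+ * flush starting point, is an assumption of the sketch:
+ *
+ *	flush_scan left, right;
+ *	int ret;
+ *
+ *	scan_init(&left);
+ *	scan_init(&right);
+ *	ret = scan_left(&left, &right, node, FLUSH_SCAN_MAXNODES);
+ *	if (ret == 0 && left.count < FLUSH_RELOCATE_THRESHOLD)
+ *		ret = scan_right(&right, node, (unsigned)
+ *				 (FLUSH_RELOCATE_THRESHOLD - left.count));
+ *	scan_done(&left);
+ *	scan_done(&right);
+ */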
13698+
13699+/* Common code to perform left or right scanning. */
13700+static int scan_common(flush_scan * scan, flush_scan * other)
13701+{
13702+ int ret;
13703+
13704+ assert("nikita-2376", scan->node != NULL);
13705+ assert("edward-54", jnode_is_unformatted(scan->node)
13706+ || jnode_is_znode(scan->node));
13707+
13708+ /* Special case for starting at an unformatted node. Optimization: we only want
13709+ to search for the parent (which requires a tree traversal) once. Obviously, we
13710+ shouldn't have to call it once for the left scan and once for the right scan.
13711+ For this reason, if we search for the parent during scan-left we then duplicate
13712+ the coord/lock/load into the scan-right object. */
13713+ if (jnode_is_unformatted(scan->node)) {
13714+ ret = scan_unformatted(scan, other);
13715+ if (ret != 0)
13716+ return ret;
13717+ }
13718+ /* This loop expects to start at a formatted position and performs chaining of
13719+ formatted regions */
13720+ while (!reiser4_scan_finished(scan)) {
13721+
13722+ ret = scan_formatted(scan);
13723+ if (ret != 0) {
13724+ return ret;
13725+ }
13726+ }
13727+
13728+ return 0;
13729+}
13730+
13731+static int scan_unformatted(flush_scan * scan, flush_scan * other)
13732+{
13733+ int ret = 0;
13734+ int try = 0;
13735+
13736+ if (!coord_is_invalid(&scan->parent_coord))
13737+ goto scan;
13738+
13739+	/* set the parent coord from the current scan position */
13740+ if (!jnode_is_unformatted(scan->node)) {
13741+ /* formatted position */
13742+
13743+ lock_handle lock;
13744+ assert("edward-301", jnode_is_znode(scan->node));
13745+ init_lh(&lock);
13746+
13747+ /*
13748+		 * when flush starts from an unformatted node, the first thing
13749+		 * it does is a tree traversal to find the formatted parent of
13750+		 * the starting node. This parent is then kept locked across
13751+		 * the scans to the left and to the right. This means that
13752+		 * during a scan to the left we cannot take a left-ward lock,
13753+		 * because this is deadlock prone. So, if we are scanning to
13754+		 * the left and there is already a lock held by this thread,
13755+		 * jnode_lock_parent_coord() should use try-lock.
13756+ */
13757+ try = reiser4_scanning_left(scan)
13758+ && !lock_stack_isclean(get_current_lock_stack());
13759+		/* Need the node locked to get the parent lock. We have to
13760+		   take a write lock since there is at least one call path
13761+		   where this znode is already write-locked by us. */
13762+		ret = longterm_lock_znode(&lock, JZNODE(scan->node),
13763+					  ZNODE_WRITE_LOCK,
13764+					  reiser4_scanning_left(scan) ?
13765+					  ZNODE_LOCK_LOPRI :
13766+					  ZNODE_LOCK_HIPRI);
13768+ if (ret != 0)
13769+ /* EINVAL or E_DEADLOCK here mean... try again! At this point we've
13770+ scanned too far and can't back out, just start over. */
13771+ return ret;
13772+
13773+ ret = jnode_lock_parent_coord(scan->node,
13774+ &scan->parent_coord,
13775+ &scan->parent_lock,
13776+ &scan->parent_load,
13777+ ZNODE_WRITE_LOCK, try);
13778+
13779+ /* FIXME(C): check EINVAL, E_DEADLOCK */
13780+ done_lh(&lock);
13781+ if (ret == -E_REPEAT) {
13782+ scan->stop = 1;
13783+ return 0;
13784+ }
13785+ if (ret)
13786+ return ret;
13787+
13788+ } else {
13789+ /* unformatted position */
13790+
13791+		ret = jnode_lock_parent_coord(scan->node, &scan->parent_coord,
13792+					      &scan->parent_lock,
13793+					      &scan->parent_load,
13794+					      ZNODE_WRITE_LOCK, try);
13796+
13797+ if (IS_CBKERR(ret))
13798+ return ret;
13799+
13800+ if (ret == CBK_COORD_NOTFOUND)
13801+ /* FIXME(C): check EINVAL, E_DEADLOCK */
13802+ return ret;
13803+
13804+ /* parent was found */
13805+ assert("jmacd-8661", other != NULL);
13806+ /* Duplicate the reference into the other flush_scan. */
13807+ coord_dup(&other->parent_coord, &scan->parent_coord);
13808+ copy_lh(&other->parent_lock, &scan->parent_lock);
13809+ copy_load_count(&other->parent_load, &scan->parent_load);
13810+ }
13811+ scan:
13812+ return scan_by_coord(scan);
13813+}
13814+
13815+/* Performs left- or rightward scanning starting from a formatted node. Follow sibling
13816+   pointers under the tree lock as long as:
13817+
13818+ - node->left/right is non-NULL
13819+ - node->left/right is connected, dirty
13820+ - node->left/right belongs to the same atom
13821+ - scan has not reached maximum count
13822+*/
13823+static int scan_formatted(flush_scan * scan)
13824+{
13825+ int ret;
13826+ znode *neighbor = NULL;
13827+
13828+ assert("jmacd-1401", !reiser4_scan_finished(scan));
13829+
13830+ do {
13831+ znode *node = JZNODE(scan->node);
13832+
13833+ /* Node should be connected, but if not stop the scan. */
13834+ if (!znode_is_connected(node)) {
13835+ scan->stop = 1;
13836+ break;
13837+ }
13838+
13839+ /* Lock the tree, check-for and reference the next sibling. */
13840+ read_lock_tree(znode_get_tree(node));
13841+
13842+ /* It may be that a node is inserted or removed between a node and its
13843+ left sibling while the tree lock is released, but the flush-scan count
13844+ does not need to be precise. Thus, we release the tree lock as soon as
13845+ we get the neighboring node. */
13846+ neighbor =
13847+ reiser4_scanning_left(scan) ? node->left : node->right;
13848+ if (neighbor != NULL) {
13849+ zref(neighbor);
13850+ }
13851+
13852+ read_unlock_tree(znode_get_tree(node));
13853+
13854+		/* If neighbor is NULL at the leaf level, we need to check for an
13855+		   unformatted sibling using the parent; break in any case. */
13856+ if (neighbor == NULL) {
13857+ break;
13858+ }
13859+
13860+ /* Check the condition for going left, break if it is not met. This also
13861+ releases (jputs) the neighbor if false. */
13862+ if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) {
13863+ break;
13864+ }
13865+
13866+ /* Advance the flush_scan state to the left, repeat. */
13867+ ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
13868+ if (ret != 0) {
13869+ return ret;
13870+ }
13871+
13872+ } while (!reiser4_scan_finished(scan));
13873+
13874+	/* If neighbor is NULL then we reached the end of a formatted region, or the
13875+	   sibling is out of memory; now check for an extent to the left (but only at
13876+	   LEAF_LEVEL). */
13877+ if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
13878+ || reiser4_scan_finished(scan)) {
13879+ scan->stop = 1;
13880+ return 0;
13881+ }
13882+	/* Otherwise, call scan_by_coord for the right(left)most item of the
13883+	   left(right) neighbor on the parent level, then possibly continue. */
13884+
13885+ coord_init_invalid(&scan->parent_coord, NULL);
13886+ return scan_unformatted(scan, NULL);
13887+}
13888+
13889+/* NOTE-EDWARD:
13890+   This scans adjacent items of the same type and calls the item plugin's flush scan
13891+   method for each one. Performs left(right)ward scanning starting from a (possibly)
13892+   unformatted node. If we start from an unformatted node, then we continue only if the
13893+   next neighbor is also unformatted. When called from scan_formatted, we skip the first
13894+   iteration (to make sure that the right(left)most item of the left(right) neighbor on
13895+   the parent level is of the same type, and set the appropriate coord). */
13896+static int scan_by_coord(flush_scan * scan)
13897+{
13898+ int ret = 0;
13899+ int scan_this_coord;
13900+ lock_handle next_lock;
13901+ load_count next_load;
13902+ coord_t next_coord;
13903+ jnode *child;
13904+ item_plugin *iplug;
13905+
13906+ init_lh(&next_lock);
13907+ init_load_count(&next_load);
13908+ scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
13909+
13910+ /* set initial item id */
13911+ iplug = item_plugin_by_coord(&scan->parent_coord);
13912+
13913+ for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
13914+ if (scan_this_coord) {
13915+			/* Here we expect the unit to be scannable. It might not be,
13916+			 * due to a race with extent->tail conversion.  */
13917+ if (iplug->f.scan == NULL) {
13918+ scan->stop = 1;
13919+ ret = -E_REPEAT;
13920+ /* skip the check at the end. */
13921+ goto race;
13922+ }
13923+
13924+ ret = iplug->f.scan(scan);
13925+ if (ret != 0)
13926+ goto exit;
13927+
13928+ if (reiser4_scan_finished(scan)) {
13929+ checkchild(scan);
13930+ break;
13931+ }
13932+ } else {
13933+ /* the same race against truncate as above is possible
13934+ * here, it seems */
13935+
13936+ /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
13937+ the first coordinate. */
13938+ assert("jmacd-1231",
13939+ item_is_internal(&scan->parent_coord));
13940+ }
13941+
13942+ if (iplug->f.utmost_child == NULL
13943+ || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
13944+			/* stop this coord and continue on the parent level */
13945+			ret = scan_set_current(scan,
13946+					       ZJNODE(zref(scan->parent_coord.node)),
13947+					       1, NULL);
13950+ if (ret != 0)
13951+ goto exit;
13952+ break;
13953+ }
13954+
13955+ /* Either way, the invariant is that scan->parent_coord is set to the
13956+ parent of scan->node. Now get the next unit. */
13957+ coord_dup(&next_coord, &scan->parent_coord);
13958+ coord_sideof_unit(&next_coord, scan->direction);
13959+
13960+ /* If off-the-end of the twig, try the next twig. */
13961+ if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
13962+ /* We take the write lock because we may start flushing from this
13963+ * coordinate. */
13964+ ret = neighbor_in_slum(next_coord.node,
13965+ &next_lock,
13966+ scan->direction,
13967+ ZNODE_WRITE_LOCK,
13968+ 1 /* check dirty */,
13969+					       0 /* don't go through upper
13970+						    levels */);
13971+ if (ret == -E_NO_NEIGHBOR) {
13972+ scan->stop = 1;
13973+ ret = 0;
13974+ break;
13975+ }
13976+
13977+ if (ret != 0) {
13978+ goto exit;
13979+ }
13980+
13981+ ret = incr_load_count_znode(&next_load, next_lock.node);
13982+ if (ret != 0) {
13983+ goto exit;
13984+ }
13985+
13986+ coord_init_sideof_unit(&next_coord, next_lock.node,
13987+ sideof_reverse(scan->direction));
13988+ }
13989+
13990+ iplug = item_plugin_by_coord(&next_coord);
13991+
13992+ /* Get the next child. */
13993+		ret = iplug->f.utmost_child(&next_coord,
13994+					    sideof_reverse(scan->direction),
13995+					    &child);
13997+ if (ret != 0)
13998+ goto exit;
13999+		/* If the next child is not in memory, or item_utmost_child
14000+		   failed (most probably due to a race with unlink), stop
14001+		   here. */
14002+ if (child == NULL || IS_ERR(child)) {
14003+ scan->stop = 1;
14004+ checkchild(scan);
14005+ break;
14006+ }
14007+
14008+ assert("nikita-2374", jnode_is_unformatted(child)
14009+ || jnode_is_znode(child));
14010+
14011+ /* See if it is dirty, part of the same atom. */
14012+ if (!reiser4_scan_goto(scan, child)) {
14013+ checkchild(scan);
14014+ break;
14015+ }
14016+
14017+ /* If so, make this child current. */
14018+ ret = scan_set_current(scan, child, 1, &next_coord);
14019+ if (ret != 0)
14020+ goto exit;
14021+
14022+		/* Now continue. If the child is formatted we break out; the parent lock
14023+		   is released at exit and scanning proceeds with scan_formatted. */
14024+ if (jnode_is_znode(child))
14025+ break;
14026+
14027+ /* Otherwise, repeat the above loop with next_coord. */
14028+ if (next_load.node != NULL) {
14029+ done_lh(&scan->parent_lock);
14030+ move_lh(&scan->parent_lock, &next_lock);
14031+ move_load_count(&scan->parent_load, &next_load);
14032+ }
14033+ }
14034+
14035+ assert("jmacd-6233",
14036+ reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
14037+ exit:
14038+ checkchild(scan);
14039+ race: /* skip the above check */
14040+ if (jnode_is_znode(scan->node)) {
14041+ done_lh(&scan->parent_lock);
14042+ done_load_count(&scan->parent_load);
14043+ }
14044+
14045+ done_load_count(&next_load);
14046+ done_lh(&next_lock);
14047+ return ret;
14048+}
14049+
14050+/* FLUSH POS HELPERS */
14051+
14052+/* Initialize the fields of a flush_position. */
14053+static void pos_init(flush_pos_t * pos)
14054+{
14055+ memset(pos, 0, sizeof *pos);
14056+
14057+ pos->state = POS_INVALID;
14058+ coord_init_invalid(&pos->coord, NULL);
14059+ init_lh(&pos->lock);
14060+ init_load_count(&pos->load);
14061+
14062+ reiser4_blocknr_hint_init(&pos->preceder);
14063+}
14064+
14065+/* The flush loop inside squalloc periodically checks pos_valid to
14066+ determine when "enough flushing" has been performed. This will return true until one
14067+ of the following conditions is met:
14068+
14069+ 1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
14070+ parameter, meaning we have flushed as many blocks as the kernel requested. When
14071+ flushing to commit, this parameter is NULL.
14072+
14073+ 2. pos_stop() is called because squalloc discovers that the "next" node in the
14074+   flush order is either non-existent, not dirty, or not in the same atom.
14075+*/
14076+
14077+static int pos_valid(flush_pos_t * pos)
14078+{
14079+ return pos->state != POS_INVALID;
14080+}
14081+
14082+/* Release any resources of a flush_position. Called when jnode_flush finishes. */
14083+static void pos_done(flush_pos_t * pos)
14084+{
14085+ pos_stop(pos);
14086+ reiser4_blocknr_hint_done(&pos->preceder);
14087+ if (convert_data(pos))
14088+ free_convert_data(pos);
14089+}
14090+
14091+/* Reset the point and parent. Called during flush subroutines to terminate the
14092+ squalloc loop. */
14093+static int pos_stop(flush_pos_t * pos)
14094+{
14095+ pos->state = POS_INVALID;
14096+ done_lh(&pos->lock);
14097+ done_load_count(&pos->load);
14098+ coord_init_invalid(&pos->coord, NULL);
14099+
14100+ if (pos->child) {
14101+ jput(pos->child);
14102+ pos->child = NULL;
14103+ }
14104+
14105+ return 0;
14106+}
14107+
14108+/* Return the flush_position's block allocator hint. */
14109+reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos)
14110+{
14111+ return &pos->preceder;
14112+}
14113+
14114+flush_queue_t * reiser4_pos_fq(flush_pos_t * pos)
14115+{
14116+ return pos->fq;
14117+}
14118+
14119+/* Make Linus happy.
14120+ Local variables:
14121+ c-indentation-style: "K&R"
14122+ mode-name: "LC"
14123+ c-basic-offset: 8
14124+ tab-width: 8
14125+ fill-column: 90
14126+ LocalWords: preceder
14127+ End:
14128+*/
14129diff -urN linux-2.6.23.orig/fs/reiser4/flush.h linux-2.6.23/fs/reiser4/flush.h
14130--- linux-2.6.23.orig/fs/reiser4/flush.h 1970-01-01 03:00:00.000000000 +0300
14131+++ linux-2.6.23/fs/reiser4/flush.h 2007-12-04 16:49:30.000000000 +0300
14132@@ -0,0 +1,295 @@
14133+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14134+
14135+/* DECLARATIONS: */
14136+
14137+#if !defined(__REISER4_FLUSH_H__)
14138+#define __REISER4_FLUSH_H__
14139+
14140+#include "plugin/cluster.h"
14141+
14142+/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
14143+ single level of the tree. A flush-scan is used for counting the number of adjacent
14144+ nodes to flush, which is used to determine whether we should relocate, and it is also
14145+ used to find a starting point for flush. A flush-scan object can scan in both right
14146+ and left directions via the scan_left() and scan_right() interfaces. The
14147+ right- and left-variations are similar but perform different functions. When scanning
14148+ left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
14149+ When scanning right we are simply counting the number of adjacent, dirty nodes. */
14150+struct flush_scan {
14151+
14152+ /* The current number of nodes scanned on this level. */
14153+ unsigned count;
14154+
14155+	/* There may be a maximum number of nodes for a scan on any single level.  When
14156+	   going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h). */
14157+ unsigned max_count;
14158+
14159+ /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
14160+ sideof direction;
14161+
14162+	/* Initially @stop is set to false, then set to true once some condition stops the
14163+	   search (e.g., we found a clean node before reaching max_count, or we found a
14164+	   node belonging to another atom). */
14165+ int stop;
14166+
14167+ /* The current scan position. If @node is non-NULL then its reference count has
14168+ been incremented to reflect this reference. */
14169+ jnode *node;
14170+
14171+ /* A handle for zload/zrelse of current scan position node. */
14172+ load_count node_load;
14173+
14174+ /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
14175+ node is locked using this lock handle. The endpoint needs to be locked for
14176+ transfer to the flush_position object after scanning finishes. */
14177+ lock_handle node_lock;
14178+
14179+ /* When the position is unformatted, its parent, coordinate, and parent
14180+ zload/zrelse handle. */
14181+ lock_handle parent_lock;
14182+ coord_t parent_coord;
14183+ load_count parent_load;
14184+
14185+ /* The block allocator preceder hint. Sometimes flush_scan determines what the
14186+ preceder is and if so it sets it here, after which it is copied into the
14187+ flush_position. Otherwise, the preceder is computed later. */
14188+ reiser4_block_nr preceder_blk;
14189+};
14190+
14191+struct convert_item_info {
14192+ dc_item_stat d_cur; /* disk cluster state of the current item */
14193+ dc_item_stat d_next; /* disk cluster state of the next slum item */
14194+ struct inode *inode;
14195+ flow_t flow;
14196+};
14197+
14198+struct convert_info {
14199+	int count;		/* for terminating squalloc */
14200+ item_plugin *iplug; /* current item plugin */
14201+ struct convert_item_info *itm; /* current item info */
14202+ struct cluster_handle clust; /* transform cluster */
14203+};
14204+
14205+typedef enum flush_position_state {
14206+ POS_INVALID, /* Invalid or stopped pos, do not continue slum
14207+ * processing */
14208+ POS_ON_LEAF, /* pos points to already prepped, locked formatted node at
14209+ * leaf level */
14210+ POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used
14211+ * to traverse unformatted nodes */
14212+ POS_TO_LEAF, /* pos is being moved to leaf level */
14213+ POS_TO_TWIG, /* pos is being moved to twig level */
14214+	POS_END_OF_TWIG,	/* special case of POS_ON_EPOINT, when coord is after
14215+				 * the rightmost unit of the current twig */
14216+ POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */
14217+} flushpos_state_t;
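+
+/* A typical walk through a slum that mixes formatted and unformatted nodes
+ * moves through these states roughly as follows (illustrative only):
+ *
+ *	POS_ON_LEAF -> POS_TO_TWIG -> POS_ON_EPOINT -> POS_END_OF_TWIG
+ *		    -> POS_TO_LEAF -> POS_ON_LEAF -> ... -> POS_INVALID
+ */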
14218+
14219+/* An encapsulation of the current flush point and all the parameters that are passed
14220+ through the entire squeeze-and-allocate stage of the flush routine. A single
14221+ flush_position object is constructed after left- and right-scanning finishes. */
14222+struct flush_position {
14223+ flushpos_state_t state;
14224+
14225+ coord_t coord; /* coord to traverse unformatted nodes */
14226+ lock_handle lock; /* current lock we hold */
14227+ load_count load; /* load status for current locked formatted node */
14228+
14229+ jnode *child; /* for passing a reference to unformatted child
14230+ * across pos state changes */
14231+
14232+ reiser4_blocknr_hint preceder; /* The flush 'hint' state. */
14233+ int leaf_relocate; /* True if enough leaf-level nodes were
14234+ * found to suggest a relocate policy. */
14235+	int alloc_cnt;		/* The number of nodes allocated during squeeze and allocate. */
14236+ int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed. */
14237+ flush_queue_t *fq;
14238+ long *nr_written; /* number of nodes submitted to disk */
14239+ int flags; /* a copy of jnode_flush flags argument */
14240+
14241+ znode *prev_twig; /* previous parent pointer value, used to catch
14242+ * processing of new twig node */
14243+ struct convert_info *sq; /* convert info */
14244+
14245+ unsigned long pos_in_unit; /* for extents only. Position
14246+ within an extent unit of first
14247+ jnode of slum */
14248+ long nr_to_write; /* number of unformatted nodes to handle on flush */
14249+};
14250+
14251+static inline int item_convert_count(flush_pos_t * pos)
14252+{
14253+ return pos->sq->count;
14254+}
14255+static inline void inc_item_convert_count(flush_pos_t * pos)
14256+{
14257+ pos->sq->count++;
14258+}
14259+static inline void set_item_convert_count(flush_pos_t * pos, int count)
14260+{
14261+ pos->sq->count = count;
14262+}
14263+static inline item_plugin *item_convert_plug(flush_pos_t * pos)
14264+{
14265+ return pos->sq->iplug;
14266+}
14267+
14268+static inline struct convert_info *convert_data(flush_pos_t * pos)
14269+{
14270+ return pos->sq;
14271+}
14272+
14273+static inline struct convert_item_info *item_convert_data(flush_pos_t * pos)
14274+{
14275+ assert("edward-955", convert_data(pos));
14276+ return pos->sq->itm;
14277+}
14278+
14279+static inline struct tfm_cluster * tfm_cluster_sq(flush_pos_t * pos)
14280+{
14281+ return &pos->sq->clust.tc;
14282+}
14283+
14284+static inline struct tfm_stream * tfm_stream_sq(flush_pos_t * pos,
14285+ tfm_stream_id id)
14286+{
14287+ assert("edward-854", pos->sq != NULL);
14288+ return get_tfm_stream(tfm_cluster_sq(pos), id);
14289+}
14290+
14291+static inline int chaining_data_present(flush_pos_t * pos)
14292+{
14293+ return convert_data(pos) && item_convert_data(pos);
14294+}
14295+
14296+/* Returns true if the next node contains the next item of the disk cluster,
14297+   so the item convert data should be moved to the right slum neighbor.
14298+*/
14299+static inline int should_chain_next_node(flush_pos_t * pos)
14300+{
14301+ int result = 0;
14302+
14303+ assert("edward-1007", chaining_data_present(pos));
14304+
14305+ switch (item_convert_data(pos)->d_next) {
14306+ case DC_CHAINED_ITEM:
14307+ result = 1;
14308+ break;
14309+ case DC_AFTER_CLUSTER:
14310+ break;
14311+ default:
14312+ impossible("edward-1009", "bad state of next slum item");
14313+ }
14314+ return result;
14315+}
14316+
14317+/* update item state in a disk cluster to assign conversion mode */
14318+static inline void
14319+move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ )
14320+{
14321+
14322+ assert("edward-1010", chaining_data_present(pos));
14323+
14324+ if (this_node == 0) {
14325+ /* next item is on the right neighbor */
14326+ assert("edward-1011",
14327+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14328+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14329+ assert("edward-1012",
14330+ item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14331+
14332+ item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14333+ item_convert_data(pos)->d_next = DC_INVALID_STATE;
14334+ } else {
14335+ /* next item is on the same node */
14336+ assert("edward-1013",
14337+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14338+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14339+ assert("edward-1227",
14340+ item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14341+ item_convert_data(pos)->d_next == DC_INVALID_STATE);
14342+
14343+ item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14344+ item_convert_data(pos)->d_next = DC_INVALID_STATE;
14345+ }
14346+}
14347+
14348+static inline int should_convert_node(flush_pos_t * pos, znode * node)
14349+{
14350+ return znode_convertible(node);
14351+}
14352+
14353+/* true if there is attached convert item info */
14354+static inline int should_convert_next_node(flush_pos_t * pos)
14355+{
14356+ return convert_data(pos) && item_convert_data(pos);
14357+}
14358+
14359+#define SQUALLOC_THRESHOLD 256
14360+
14361+static inline int should_terminate_squalloc(flush_pos_t * pos)
14362+{
14363+ return convert_data(pos) &&
14364+ !item_convert_data(pos) &&
14365+ item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14366+}
14367+
14368+#if REISER4_DEBUG
14369+#define check_convert_info(pos) \
14370+do { \
14371+ if (unlikely(should_convert_next_node(pos))){ \
14372+ warning("edward-1006", "unprocessed chained data"); \
14373+ printk("d_cur = %d, d_next = %d, flow.len = %llu\n", \
14374+ item_convert_data(pos)->d_cur, \
14375+ item_convert_data(pos)->d_next, \
14376+ item_convert_data(pos)->flow.length); \
14377+ printk("inode %llu, size = %llu, cluster %lu\n", \
14378+ (unsigned long long)get_inode_oid \
14379+ (item_convert_data(pos)->inode), \
14380+ i_size_read(item_convert_data(pos)->inode), \
14381+ convert_data(pos)->clust.index); \
14382+ } \
14383+} while (0)
14384+#else
14385+#define check_convert_info(pos)
14386+#endif /* REISER4_DEBUG */
14387+
14388+void free_convert_data(flush_pos_t * pos);
14389+/* used in extent.c */
14390+int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14391+ const coord_t * parent);
14392+int reiser4_scan_finished(flush_scan * scan);
14393+int reiser4_scanning_left(flush_scan * scan);
14394+int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
14395+txn_atom *atom_locked_by_fq(flush_queue_t * fq);
14396+int reiser4_alloc_extent(flush_pos_t *flush_pos);
14397+squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14398+ reiser4_key *stop_key);
14399+extern int reiser4_init_fqs(void);
14400+extern void reiser4_done_fqs(void);
14401+
14402+#if REISER4_DEBUG
14403+
14404+extern void reiser4_check_fq(const txn_atom *atom);
14405+extern atomic_t flush_cnt;
14406+
14407+#define check_preceder(blk) \
14408+assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb()));
14409+extern void check_pos(flush_pos_t * pos);
14410+#else
14411+#define check_preceder(b) noop
14412+#define check_pos(pos) noop
14413+#endif
14414+
14415+/* __REISER4_FLUSH_H__ */
14416+#endif
14417+
14418+/* Make Linus happy.
14419+ Local variables:
14420+ c-indentation-style: "K&R"
14421+ mode-name: "LC"
14422+ c-basic-offset: 8
14423+ tab-width: 8
14424+ fill-column: 90
14425+ LocalWords: preceder
14426+ End:
14427+*/
14428diff -urN linux-2.6.23.orig/fs/reiser4/flush_queue.c linux-2.6.23/fs/reiser4/flush_queue.c
14429--- linux-2.6.23.orig/fs/reiser4/flush_queue.c 1970-01-01 03:00:00.000000000 +0300
14430+++ linux-2.6.23/fs/reiser4/flush_queue.c 2007-12-04 21:05:55.782803824 +0300
14431@@ -0,0 +1,680 @@
14432+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14433+
14434+#include "debug.h"
14435+#include "super.h"
14436+#include "txnmgr.h"
14437+#include "jnode.h"
14438+#include "znode.h"
14439+#include "page_cache.h"
14440+#include "wander.h"
14441+#include "vfs_ops.h"
14442+#include "writeout.h"
14443+#include "flush.h"
14444+
14445+#include <linux/bio.h>
14446+#include <linux/mm.h>
14447+#include <linux/pagemap.h>
14448+#include <linux/blkdev.h>
14449+#include <linux/writeback.h>
14450+
14451+/* A flush queue object is an accumulator for keeping jnodes prepared
14452+ by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14453+ kept on the flush queue until memory pressure or atom commit asks
14454+   flush queues to write some or all of their jnodes. */
14455+
14456+/*
14457+ LOCKING:
14458+
14459+   The fq->guard spin lock protects the fq->atom pointer and nothing else. The
14460+   fq->prepped list is protected by the atom spin lock and uses the following
14461+   locking:
14462+
14463+   there are two ways to protect the fq->prepped list for read-only traversal:
14464+
14465+   1. spin-lock the atom.
14466+   2. the fq is IN_USE and atom->nr_running_queues has been increased.
14467+
14468+ and one for list modification:
14469+
14470+ 1. atom is spin-locked and one condition is true: fq is IN_USE or
14471+ atom->nr_running_queues == 0.
14472+
14473+ The deadlock-safe order for flush queues and atoms is: first lock atom, then
14474+ lock flush queue, then lock jnode.
14475+*/
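+
+/* An illustrative sketch of the rules above (kept as a comment, not
+ * compiled); "fq" and the empty loop body are assumptions of the sketch:
+ *
+ *	txn_atom *atom = atom_locked_by_fq(fq);	// atom now spin-locked
+ *	if (atom != NULL) {
+ *		jnode *pos;
+ *		list_for_each_entry(pos, ATOM_FQ_LIST(fq), capture_link)
+ *			;			// read-only traversal, way 1
+ *		spin_unlock_atom(atom);
+ *	}
+ *
+ * Modification additionally requires fq_in_use(fq) or
+ * atom->nr_running_queues == 0 while the atom is locked; jnodes are locked
+ * last, per the atom -> fq -> jnode order. */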
14476+
14477+#define fq_in_use(fq) ((fq)->state & FQ_IN_USE)
14478+#define fq_ready(fq) (!fq_in_use(fq))
14479+
14480+#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0)
14481+#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0)
14482+
14483+/* get lock on atom from locked flush queue object */
14484+static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq)
14485+{
14486+ /* This code is similar to jnode_get_atom(), look at it for the
14487+ * explanation. */
14488+ txn_atom *atom;
14489+
14490+ assert_spin_locked(&(fq->guard));
14491+
14492+ while (1) {
14493+ atom = fq->atom;
14494+ if (atom == NULL)
14495+ break;
14496+
14497+ if (spin_trylock_atom(atom))
14498+ break;
14499+
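+		/* trylock failed: pin the atom with a reference, retake the
+		 * locks in the safe order (atom, then fq->guard), and then
+		 * recheck that the fq is still bound to the same atom */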
14500+ atomic_inc(&atom->refcount);
14501+ spin_unlock(&(fq->guard));
14502+ spin_lock_atom(atom);
14503+ spin_lock(&(fq->guard));
14504+
14505+ if (fq->atom == atom) {
14506+ atomic_dec(&atom->refcount);
14507+ break;
14508+ }
14509+
14510+ spin_unlock(&(fq->guard));
14511+ atom_dec_and_unlock(atom);
14512+ spin_lock(&(fq->guard));
14513+ }
14514+
14515+ return atom;
14516+}
14517+
14518+txn_atom *atom_locked_by_fq(flush_queue_t * fq)
14519+{
14520+ txn_atom *atom;
14521+
14522+ spin_lock(&(fq->guard));
14523+ atom = atom_locked_by_fq_nolock(fq);
14524+ spin_unlock(&(fq->guard));
14525+ return atom;
14526+}
14527+
14528+static void init_fq(flush_queue_t * fq)
14529+{
14530+ memset(fq, 0, sizeof *fq);
14531+
14532+ atomic_set(&fq->nr_submitted, 0);
14533+
14534+ INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
14535+
14536+ init_waitqueue_head(&fq->wait);
14537+ spin_lock_init(&fq->guard);
14538+}
14539+
14540+/* slab for flush queues */
14541+static struct kmem_cache *fq_slab;
14542+
14543+/**
14544+ * reiser4_init_fqs - create flush queue cache
14545+ *
14546+ * Initializes slab cache of flush queues. It is part of reiser4 module
14547+ * initialization.
14548+ */
14549+int reiser4_init_fqs(void)
14550+{
14551+ fq_slab = kmem_cache_create("fq",
14552+ sizeof(flush_queue_t),
14553+ 0, SLAB_HWCACHE_ALIGN, NULL);
14554+ if (fq_slab == NULL)
14555+ return RETERR(-ENOMEM);
14556+ return 0;
14557+}
14558+
14559+/**
14560+ * reiser4_done_fqs - delete flush queue cache
14561+ *
14562+ * This is called on reiser4 module unloading or system shutdown.
14563+ */
14564+void reiser4_done_fqs(void)
14565+{
14566+ destroy_reiser4_cache(&fq_slab);
14567+}
14568+
14569+/* create new flush queue object */
14570+static flush_queue_t *create_fq(gfp_t gfp)
14571+{
14572+ flush_queue_t *fq;
14573+
14574+ fq = kmem_cache_alloc(fq_slab, gfp);
14575+ if (fq)
14576+ init_fq(fq);
14577+
14578+ return fq;
14579+}
14580+
14581+/* adjust atom's and flush queue's counters of queued nodes */
14582+static void count_enqueued_node(flush_queue_t * fq)
14583+{
14584+ ON_DEBUG(fq->atom->num_queued++);
14585+}
14586+
14587+static void count_dequeued_node(flush_queue_t * fq)
14588+{
14589+ assert("zam-993", fq->atom->num_queued > 0);
14590+ ON_DEBUG(fq->atom->num_queued--);
14591+}
14592+
14593+/* attach flush queue object to the atom */
14594+static void attach_fq(txn_atom *atom, flush_queue_t *fq)
14595+{
14596+ assert_spin_locked(&(atom->alock));
14597+ list_add(&fq->alink, &atom->flush_queues);
14598+ fq->atom = atom;
14599+ ON_DEBUG(atom->nr_flush_queues++);
14600+}
14601+
14602+static void detach_fq(flush_queue_t * fq)
14603+{
14604+ assert_spin_locked(&(fq->atom->alock));
14605+
14606+ spin_lock(&(fq->guard));
14607+ list_del_init(&fq->alink);
14608+ assert("vs-1456", fq->atom->nr_flush_queues > 0);
14609+ ON_DEBUG(fq->atom->nr_flush_queues--);
14610+ fq->atom = NULL;
14611+ spin_unlock(&(fq->guard));
14612+}
14613+
14614+/* destroy flush queue object */
14615+static void done_fq(flush_queue_t * fq)
14616+{
14617+ assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
14618+ assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
14619+
14620+ kmem_cache_free(fq_slab, fq);
14621+}
14622+
14623+/* mark the jnode as queued and account for it */
14624+static void mark_jnode_queued(flush_queue_t * fq, jnode * node)
14625+{
14626+ JF_SET(node, JNODE_FLUSH_QUEUED);
14627+ count_enqueued_node(fq);
14628+}
14629+
14630+/* Putting jnode into the flush queue. Both atom and jnode should be
14631+ spin-locked. */
14632+void queue_jnode(flush_queue_t * fq, jnode * node)
14633+{
14634+ assert_spin_locked(&(node->guard));
14635+ assert("zam-713", node->atom != NULL);
14636+ assert_spin_locked(&(node->atom->alock));
14637+ assert("zam-716", fq->atom != NULL);
14638+ assert("zam-717", fq->atom == node->atom);
14639+ assert("zam-907", fq_in_use(fq));
14640+
14641+ assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
14642+ assert("zam-826", JF_ISSET(node, JNODE_RELOC));
14643+ assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
14644+ assert("vs-1481", NODE_LIST(node) != FQ_LIST);
14645+
14646+ mark_jnode_queued(fq, node);
14647+ list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
14648+
14649+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
14650+ FQ_LIST, 1));
14651+}
14652+
14653+/* repeatable step of waiting for i/o completion on a flush queue object */
14654+static int wait_io(flush_queue_t * fq, int *nr_io_errors)
14655+{
14656+ assert("zam-738", fq->atom != NULL);
14657+ assert_spin_locked(&(fq->atom->alock));
14658+ assert("zam-736", fq_in_use(fq));
14659+ assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
14660+
14661+ if (atomic_read(&fq->nr_submitted) != 0) {
14662+ struct super_block *super;
14663+
14664+ spin_unlock_atom(fq->atom);
14665+
14666+ assert("nikita-3013", reiser4_schedulable());
14667+
14668+ super = reiser4_get_current_sb();
14669+
14670+ /* FIXME: this is instead of blk_run_queues() */
14671+ blk_run_address_space(reiser4_get_super_fake(super)->i_mapping);
14672+
14673+ if (!(super->s_flags & MS_RDONLY))
14674+ wait_event(fq->wait, atomic_read(&fq->nr_submitted) == 0);
14675+
14676+ /* Ask the caller to re-acquire the locks and call this
14677+ function again. Note: this technique is commonly used in
14678+ the txnmgr code. */
14679+ return -E_REPEAT;
14680+ }
14681+
14682+ *nr_io_errors += atomic_read(&fq->nr_errors);
14683+ return 0;
14684+}
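
The -E_REPEAT returned above follows the txnmgr convention mentioned in the
comment: the callee drops the atom lock and the caller re-acquires it and
retries. An illustrative caller-side sketch (do_something() is a hypothetical
callee following the convention; current_atom_finish_all_fq() below is a real
instance):

    do {
            atom = get_current_atom_locked();  /* re-take the atom lock */
            ret = do_something(atom);          /* may unlock the atom and
                                                * return -E_REPEAT */
    } while (ret == -E_REPEAT);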
14685+
14686+/* wait on I/O completion, re-submit dirty nodes to write */
14687+static int finish_fq(flush_queue_t * fq, int *nr_io_errors)
14688+{
14689+ int ret;
14690+ txn_atom *atom = fq->atom;
14691+
14692+ assert("zam-801", atom != NULL);
14693+ assert_spin_locked(&(atom->alock));
14694+ assert("zam-762", fq_in_use(fq));
14695+
14696+ ret = wait_io(fq, nr_io_errors);
14697+ if (ret)
14698+ return ret;
14699+
14700+ detach_fq(fq);
14701+ done_fq(fq);
14702+
14703+ reiser4_atom_send_event(atom);
14704+
14705+ return 0;
14706+}
14707+
14708+/* wait for all i/o for the given atom to be completed; actually does one
14709+ iteration of that and returns -E_REPEAT if more iterations are needed */
14710+static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
14711+{
14712+ flush_queue_t *fq;
14713+
14714+ assert_spin_locked(&(atom->alock));
14715+
14716+ if (list_empty_careful(&atom->flush_queues))
14717+ return 0;
14718+
14719+ list_for_each_entry(fq, &atom->flush_queues, alink) {
14720+ if (fq_ready(fq)) {
14721+ int ret;
14722+
14723+ mark_fq_in_use(fq);
14724+ assert("vs-1247", fq->owner == NULL);
14725+ ON_DEBUG(fq->owner = current);
14726+ ret = finish_fq(fq, nr_io_errors);
14727+
14728+ if (*nr_io_errors)
14729+ reiser4_handle_error();
14730+
14731+ if (ret) {
14732+ reiser4_fq_put(fq);
14733+ return ret;
14734+ }
14735+
14736+ spin_unlock_atom(atom);
14737+
14738+ return -E_REPEAT;
14739+ }
14740+ }
14741+
14742+ /* All flush queues are in use; atom remains locked */
14743+ return -EBUSY;
14744+}
14745+
14746+/* wait for all i/o for the current atom */
14747+int current_atom_finish_all_fq(void)
14748+{
14749+ txn_atom *atom;
14750+ int nr_io_errors = 0;
14751+ int ret = 0;
14752+
14753+ do {
14754+ while (1) {
14755+ atom = get_current_atom_locked();
14756+ ret = finish_all_fq(atom, &nr_io_errors);
14757+ if (ret != -EBUSY)
14758+ break;
14759+ reiser4_atom_wait_event(atom);
14760+ }
14761+ } while (ret == -E_REPEAT);
14762+
14763+ /* we do not need the atom locked after this function finishes; SUCCESS
14764+ and -EBUSY are the two return codes for which the atom remains locked
14765+ after finish_all_fq */
14766+ if (!ret)
14767+ spin_unlock_atom(atom);
14768+
14769+ assert_spin_not_locked(&(atom->alock));
14770+
14771+ if (ret)
14772+ return ret;
14773+
14774+ if (nr_io_errors)
14775+ return RETERR(-EIO);
14776+
14777+ return 0;
14778+}
14779+
14780+/* change the node->atom field for all jnodes on the given list */
14781+static void
14782+scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
14783+{
14784+ jnode *cur;
14785+
14786+ list_for_each_entry(cur, list, capture_link) {
14787+ spin_lock_jnode(cur);
14788+ cur->atom = atom;
14789+ spin_unlock_jnode(cur);
14790+ }
14791+}
14792+
14793+/* support for atom fusion operation */
14794+void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
14795+{
14796+ flush_queue_t *fq;
14797+
14798+ assert_spin_locked(&(to->alock));
14799+ assert_spin_locked(&(from->alock));
14800+
14801+ list_for_each_entry(fq, &from->flush_queues, alink) {
14802+ scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
14803+ spin_lock(&(fq->guard));
14804+ fq->atom = to;
14805+ spin_unlock(&(fq->guard));
14806+ }
14807+
14808+ list_splice_init(&from->flush_queues, to->flush_queues.prev);
14809+
14810+#if REISER4_DEBUG
14811+ to->num_queued += from->num_queued;
14812+ to->nr_flush_queues += from->nr_flush_queues;
14813+ from->nr_flush_queues = 0;
14814+#endif
14815+}
14816+
14817+#if REISER4_DEBUG
14818+int atom_fq_parts_are_clean(txn_atom * atom)
14819+{
14820+ assert("zam-915", atom != NULL);
14821+ return list_empty_careful(&atom->flush_queues);
14822+}
14823+#endif
14824+/* Bio i/o completion routine for reiser4 write operations. */
14825+static int
14826+end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
14827+ int err)
14828+{
14829+ int i;
14830+ int nr_errors = 0;
14831+ flush_queue_t *fq;
14832+
14833+ assert("zam-958", bio->bi_rw & WRITE);
14834+
14835+ /* i/o op. is not fully completed */
14836+ if (bio->bi_size != 0)
14837+ return 1;
14838+
14839+ if (err == -EOPNOTSUPP)
14840+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
14841+
14842+ /* we expect that bio->bi_private is set either to NULL or to the fq
14843+ * object which is used for synchronization and error counting. */
14844+ fq = bio->bi_private;
14845+ /* Check all elements of io_vec for correct write completion. */
14846+ for (i = 0; i < bio->bi_vcnt; i += 1) {
14847+ struct page *pg = bio->bi_io_vec[i].bv_page;
14848+
14849+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
14850+ SetPageError(pg);
14851+ nr_errors++;
14852+ }
14853+
14854+ {
14855+ /* jnode WRITEBACK ("write is in progress bit") is
14856+ * atomically cleared here. */
14857+ jnode *node;
14858+
14859+ assert("zam-736", pg != NULL);
14860+ assert("zam-736", PagePrivate(pg));
14861+ node = jprivate(pg);
14862+
14863+ JF_CLR(node, JNODE_WRITEBACK);
14864+ }
14865+
14866+ end_page_writeback(pg);
14867+ page_cache_release(pg);
14868+ }
14869+
14870+ if (fq) {
14871+ /* count i/o error in fq object */
14872+ atomic_add(nr_errors, &fq->nr_errors);
14873+
14874+ /* If all write requests registered in this "fq" are done, wake
14875+ * up the waiter. */
14876+ if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
14877+ wake_up(&fq->wait);
14878+ }
14879+
14880+ bio_put(bio);
14881+ return 0;
14882+}
14883+
14884+/* Count the i/o requests which will be submitted by @bio in the given flush
14885+ queue @fq */
14886+void add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
14887+{
14888+ bio->bi_private = fq;
14889+ bio->bi_end_io = end_io_handler;
14890+
14891+ if (fq)
14892+ atomic_add(bio->bi_vcnt, &fq->nr_submitted);
14893+}
14894+
14895+/* Move all queued nodes out of the @fq->prepped list. */
14896+static void release_prepped_list(flush_queue_t * fq)
14897+{
14898+ txn_atom *atom;
14899+
14900+ assert("zam-904", fq_in_use(fq));
14901+ atom = atom_locked_by_fq(fq);
14902+
14903+ while (!list_empty(ATOM_FQ_LIST(fq))) {
14904+ jnode *cur;
14905+
14906+ cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
14907+ list_del_init(&cur->capture_link);
14908+
14909+ count_dequeued_node(fq);
14910+ spin_lock_jnode(cur);
14911+ assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
14912+ assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
14913+ assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
14914+ JF_CLR(cur, JNODE_FLUSH_QUEUED);
14915+
14916+ if (JF_ISSET(cur, JNODE_DIRTY)) {
14917+ list_add_tail(&cur->capture_link,
14918+ ATOM_DIRTY_LIST(atom, jnode_get_level(cur)));
14919+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14920+ DIRTY_LIST, 1));
14921+ } else {
14922+ list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom));
14923+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14924+ CLEAN_LIST, 1));
14925+ }
14926+
14927+ spin_unlock_jnode(cur);
14928+ }
14929+
14930+ if (--atom->nr_running_queues == 0)
14931+ reiser4_atom_send_event(atom);
14932+
14933+ spin_unlock_atom(atom);
14934+}
14935+
14936+/* Submit write requests for nodes on the already filled flush queue @fq.
14937+
14938+ @fq: flush queue object which contains jnodes we can (and will) write.
14939+ @return: number of submitted blocks (>=0) on success, otherwise an error
14940+ code (<0). */
14941+int reiser4_write_fq(flush_queue_t * fq, long *nr_submitted, int flags)
14942+{
14943+ int ret;
14944+ txn_atom *atom;
14945+
14946+ while (1) {
14947+ atom = atom_locked_by_fq(fq);
14948+ assert("zam-924", atom);
14949+ /* do not write fq in parallel. */
14950+ if (atom->nr_running_queues == 0
14951+ || !(flags & WRITEOUT_SINGLE_STREAM))
14952+ break;
14953+ reiser4_atom_wait_event(atom);
14954+ }
14955+
14956+ atom->nr_running_queues++;
14957+ spin_unlock_atom(atom);
14958+
14959+ ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
14960+ release_prepped_list(fq);
14961+
14962+ return ret;
14963+}
14964+
14965+/* Get a flush queue object for exclusive use by one thread. May require
14966+ several iterations, which is indicated by the -E_REPEAT return code.
14967+
14968+ This function does not contain code for obtaining an atom lock because an
14969+ atom lock is obtained in different ways in different parts of reiser4;
14970+ usually it is the current atom, but we also need to be able to get an fq
14971+ for the atom of a given jnode. */
14972+static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
14973+{
14974+ flush_queue_t *fq;
14975+
14976+ assert_spin_locked(&(atom->alock));
14977+
14978+ fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
14979+ while (&atom->flush_queues != &fq->alink) {
14980+ spin_lock(&(fq->guard));
14981+
14982+ if (fq_ready(fq)) {
14983+ mark_fq_in_use(fq);
14984+ assert("vs-1246", fq->owner == NULL);
14985+ ON_DEBUG(fq->owner = current);
14986+ spin_unlock(&(fq->guard));
14987+
14988+ if (*new_fq)
14989+ done_fq(*new_fq);
14990+
14991+ *new_fq = fq;
14992+
14993+ return 0;
14994+ }
14995+
14996+ spin_unlock(&(fq->guard));
14997+
14998+ fq = list_entry(fq->alink.next, flush_queue_t, alink);
14999+ }
15000+
15001+ /* Use previously allocated fq object */
15002+ if (*new_fq) {
15003+ mark_fq_in_use(*new_fq);
15004+ assert("vs-1248", (*new_fq)->owner == NULL);
15005+ ON_DEBUG((*new_fq)->owner = current);
15006+ attach_fq(atom, *new_fq);
15007+
15008+ return 0;
15009+ }
15010+
15011+ spin_unlock_atom(atom);
15012+
15013+ *new_fq = create_fq(gfp);
15014+
15015+ if (*new_fq == NULL)
15016+ return RETERR(-ENOMEM);
15017+
15018+ return RETERR(-E_REPEAT);
15019+}
15020+
15021+int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
15022+{
15023+ return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
15024+}
15025+
15026+/* A wrapper around reiser4_fq_by_atom for getting a flush queue
15027+ object for the current atom; on success fq->atom remains locked. */
15028+flush_queue_t *get_fq_for_current_atom(void)
15029+{
15030+ flush_queue_t *fq = NULL;
15031+ txn_atom *atom;
15032+ int ret;
15033+
15034+ do {
15035+ atom = get_current_atom_locked();
15036+ ret = reiser4_fq_by_atom(atom, &fq);
15037+ } while (ret == -E_REPEAT);
15038+
15039+ if (ret)
15040+ return ERR_PTR(ret);
15041+ return fq;
15042+}
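
Taken together, a single writeout pass over the current atom might look like
the following sketch (illustrative only; locking details and error handling
are abbreviated, and the queue is assumed to be filled elsewhere, e.g. by
jnode_flush() via queue_jnode()):

    flush_queue_t *fq;
    long nr_submitted = 0;
    int ret;

    fq = get_fq_for_current_atom();  /* on success fq->atom is locked */
    if (IS_ERR(fq))
            return PTR_ERR(fq);
    spin_unlock_atom(fq->atom);
    /* ... nodes are queued onto fq ... */
    ret = reiser4_write_fq(fq, &nr_submitted, WRITEOUT_SINGLE_STREAM);
    reiser4_fq_put(fq);              /* hand the queue back */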
15043+
15044+/* Releasing flush queue object after exclusive use */
15045+void reiser4_fq_put_nolock(flush_queue_t *fq)
15046+{
15047+ assert("zam-747", fq->atom != NULL);
15048+ assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
15049+ mark_fq_ready(fq);
15050+ assert("vs-1245", fq->owner == current);
15051+ ON_DEBUG(fq->owner = NULL);
15052+}
15053+
15054+void reiser4_fq_put(flush_queue_t * fq)
15055+{
15056+ txn_atom *atom;
15057+
15058+ spin_lock(&(fq->guard));
15059+ atom = atom_locked_by_fq_nolock(fq);
15060+
15061+ assert("zam-746", atom != NULL);
15062+
15063+ reiser4_fq_put_nolock(fq);
15064+ reiser4_atom_send_event(atom);
15065+
15066+ spin_unlock(&(fq->guard));
15067+ spin_unlock_atom(atom);
15068+}
15069+
15070+/* A part of atom object initialization related to the embedded flush queue
15071+ list head */
15072+
15073+void init_atom_fq_parts(txn_atom *atom)
15074+{
15075+ INIT_LIST_HEAD(&atom->flush_queues);
15076+}
15077+
15078+#if REISER4_DEBUG
15079+
15080+void reiser4_check_fq(const txn_atom *atom)
15081+{
15082+ /* check number of nodes on all atom's flush queues */
15083+ flush_queue_t *fq;
15084+ int count;
15085+ struct list_head *pos;
15086+
15087+ count = 0;
15088+ list_for_each_entry(fq, &atom->flush_queues, alink) {
15089+ spin_lock(&(fq->guard));
15090+ /* calculate number of jnodes on fq's list of prepped jnodes */
15091+ list_for_each(pos, ATOM_FQ_LIST(fq))
15092+ count++;
15093+ spin_unlock(&(fq->guard));
15094+ }
15095+ if (count != atom->fq)
15096+ warning("", "fq counter %d, real %d\n", atom->fq, count);
15097+
15098+}
15099+
15100+#endif
15101+
15102+/*
15103+ * Local variables:
15104+ * c-indentation-style: "K&R"
15105+ * mode-name: "LC"
15106+ * c-basic-offset: 8
15107+ * tab-width: 8
15108+ * fill-column: 79
15109+ * scroll-step: 1
15110+ * End:
15111+ */
15112diff -urN linux-2.6.23.orig/fs/reiser4/forward.h linux-2.6.23/fs/reiser4/forward.h
15113--- linux-2.6.23.orig/fs/reiser4/forward.h 1970-01-01 03:00:00.000000000 +0300
15114+++ linux-2.6.23/fs/reiser4/forward.h 2007-12-04 16:49:30.000000000 +0300
15115@@ -0,0 +1,252 @@
15116+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15117+
15118+/* Forward declarations. Thank you Kernighan. */
15119+
15120+#if !defined( __REISER4_FORWARD_H__ )
15121+#define __REISER4_FORWARD_H__
15122+
15123+#include <asm/errno.h>
15124+#include <linux/types.h>
15125+
15126+typedef struct zlock zlock;
15127+typedef struct lock_stack lock_stack;
15128+typedef struct lock_handle lock_handle;
15129+typedef struct znode znode;
15130+typedef struct flow flow_t;
15131+typedef struct coord coord_t;
15132+typedef struct tree_access_pointer tap_t;
15133+typedef struct reiser4_object_create_data reiser4_object_create_data;
15134+typedef union reiser4_plugin reiser4_plugin;
15135+typedef __u16 reiser4_plugin_id;
15136+typedef __u64 reiser4_plugin_groups;
15137+typedef struct item_plugin item_plugin;
15138+typedef struct jnode_plugin jnode_plugin;
15139+typedef struct reiser4_item_data reiser4_item_data;
15140+typedef union reiser4_key reiser4_key;
15141+typedef struct reiser4_tree reiser4_tree;
15142+typedef struct carry_cut_data carry_cut_data;
15143+typedef struct carry_kill_data carry_kill_data;
15144+typedef struct carry_tree_op carry_tree_op;
15145+typedef struct carry_tree_node carry_tree_node;
15146+typedef struct carry_plugin_info carry_plugin_info;
15147+typedef struct reiser4_journal reiser4_journal;
15148+typedef struct txn_atom txn_atom;
15149+typedef struct txn_handle txn_handle;
15150+typedef struct txn_mgr txn_mgr;
15151+typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15152+typedef struct reiser4_context reiser4_context;
15153+typedef struct carry_level carry_level;
15154+typedef struct blocknr_set_entry blocknr_set_entry;
15155+/* super_block->s_fs_info points to this */
15156+typedef struct reiser4_super_info_data reiser4_super_info_data;
15157+/* next two objects are fields of reiser4_super_info_data */
15158+typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15159+typedef struct reiser4_space_allocator reiser4_space_allocator;
15160+
15161+typedef struct flush_scan flush_scan;
15162+typedef struct flush_position flush_pos_t;
15163+
15164+typedef unsigned short pos_in_node_t;
15165+#define MAX_POS_IN_NODE 65535
15166+
15167+typedef struct jnode jnode;
15168+typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15169+
15170+typedef struct uf_coord uf_coord_t;
15171+typedef struct hint hint_t;
15172+
15173+typedef struct ktxnmgrd_context ktxnmgrd_context;
15174+
15175+struct inode;
15176+struct page;
15177+struct file;
15178+struct dentry;
15179+struct super_block;
15180+
15181+/* return values of coord_by_key(). cbk == coord_by_key */
15182+typedef enum {
15183+ CBK_COORD_FOUND = 0,
15184+ CBK_COORD_NOTFOUND = -ENOENT,
15185+} lookup_result;
15186+
15187+/* results of lookup with directory file */
15188+typedef enum {
15189+ FILE_NAME_FOUND = 0,
15190+ FILE_NAME_NOTFOUND = -ENOENT,
15191+ FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15192+ FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15193+} file_lookup_result;
15194+
15195+/* behaviors of lookup. If the coord we are looking for is actually in the
15196+ tree, both coincide. */
15197+typedef enum {
15198+ /* search exactly for the coord with key given */
15199+ FIND_EXACT,
15200+ /* search for coord with the maximal key not greater than one
15201+ given */
15202+ FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */
15203+} lookup_bias;
15204+
15205+typedef enum {
15206+ /* level number of the leaf level of the tree.
15207+ The fake root has (tree_level=0). */
15208+ LEAF_LEVEL = 1,
15209+
15210+ /* level number of the level one above the leaf level of the tree.
15211+
15212+ It is supposed that internal tree used by reiser4 to store file
15213+ system data and meta data will have height 2 initially (when
15214+ created by mkfs).
15215+ */
15216+ TWIG_LEVEL = 2,
15217+} tree_level;
15218+
15219+/* The "real" maximum ztree height is the 0-origin size of any per-level
15220+ array, since the zeroth level is not used. */
15221+#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15222+
15223+/* enumeration of possible mutual position of item and coord. This enum is
15224+ return type of ->is_in_item() item plugin method which see. */
15225+typedef enum {
15226+ /* coord is on the left of an item */
15227+ IP_ON_THE_LEFT,
15228+ /* coord is inside item */
15229+ IP_INSIDE,
15230+ /* coord is inside item, but to the right of the rightmost unit of
15231+ this item */
15232+ IP_RIGHT_EDGE,
15233+ /* coord is on the right of an item */
15234+ IP_ON_THE_RIGHT
15235+} interposition;
15236+
15237+/* type of lock to acquire on znode before returning it to caller */
15238+typedef enum {
15239+ ZNODE_NO_LOCK = 0,
15240+ ZNODE_READ_LOCK = 1,
15241+ ZNODE_WRITE_LOCK = 2,
15242+} znode_lock_mode;
15243+
15244+/* type of lock request */
15245+typedef enum {
15246+ ZNODE_LOCK_LOPRI = 0,
15247+ ZNODE_LOCK_HIPRI = (1 << 0),
15248+
15249+ /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
15250+ waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately
15251+ return the value -E_REPEAT. */
15252+ ZNODE_LOCK_NONBLOCK = (1 << 1),
15253+ /* An option for longterm_lock_znode which prevents atom fusion */
15254+ ZNODE_LOCK_DONT_FUSE = (1 << 2)
15255+} znode_lock_request;
15256+
15257+typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15258+
15259+/* used to specify direction of shift. These must be -1 and 1 */
15260+typedef enum {
15261+ SHIFT_LEFT = 1,
15262+ SHIFT_RIGHT = -1
15263+} shift_direction;
15264+
15265+typedef enum {
15266+ LEFT_SIDE,
15267+ RIGHT_SIDE
15268+} sideof;
15269+
15270+#define round_up( value, order ) \
15271+ ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \
15272+ ~( ( order ) - 1 ) ) )
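
For example, assuming order is a power of two (which the mask arithmetic
requires):

    round_up(100, 64);   /* == 128 */
    round_up(128, 64);   /* == 128: already-aligned values are unchanged */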
15273+
15274+/* values returned by squalloc_right_neighbor and its auxiliary functions */
15275+typedef enum {
15276+ /* unit of internal item is moved */
15277+ SUBTREE_MOVED = 0,
15278+ /* nothing else can be squeezed into left neighbor */
15279+ SQUEEZE_TARGET_FULL = 1,
15280+ /* all content of node is squeezed into its left neighbor */
15281+ SQUEEZE_SOURCE_EMPTY = 2,
15282+ /* one more item is copied (this is only returned by
15283+ allocate_and_copy_extent to squalloc_twig) */
15284+ SQUEEZE_CONTINUE = 3
15285+} squeeze_result;
15286+
15287+/* Do not change item ids. If you do, there will be a format change */
15288+typedef enum {
15289+ STATIC_STAT_DATA_ID = 0x0,
15290+ SIMPLE_DIR_ENTRY_ID = 0x1,
15291+ COMPOUND_DIR_ID = 0x2,
15292+ NODE_POINTER_ID = 0x3,
15293+ EXTENT_POINTER_ID = 0x5,
15294+ FORMATTING_ID = 0x6,
15295+ CTAIL_ID = 0x7,
15296+ BLACK_BOX_ID = 0x8,
15297+ LAST_ITEM_ID = 0x9
15298+} item_id;
15299+
15300+/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
15301+ whether commit() was called or VM memory pressure was applied. */
15302+typedef enum {
15303+ /* submit flush queue to disk at jnode_flush completion */
15304+ JNODE_FLUSH_WRITE_BLOCKS = 1,
15305+
15306+ /* flush is called for commit */
15307+ JNODE_FLUSH_COMMIT = 2,
15308+ /* not implemented */
15309+ JNODE_FLUSH_MEMORY_FORMATTED = 4,
15310+
15311+ /* not implemented */
15312+ JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15313+} jnode_flush_flags;
15314+
15315+/* Flags to insert/paste carry operations. Currently they are only used in
15316+ the flushing code, but in the future they can be used to optimize for
15317+ repetitive accesses. */
15318+typedef enum {
15319+ /* carry is not allowed to shift data to the left when trying to find
15320+ free space */
15321+ COPI_DONT_SHIFT_LEFT = (1 << 0),
15322+ /* carry is not allowed to shift data to the right when trying to find
15323+ free space */
15324+ COPI_DONT_SHIFT_RIGHT = (1 << 1),
15325+ /* carry is not allowed to allocate new node(s) when trying to find
15326+ free space */
15327+ COPI_DONT_ALLOCATE = (1 << 2),
15328+ /* try to load the left neighbor if it is not in the cache */
15329+ COPI_LOAD_LEFT = (1 << 3),
15330+ /* try to load the right neighbor if it is not in the cache */
15331+ COPI_LOAD_RIGHT = (1 << 4),
15332+ /* shift insertion point to the left neighbor */
15333+ COPI_GO_LEFT = (1 << 5),
15334+ /* shift insertion point to the right neighbor */
15335+ COPI_GO_RIGHT = (1 << 6),
15336+ /* try to step back into original node if insertion into new node
15337+ fails after shifting data there. */
15338+ COPI_STEP_BACK = (1 << 7)
15339+} cop_insert_flag;
15340+
15341+typedef enum {
15342+ SAFE_UNLINK, /* safe-link for unlink */
15343+ SAFE_TRUNCATE /* safe-link for truncate */
15344+} reiser4_safe_link_t;
15345+
15346+/* this shows which of the atom's lists a jnode is on */
15347+typedef enum {
15348+ NOT_CAPTURED,
15349+ DIRTY_LIST,
15350+ CLEAN_LIST,
15351+ FQ_LIST,
15352+ WB_LIST,
15353+ OVRWR_LIST
15354+} atom_list;
15355+
15356+/* __REISER4_FORWARD_H__ */
15357+#endif
15358+
15359+/* Make Linus happy.
15360+ Local variables:
15361+ c-indentation-style: "K&R"
15362+ mode-name: "LC"
15363+ c-basic-offset: 8
15364+ tab-width: 8
15365+ fill-column: 120
15366+ End:
15367+*/
15368diff -urN linux-2.6.23.orig/fs/reiser4/fsdata.c linux-2.6.23/fs/reiser4/fsdata.c
15369--- linux-2.6.23.orig/fs/reiser4/fsdata.c 1970-01-01 03:00:00.000000000 +0300
15370+++ linux-2.6.23/fs/reiser4/fsdata.c 2007-12-04 16:49:30.000000000 +0300
15371@@ -0,0 +1,804 @@
15372+/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15373+ * reiser4/README */
15374+
15375+#include "fsdata.h"
15376+#include "inode.h"
15377+
15378+
15379+/* cache of dir_cursors */
15380+static struct kmem_cache *d_cursor_cache;
15381+
15382+/* list of unused cursors */
15383+static LIST_HEAD(cursor_cache);
15384+
15385+/* number of cursors in the list of unused cursors */
15386+static unsigned long d_cursor_unused = 0;
15387+
15388+/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15389+DEFINE_SPINLOCK(d_lock);
15390+
15391+static reiser4_file_fsdata *create_fsdata(struct file *file);
15392+static int file_is_stateless(struct file *file);
15393+static void free_fsdata(reiser4_file_fsdata *fsdata);
15394+static void kill_cursor(dir_cursor *);
15395+
15396+/**
15397+ * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15398+ * @nr: number of objects to free
15399+ * @mask: GFP mask
15400+ *
15401+ * Shrinks d_cursor_cache. Scans the LRU list of unused cursors, freeing the
15402+ * requested number. Returns the number of still freeable cursors.
15403+ */
15404+static int d_cursor_shrink(int nr, gfp_t mask)
15405+{
15406+ if (nr != 0) {
15407+ dir_cursor *scan;
15408+ int killed;
15409+
15410+ killed = 0;
15411+ spin_lock(&d_lock);
15412+ while (!list_empty(&cursor_cache)) {
15413+ scan = list_entry(cursor_cache.next, dir_cursor, alist);
15414+ assert("nikita-3567", scan->ref == 0);
15415+ kill_cursor(scan);
15416+ ++killed;
15417+ --nr;
15418+ if (nr == 0)
15419+ break;
15420+ }
15421+ spin_unlock(&d_lock);
15422+ }
15423+ return d_cursor_unused;
15424+}
15425+
15426+/*
15427+ * actually, d_cursors are "priceless", because there is no way to
15428+ * recover information stored in them. On the other hand, we don't
15429+ * want them to consume all kernel memory. As a compromise, just
15430+ * assign higher "seeks" value to d_cursor cache, so that it will be
15431+ * shrunk only if system is really tight on memory.
15432+ */
15433+static struct shrinker d_cursor_shrinker = {
15434+ .shrink = d_cursor_shrink,
15435+ .seeks = DEFAULT_SEEKS << 3,
15436+};
15437+
15438+/**
15439+ * reiser4_init_d_cursor - create d_cursor cache
15440+ *
15441+ * Initializes slab cache of d_cursors. It is part of reiser4 module
15442+ * initialization.
15443+ */
15444+int reiser4_init_d_cursor(void)
15445+{
15446+ d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15447+ SLAB_HWCACHE_ALIGN, NULL);
15448+ if (d_cursor_cache == NULL)
15449+ return RETERR(-ENOMEM);
15450+
15451+ register_shrinker(&d_cursor_shrinker);
15452+ return 0;
15453+}
15454+
15455+/**
15456+ * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
15457+ *
15458+ * This is called on reiser4 module unloading or system shutdown.
15459+ */
15460+void reiser4_done_d_cursor(void)
15461+{
15462+ unregister_shrinker(&d_cursor_shrinker);
15463+
15464+ destroy_reiser4_cache(&d_cursor_cache);
15465+}
15466+
15467+#define D_CURSOR_TABLE_SIZE (256)
15468+
15469+static inline unsigned long
15470+d_cursor_hash(d_cursor_hash_table *table, const struct d_cursor_key *key)
15471+{
15472+ assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15473+ return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15474+}
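
Since D_CURSOR_TABLE_SIZE is a power of two, the mask keeps the low eight
bits of the sum. For example (values hypothetical): oid == 1000 and cid == 3
hash to (1000 + 3) & 255 == 235.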
15475+
15476+static inline int d_cursor_eq(const struct d_cursor_key *k1,
15477+ const struct d_cursor_key *k2)
15478+{
15479+ return k1->cid == k2->cid && k1->oid == k2->oid;
15480+}
15481+
15482+/*
15483+ * define functions to manipulate reiser4 super block's hash table of
15484+ * dir_cursors
15485+ */
15486+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
15487+#define KFREE(ptr, size) kfree(ptr)
15488+TYPE_SAFE_HASH_DEFINE(d_cursor,
15489+ dir_cursor,
15490+ struct d_cursor_key,
15491+ key, hash, d_cursor_hash, d_cursor_eq);
15492+#undef KFREE
15493+#undef KMALLOC
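
Judging from the call sites in this file, the TYPE_SAFE_HASH_DEFINE()
invocation above generates at least the following typed helpers (a sketch of
their use, not an exhaustive list):

    d_cursor_hash_init(&info->table, D_CURSOR_TABLE_SIZE);
    d_cursor_hash_insert(&info->table, cursor);
    cursor = d_cursor_hash_find(&info->table, &key);
    d_cursor_hash_remove(&info->table, cursor);
    d_cursor_hash_done(&info->table);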
15494+
15495+/**
15496+ * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
15497+ * @super: super block to initialize
15498+ *
15499+ * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15500+ * of mount.
15501+ */
15502+int reiser4_init_super_d_info(struct super_block *super)
15503+{
15504+ struct d_cursor_info *p;
15505+
15506+ p = &get_super_private(super)->d_info;
15507+
15508+ INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
15509+ return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
15510+}
15511+
15512+/**
15513+ * reiser4_done_super_d_info - release per-super-block d_cursor resources
15514+ * @super: super block being umounted
15515+ *
15516+ * It is called on umount. Kills all directory cursors attached to super block.
15517+ */
15518+void reiser4_done_super_d_info(struct super_block *super)
15519+{
15520+ struct d_cursor_info *d_info;
15521+ dir_cursor *cursor, *next;
15522+
15523+ d_info = &get_super_private(super)->d_info;
15524+ for_all_in_htable(&d_info->table, d_cursor, cursor, next)
15525+ kill_cursor(cursor);
15526+
15527+ BUG_ON(d_info->tree.rnode != NULL);
15528+ d_cursor_hash_done(&d_info->table);
15529+}
15530+
15531+/**
15532+ * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
15533+ * @cursor: cursor to free
15534+ *
15535+ * Removes reiser4_file_fsdata attached to @cursor from readdir list of
15536+ * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from the
15537+ * indices, hash table and list of unused cursors, and frees it.
15538+ */
15539+static void kill_cursor(dir_cursor *cursor)
15540+{
15541+ unsigned long index;
15542+
15543+ assert("nikita-3566", cursor->ref == 0);
15544+ assert("nikita-3572", cursor->fsdata != NULL);
15545+
15546+ index = (unsigned long)cursor->key.oid;
15547+ list_del_init(&cursor->fsdata->dir.linkage);
15548+ free_fsdata(cursor->fsdata);
15549+ cursor->fsdata = NULL;
15550+
15551+ if (list_empty_careful(&cursor->list))
15552+ /* this is last cursor for a file. Kill radix-tree entry */
15553+ radix_tree_delete(&cursor->info->tree, index);
15554+ else {
15555+ void **slot;
15556+
15557+ /*
15558+ * there are other cursors for the same oid.
15559+ */
15560+
15561+ /*
15562+ * if the radix tree points to the cursor being removed,
15563+ * re-target the radix tree slot to the next cursor in the
15564+ * (non-empty, as was checked above) circular list of all
15565+ * cursors for this oid.
15566+ */
15567+ slot = radix_tree_lookup_slot(&cursor->info->tree, index);
15568+ assert("nikita-3571", *slot != NULL);
15569+ if (*slot == cursor)
15570+ *slot = list_entry(cursor->list.next, dir_cursor, list);
15571+ /* remove cursor from circular list */
15572+ list_del_init(&cursor->list);
15573+ }
15574+ /* remove cursor from the list of unused cursors */
15575+ list_del_init(&cursor->alist);
15576+ /* remove cursor from the hash table */
15577+ d_cursor_hash_remove(&cursor->info->table, cursor);
15578+ /* and free it */
15579+ kmem_cache_free(d_cursor_cache, cursor);
15580+ --d_cursor_unused;
15581+}
15582+
15583+/* possible actions that can be performed on all cursors for the given file */
15584+enum cursor_action {
15585+ /*
15586+ * load all detached state: this is called when stat-data is loaded
15587+ * from the disk to recover information about all pending readdirs
15588+ */
15589+ CURSOR_LOAD,
15590+ /*
15591+ * detach all state from inode, leaving it in the cache. This is called
15592+ * when the inode is removed from memory by memory pressure
15593+ */
15594+ CURSOR_DISPOSE,
15595+ /*
15596+ * detach cursors from the inode, and free them. This is called when
15597+ * inode is destroyed
15598+ */
15599+ CURSOR_KILL
15600+};
15601+
15602+/*
15603+ * return d_cursor data for the file system @inode is in.
15604+ */
15605+static inline struct d_cursor_info *d_info(struct inode *inode)
15606+{
15607+ return &get_super_private(inode->i_sb)->d_info;
15608+}
15609+
15610+/*
15611+ * lookup d_cursor in the per-super-block radix tree.
15612+ */
15613+static inline dir_cursor *lookup(struct d_cursor_info * info,
15614+ unsigned long index)
15615+{
15616+ return (dir_cursor *) radix_tree_lookup(&info->tree, index);
15617+}
15618+
15619+/*
15620+ * attach @cursor to the radix tree. There may be multiple cursors for the
15621+ * same oid; they are chained into a circular list.
15622+ */
15623+static void bind_cursor(dir_cursor * cursor, unsigned long index)
15624+{
15625+ dir_cursor *head;
15626+
15627+ head = lookup(cursor->info, index);
15628+ if (head == NULL) {
15629+ /* this is the first cursor for this index */
15630+ INIT_LIST_HEAD(&cursor->list);
15631+ radix_tree_insert(&cursor->info->tree, index, cursor);
15632+ } else {
15633+ /* some cursor already exists. Chain ours */
15634+ list_add(&cursor->list, &head->list);
15635+ }
15636+}
15637+
15638+/*
15639+ * detach fsdata (if detachable) from file descriptor, and put cursor on the
15640+ * "unused" list. Called when the file descriptor is no longer in active use.
15641+ */
15642+static void clean_fsdata(struct file *file)
15643+{
15644+ dir_cursor *cursor;
15645+ reiser4_file_fsdata *fsdata;
15646+
15647+ assert("nikita-3570", file_is_stateless(file));
15648+
15649+ fsdata = (reiser4_file_fsdata *) file->private_data;
15650+ if (fsdata != NULL) {
15651+ cursor = fsdata->cursor;
15652+ if (cursor != NULL) {
15653+ spin_lock(&d_lock);
15654+ --cursor->ref;
15655+ if (cursor->ref == 0) {
15656+ list_add_tail(&cursor->alist, &cursor_cache);
15657+ ++d_cursor_unused;
15658+ }
15659+ spin_unlock(&d_lock);
15660+ file->private_data = NULL;
15661+ }
15662+ }
15663+}
15664+
15665+/*
15666+ * global counter used to generate "client ids". These ids are encoded into
15667+ * high bits of fpos.
15668+ */
15669+static __u32 cid_counter = 0;
15670+#define CID_SHIFT (20)
15671+#define CID_MASK (0xfffffull)
15672+
15673+static void free_file_fsdata_nolock(struct file *);
15674+
15675+/**
15676+ * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
15677+ * @cursor:
15678+ * @file:
15679+ * @inode:
15680+ *
15681+ * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts the cursor
15682+ * into the reiser4 super block's hash table and radix tree, and adds
15683+ * detachable readdir
15684+ * state to @file.
15685+ */
15686+static int insert_cursor(dir_cursor *cursor, struct file *file,
15687+ struct inode *inode)
15688+{
15689+ int result;
15690+ reiser4_file_fsdata *fsdata;
15691+
15692+ memset(cursor, 0, sizeof *cursor);
15693+
15694+ /* this is either first call to readdir, or rewind. Anyway, create new
15695+ * cursor. */
15696+ fsdata = create_fsdata(NULL);
15697+ if (fsdata != NULL) {
15698+ result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
15699+ if (result == 0) {
15700+ struct d_cursor_info *info;
15701+ oid_t oid;
15702+
15703+ info = d_info(inode);
15704+ oid = get_inode_oid(inode);
15705+ /* cid occupies higher 12 bits of f->f_pos. Don't
15706+ * allow it to become negative: this confuses
15707+ * nfsd_readdir() */
15708+ cursor->key.cid = (++cid_counter) & 0x7ff;
15709+ cursor->key.oid = oid;
15710+ cursor->fsdata = fsdata;
15711+ cursor->info = info;
15712+ cursor->ref = 1;
15713+
15714+ spin_lock_inode(inode);
15715+ /* install cursor as @f's private_data, discarding old
15716+ * one if necessary */
15717+#if REISER4_DEBUG
15718+ if (file->private_data)
15719+ warning("", "file has fsdata already");
15720+#endif
15721+ clean_fsdata(file);
15722+ free_file_fsdata_nolock(file);
15723+ file->private_data = fsdata;
15724+ fsdata->cursor = cursor;
15725+ spin_unlock_inode(inode);
15726+ spin_lock(&d_lock);
15727+ /* insert cursor into hash table */
15728+ d_cursor_hash_insert(&info->table, cursor);
15729+ /* and chain it into radix-tree */
15730+ bind_cursor(cursor, (unsigned long)oid);
15731+ spin_unlock(&d_lock);
15732+ radix_tree_preload_end();
15733+ file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
15734+ }
15735+ } else
15736+ result = RETERR(-ENOMEM);
15737+ return result;
15738+}
15739+
15740+/**
15741+ * process_cursors - do action on each cursor attached to inode
15742+ * @inode:
15743+ * @act: action to do
15744+ *
15745+ * Finds all cursors of @inode in reiser4's super block radix tree of cursors
15746+ * and performs action specified by @act on each of cursors.
15747+ */
15748+static void process_cursors(struct inode *inode, enum cursor_action act)
15749+{
15750+ oid_t oid;
15751+ dir_cursor *start;
15752+ struct list_head *head;
15753+ reiser4_context *ctx;
15754+ struct d_cursor_info *info;
15755+
15756+ /* this can be called by
15757+ *
15758+ * kswapd->...->prune_icache->..reiser4_destroy_inode
15759+ *
15760+ * without reiser4_context
15761+ */
15762+ ctx = reiser4_init_context(inode->i_sb);
15763+ if (IS_ERR(ctx)) {
15764+ warning("vs-23", "failed to init context");
15765+ return;
15766+ }
15767+
15768+ assert("nikita-3558", inode != NULL);
15769+
15770+ info = d_info(inode);
15771+ oid = get_inode_oid(inode);
15772+ spin_lock_inode(inode);
15773+ head = get_readdir_list(inode);
15774+ spin_lock(&d_lock);
15775+ /* find any cursor for this oid: a reference to it is hanging off
15776+ * the radix tree */
15777+ start = lookup(info, (unsigned long)oid);
15778+ if (start != NULL) {
15779+ dir_cursor *scan;
15780+ reiser4_file_fsdata *fsdata;
15781+
15782+ /* process circular list of cursors for this oid */
15783+ scan = start;
15784+ do {
15785+ dir_cursor *next;
15786+
15787+ next = list_entry(scan->list.next, dir_cursor, list);
15788+ fsdata = scan->fsdata;
15789+ assert("nikita-3557", fsdata != NULL);
15790+ if (scan->key.oid == oid) {
15791+ switch (act) {
15792+ case CURSOR_DISPOSE:
15793+ list_del_init(&fsdata->dir.linkage);
15794+ break;
15795+ case CURSOR_LOAD:
15796+ list_add(&fsdata->dir.linkage, head);
15797+ break;
15798+ case CURSOR_KILL:
15799+ kill_cursor(scan);
15800+ break;
15801+ }
15802+ }
15803+ if (scan == next)
15804+ /* last cursor was just killed */
15805+ break;
15806+ scan = next;
15807+ } while (scan != start);
15808+ }
15809+ spin_unlock(&d_lock);
15810+ /* check that we killed 'em all */
15811+ assert("nikita-3568",
15812+ ergo(act == CURSOR_KILL,
15813+ list_empty_careful(get_readdir_list(inode))));
15814+ assert("nikita-3569",
15815+ ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
15816+ spin_unlock_inode(inode);
15817+ reiser4_exit_context(ctx);
15818+}
15819+
15820+/**
15821+ * reiser4_dispose_cursors - removes cursors from inode's list
15822+ * @inode: inode to dispose cursors of
15823+ *
15824+ * For each cursor corresponding to @inode, removes the reiser4_file_fsdata
15825+ * attached to the cursor from the inode's readdir list. This is called when
15826+ * the inode is removed from memory by memory pressure.
15827+ */
15828+void reiser4_dispose_cursors(struct inode *inode)
15829+{
15830+ process_cursors(inode, CURSOR_DISPOSE);
15831+}
15832+
15833+/**
15834+ * reiser4_load_cursors - attach cursors to inode
15835+ * @inode: inode to load cursors to
15836+ *
15837+ * For each cursor corresponding to @inode, attaches the reiser4_file_fsdata
15838+ * attached to the cursor to the inode's readdir list. This is done when the
15839+ * inode is loaded into memory.
15840+ */
15841+void reiser4_load_cursors(struct inode *inode)
15842+{
15843+ process_cursors(inode, CURSOR_LOAD);
15844+}
15845+
15846+/**
15847+ * reiser4_kill_cursors - kill all inode cursors
15848+ * @inode: inode to kill cursors of
15849+ *
15850+ * Frees all cursors for this inode. This is called when inode is destroyed.
15851+ */
15852+void reiser4_kill_cursors(struct inode *inode)
15853+{
15854+ process_cursors(inode, CURSOR_KILL);
15855+}
15856+
15857+/**
15858+ * file_is_stateless -
15859+ * @file:
15860+ *
15861+ * Returns true if file descriptor @file was created by the NFS server on
15862+ * demand to serve one file system operation. This means that there may be
15863+ * "detached state" for the underlying inode.
15864+ */
15865+static int file_is_stateless(struct file *file)
15866+{
15867+ return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
15868+}
15869+
15870+/**
15871+ * reiser4_get_dir_fpos -
15872+ * @dir:
15873+ *
15874+ * Calculates ->fpos from the user-supplied cookie. Normally it is dir->f_pos,
15875+ * but in the case of a stateless directory operation (readdir-over-nfs), the
15876+ * client id was encoded in the high bits of the cookie and must be masked off.
15877+ */
15878+loff_t reiser4_get_dir_fpos(struct file *dir)
15879+{
15880+ if (file_is_stateless(dir))
15881+ return dir->f_pos & CID_MASK;
15882+ else
15883+ return dir->f_pos;
15884+}
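
A worked example of the cookie layout (values hypothetical): with
CID_SHIFT == 20, a cursor whose key.cid is 3 makes readdir start at
f_pos 3 << 20 == 0x300000. If a later stateless cookie is pos == 0x300007,
then:

    pos & CID_MASK;    /* == 0x7, the position within the directory */
    pos >> CID_SHIFT;  /* == 0x3, the client id */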
15885+
15886+/**
15887+ * reiser4_attach_fsdata - try to attach fsdata
15888+ * @file:
15889+ * @inode:
15890+ *
15891+ * Finds or creates cursor for readdir-over-nfs.
15892+ */
15893+int reiser4_attach_fsdata(struct file *file, struct inode *inode)
15894+{
15895+ loff_t pos;
15896+ int result;
15897+ dir_cursor *cursor;
15898+
15899+ /*
15900+ * we are serialized by inode->i_mutex
15901+ */
15902+ if (!file_is_stateless(file))
15903+ return 0;
15904+
15905+ pos = file->f_pos;
15906+ result = 0;
15907+ if (pos == 0) {
15908+ /*
15909+ * first call to readdir (or rewind to the beginning of
15910+ * directory)
15911+ */
15912+ cursor = kmem_cache_alloc(d_cursor_cache,
15913+ reiser4_ctx_gfp_mask_get());
15914+ if (cursor != NULL)
15915+ result = insert_cursor(cursor, file, inode);
15916+ else
15917+ result = RETERR(-ENOMEM);
15918+ } else {
15919+ /* try to find existing cursor */
15920+ struct d_cursor_key key;
15921+
15922+ key.cid = pos >> CID_SHIFT;
15923+ key.oid = get_inode_oid(inode);
15924+ spin_lock(&d_lock);
15925+ cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
15926+ if (cursor != NULL) {
15927+ /* cursor was found */
15928+ if (cursor->ref == 0) {
15929+ /* move it from unused list */
15930+ list_del_init(&cursor->alist);
15931+ --d_cursor_unused;
15932+ }
15933+ ++cursor->ref;
15934+ }
15935+ spin_unlock(&d_lock);
15936+ if (cursor != NULL) {
15937+ spin_lock_inode(inode);
15938+ assert("nikita-3556", cursor->fsdata->back == NULL);
15939+ clean_fsdata(file);
15940+ free_file_fsdata_nolock(file);
15941+ file->private_data = cursor->fsdata;
15942+ spin_unlock_inode(inode);
15943+ }
15944+ }
15945+ return result;
15946+}
15947+
15948+/**
15949+ * reiser4_detach_fsdata - detach fsdata from struct file
15950+ * @file:
15951+ *
15952+ * Detaches fsdata from @file, if necessary.
15953+ */
15954+void reiser4_detach_fsdata(struct file *file)
15955+{
15956+ struct inode *inode;
15957+
15958+ if (!file_is_stateless(file))
15959+ return;
15960+
15961+ inode = file->f_dentry->d_inode;
15962+ spin_lock_inode(inode);
15963+ clean_fsdata(file);
15964+ spin_unlock_inode(inode);
15965+}
15966+
15967+/* slab for reiser4_dentry_fsdata */
15968+static struct kmem_cache *dentry_fsdata_cache;
15969+
15970+/**
15971+ * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
15972+ *
15973+ * Initializes slab cache of structures attached to dentry->d_fsdata. It is
15974+ * part of reiser4 module initialization.
15975+ */
15976+int reiser4_init_dentry_fsdata(void)
15977+{
15978+ dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
15979+ sizeof(struct reiser4_dentry_fsdata),
15980+ 0,
15981+ SLAB_HWCACHE_ALIGN |
15982+ SLAB_RECLAIM_ACCOUNT,
15983+ NULL);
15984+ if (dentry_fsdata_cache == NULL)
15985+ return RETERR(-ENOMEM);
15986+ return 0;
15987+}
15988+
15989+/**
15990+ * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
15991+ *
15992+ * This is called on reiser4 module unloading or system shutdown.
15993+ */
15994+void reiser4_done_dentry_fsdata(void)
15995+{
15996+ destroy_reiser4_cache(&dentry_fsdata_cache);
15997+}
15998+
15999+/**
16000+ * reiser4_get_dentry_fsdata - get fs-specific dentry data
16001+ * @dentry: queried dentry
16002+ *
16003+ * Allocates if necessary and returns per-dentry data that we attach to each
16004+ * dentry.
16005+ */
16006+struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
16007+{
16008+ assert("nikita-1365", dentry != NULL);
16009+
16010+ if (dentry->d_fsdata == NULL) {
16011+ dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
16012+ reiser4_ctx_gfp_mask_get());
16013+ if (dentry->d_fsdata == NULL)
16014+ return ERR_PTR(RETERR(-ENOMEM));
16015+ memset(dentry->d_fsdata, 0,
16016+ sizeof(struct reiser4_dentry_fsdata));
16017+ }
16018+ return dentry->d_fsdata;
16019+}
16020+
16021+/**
16022+ * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
16023+ * @dentry: dentry to free fsdata of
16024+ *
16025+ * Detaches and frees fs-specific dentry data
16026+ */
16027+void reiser4_free_dentry_fsdata(struct dentry *dentry)
16028+{
16029+ if (dentry->d_fsdata != NULL) {
16030+ kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
16031+ dentry->d_fsdata = NULL;
16032+ }
16033+}
16034+
16035+/* slab for reiser4_file_fsdata */
16036+static struct kmem_cache *file_fsdata_cache;
16037+
16038+/**
16039+ * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
16040+ *
16041+ * Initializes slab cache of structures attached to file->private_data. It is
16042+ * part of reiser4 module initialization.
16043+ */
16044+int reiser4_init_file_fsdata(void)
16045+{
16046+ file_fsdata_cache = kmem_cache_create("file_fsdata",
16047+ sizeof(reiser4_file_fsdata),
16048+ 0,
16049+ SLAB_HWCACHE_ALIGN |
16050+ SLAB_RECLAIM_ACCOUNT, NULL);
16051+ if (file_fsdata_cache == NULL)
16052+ return RETERR(-ENOMEM);
16053+ return 0;
16054+}
16055+
16056+/**
16057+ * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
16058+ *
16059+ * This is called on reiser4 module unloading or system shutdown.
16060+ */
16061+void reiser4_done_file_fsdata(void)
16062+{
16063+ destroy_reiser4_cache(&file_fsdata_cache);
16064+}
16065+
16066+/**
16067+ * create_fsdata - allocate and initialize reiser4_file_fsdata
16068+ * @file: what to create file_fsdata for, may be NULL
16069+ *
16070+ * Allocates and initializes reiser4_file_fsdata structure.
16071+ */
16072+static reiser4_file_fsdata *create_fsdata(struct file *file)
16073+{
16074+ reiser4_file_fsdata *fsdata;
16075+
16076+ fsdata = kmem_cache_alloc(file_fsdata_cache,
16077+ reiser4_ctx_gfp_mask_get());
16078+ if (fsdata != NULL) {
16079+ memset(fsdata, 0, sizeof *fsdata);
16080+ fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16081+ fsdata->back = file;
16082+ INIT_LIST_HEAD(&fsdata->dir.linkage);
16083+ }
16084+ return fsdata;
16085+}
16086+
16087+/**
16088+ * free_fsdata - free reiser4_file_fsdata
16089+ * @fsdata: object to free
16090+ *
16091+ * Dual to create_fsdata(). Free reiser4_file_fsdata.
16092+ */
16093+static void free_fsdata(reiser4_file_fsdata *fsdata)
16094+{
16095+ BUG_ON(fsdata == NULL);
16096+ kmem_cache_free(file_fsdata_cache, fsdata);
16097+}
16098+
16099+/**
16100+ * reiser4_get_file_fsdata - get fs-specific file data
16101+ * @file: queried file
16102+ *
16103+ * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
16104+ * to @file.
16105+ */
16106+reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16107+{
16108+ assert("nikita-1603", file != NULL);
16109+
16110+ if (file->private_data == NULL) {
16111+ reiser4_file_fsdata *fsdata;
16112+ struct inode *inode;
16113+
16114+ fsdata = create_fsdata(file);
16115+ if (fsdata == NULL)
16116+ return ERR_PTR(RETERR(-ENOMEM));
16117+
16118+ inode = file->f_dentry->d_inode;
16119+ spin_lock_inode(inode);
16120+ if (file->private_data == NULL) {
16121+ file->private_data = fsdata;
16122+ fsdata = NULL;
16123+ }
16124+ spin_unlock_inode(inode);
16125+ if (fsdata != NULL)
16126+ /* other thread initialized ->fsdata */
16127+ kmem_cache_free(file_fsdata_cache, fsdata);
16128+ }
16129+ assert("nikita-2665", file->private_data != NULL);
16130+ return file->private_data;
16131+}
16132+
16133+/**
16134+ * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
16135+ * @file:
16136+ *
16137+ * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
16138+ * readdir list, and frees it if it is not linked to a d_cursor object.
16139+ */
16140+static void free_file_fsdata_nolock(struct file *file)
16141+{
16142+ reiser4_file_fsdata *fsdata;
16143+
16144+ assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16145+ fsdata = file->private_data;
16146+ if (fsdata != NULL) {
16147+ list_del_init(&fsdata->dir.linkage);
16148+ if (fsdata->cursor == NULL)
16149+ free_fsdata(fsdata);
16150+ }
16151+ file->private_data = NULL;
16152+}
16153+
16154+/**
16155+ * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
16156+ * @file:
16157+ *
16158+ * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
16159+ */
16160+void reiser4_free_file_fsdata(struct file *file)
16161+{
16162+ spin_lock_inode(file->f_dentry->d_inode);
16163+ free_file_fsdata_nolock(file);
16164+ spin_unlock_inode(file->f_dentry->d_inode);
16165+}
16166+
16167+/*
16168+ * Local variables:
16169+ * c-indentation-style: "K&R"
16170+ * mode-name: "LC"
16171+ * c-basic-offset: 8
16172+ * tab-width: 8
16173+ * fill-column: 79
16174+ * End:
16175+ */
16176diff -urN linux-2.6.23.orig/fs/reiser4/fsdata.h linux-2.6.23/fs/reiser4/fsdata.h
16177--- linux-2.6.23.orig/fs/reiser4/fsdata.h 1970-01-01 03:00:00.000000000 +0300
16178+++ linux-2.6.23/fs/reiser4/fsdata.h 2007-12-04 16:49:30.000000000 +0300
16179@@ -0,0 +1,205 @@
16180+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16181+ * reiser4/README */
16182+
16183+#if !defined( __REISER4_FSDATA_H__ )
16184+#define __REISER4_FSDATA_H__
16185+
16186+#include "debug.h"
16187+#include "kassign.h"
16188+#include "seal.h"
16189+#include "type_safe_hash.h"
16190+#include "plugin/file/file.h"
16191+#include "readahead.h"
16192+
16193+/*
16194+ * comment about reiser4_dentry_fsdata
16195+ *
16196+ *
16197+ */
16198+
16199+/*
16200+ * locking: the per-file-descriptor readdir_pos fields and ->f_pos are
16201+ * protected by ->i_mutex on the inode. Under this lock the following
16202+ * invariant holds:
16203+ *
16204+ * the file descriptor is "looking" at the entry_no-th directory entry from
16205+ * the beginning of the directory. This entry has key dir_entry_key and is
16206+ * the pos-th entry within its duplicate-key sequence.
16207+ *
16208+ */
16209+
16210+/* logical position within directory */
16211+struct dir_pos {
16212+ /* key of directory entry (actually, part of a key sufficient to
16213+ identify directory entry) */
16214+ de_id dir_entry_key;
16215+ /* ordinal number of directory entry among all entries with the same
16216+ key. (Starting from 0.) */
16217+ unsigned pos;
16218+};
16219+
16220+struct readdir_pos {
16221+ /* f_pos corresponding to this readdir position */
16222+ __u64 fpos;
16223+ /* logical position within directory */
16224+ struct dir_pos position;
16225+ /* logical number of directory entry within
16226+ directory */
16227+ __u64 entry_no;
16228+};
16229+
16230+/*
16231+ * this is used to speed up lookups for directory entry: on initial call to
16232+ * ->lookup() seal and coord of directory entry (if found, that is) are stored
16233+ * in struct dentry and reused later to avoid tree traversals.
16234+ */
16235+struct de_location {
16236+ /* seal covering directory entry */
16237+ seal_t entry_seal;
16238+ /* coord of directory entry */
16239+ coord_t entry_coord;
16240+ /* ordinal number of directory entry among all entries with the same
16241+ key. (Starting from 0.) */
16242+ int pos;
16243+};
16244+
16245+/**
16246+ * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16247+ *
16248+ * This is allocated dynamically and released in d_op->d_release()
16249+ *
16250+ * Currently it only contains cached location (hint) of directory entry, but
16251+ * it is expected that other information will be accumulated here.
16252+ */
16253+struct reiser4_dentry_fsdata {
16254+ /*
16255+ * here will go fields filled by ->lookup() to speed up the next
16256+ * create/unlink, like blocknr of znode with stat-data, or key of
16257+ * stat-data.
16258+ */
16259+ struct de_location dec;
16260+ int stateless; /* created through reiser4_decode_fh, needs special
16261+ * treatment in readdir. */
16262+};
16263+
16264+extern int reiser4_init_dentry_fsdata(void);
16265+extern void reiser4_done_dentry_fsdata(void);
16266+extern struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16267+extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16268+
16269+/**
16270+ * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16271+ *
16272+ * This is allocated dynamically and released in inode->i_fop->release
16273+ */
16274+typedef struct reiser4_file_fsdata {
16275+ /*
16276+ * pointer back to the struct file which this reiser4_file_fsdata is
16277+ * part of
16278+ */
16279+ struct file *back;
16280+ /* detached cursor for stateless readdir. */
16281+ struct dir_cursor *cursor;
16282+ /*
16283+ * We need both directory and regular file parts here, because there
16284+ * are file system objects that are both files and directories.
16285+ */
16286+ struct {
16287+ /*
16288+ * position in directory. It is updated each time directory is
16289+ * modified
16290+ */
16291+ struct readdir_pos readdir;
16292+ /* head of this list is reiser4_inode->lists.readdir_list */
16293+ struct list_head linkage;
16294+ } dir;
16295+ /* hints to speed up operations with regular files: read and write. */
16296+ struct {
16297+ hint_t hint;
16298+ } reg;
16299+ struct reiser4_file_ra_state ra1;
16300+
16301+} reiser4_file_fsdata;
16302+
16303+extern int reiser4_init_file_fsdata(void);
16304+extern void reiser4_done_file_fsdata(void);
16305+extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16306+extern void reiser4_free_file_fsdata(struct file *);
16307+
16308+/*
16309+ * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
16310+ * used to address the problem reiser4 has with readdir accesses via NFS. See
16311+ * plugin/file_ops_readdir.c for more details.
16312+ */
16313+struct d_cursor_key{
16314+ __u16 cid;
16315+ __u64 oid;
16316+};
16317+
16318+/*
16319+ * define structures d_cursor_hash_table and d_cursor_hash_link which are used
16320+ * to maintain the hash table of dir_cursor-s in reiser4's super block
16321+ */
16322+typedef struct dir_cursor dir_cursor;
16323+TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16324+
16325+struct dir_cursor {
16326+ int ref;
16327+ reiser4_file_fsdata *fsdata;
16328+
16329+ /* link to reiser4 super block hash table of cursors */
16330+ d_cursor_hash_link hash;
16331+
16332+ /*
16333+ * this is to link cursors to reiser4 super block's radix tree of
16334+ * cursors if there is more than one cursor with the same objectid
16335+ */
16336+ struct list_head list;
16337+ struct d_cursor_key key;
16338+ struct d_cursor_info *info;
16339+ /* list of unused cursors */
16340+ struct list_head alist;
16341+};
16342+
16343+extern int reiser4_init_d_cursor(void);
16344+extern void reiser4_done_d_cursor(void);
16345+
16346+extern int reiser4_init_super_d_info(struct super_block *);
16347+extern void reiser4_done_super_d_info(struct super_block *);
16348+
16349+extern loff_t reiser4_get_dir_fpos(struct file *);
16350+extern int reiser4_attach_fsdata(struct file *, struct inode *);
16351+extern void reiser4_detach_fsdata(struct file *);
16352+
16353+/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16354+ more details */
16355+void reiser4_dispose_cursors(struct inode *inode);
16356+void reiser4_load_cursors(struct inode *inode);
16357+void reiser4_kill_cursors(struct inode *inode);
16358+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
16359+ int offset, int adj);
16360+
16361+/*
16362+ * this structure is embedded into reiser4_super_info_data. It maintains d_cursors
16363+ * (detached readdir state). See plugin/file_ops_readdir.c for more details.
16364+ */
16365+struct d_cursor_info {
16366+ d_cursor_hash_table table;
16367+ struct radix_tree_root tree;
16368+};
16369+
16370+/* spinlock protecting readdir cursors */
16371+extern spinlock_t d_lock;
16372+
16373+/* __REISER4_FSDATA_H__ */
16374+#endif
16375+
16376+/*
16377+ * Local variables:
16378+ * c-indentation-style: "K&R"
16379+ * mode-name: "LC"
16380+ * c-basic-offset: 8
16381+ * tab-width: 8
16382+ * fill-column: 120
16383+ * End:
16384+ */
16385diff -urN linux-2.6.23.orig/fs/reiser4/init_super.c linux-2.6.23/fs/reiser4/init_super.c
16386--- linux-2.6.23.orig/fs/reiser4/init_super.c 1970-01-01 03:00:00.000000000 +0300
16387+++ linux-2.6.23/fs/reiser4/init_super.c 2007-12-04 16:49:30.000000000 +0300
16388@@ -0,0 +1,751 @@
16389+/* Copyright by Hans Reiser, 2003 */
16390+
16391+#include "super.h"
16392+#include "inode.h"
16393+#include "plugin/plugin_set.h"
16394+
16395+#include <linux/swap.h>
16396+
16397+/**
16398+ * init_fs_info - allocate reiser4 specific super block
16399+ * @super: super block of filesystem
16400+ *
16401+ * Allocates and initializes reiser4_super_info_data, attaches it to
16402+ * super->s_fs_info, initializes structures maintaining d_cursor-s.
16403+ */
16404+int reiser4_init_fs_info(struct super_block *super)
16405+{
16406+ reiser4_super_info_data *sbinfo;
16407+
16408+ sbinfo = kzalloc(sizeof(reiser4_super_info_data),
16409+ reiser4_ctx_gfp_mask_get());
16410+ if (!sbinfo)
16411+ return RETERR(-ENOMEM);
16412+
16413+ super->s_fs_info = sbinfo;
16414+ super->s_op = NULL;
16415+
16416+ ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16417+ ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16418+
16419+ mutex_init(&sbinfo->delete_mutex);
16420+ spin_lock_init(&(sbinfo->guard));
16421+
16422+ /* initialize per-super-block d_cursor resources */
16423+ reiser4_init_super_d_info(super);
16424+
16425+ return 0;
16426+}
16427+
16428+/**
16429+ * reiser4_done_fs_info - free reiser4 specific super block
16430+ * @super: super block of filesystem
16431+ *
16432+ * Performs some sanity checks, releases structures maintaining d_cursor-s,
16433+ * frees reiser4_super_info_data.
16434+ */
16435+void reiser4_done_fs_info(struct super_block *super)
16436+{
16437+ assert("zam-990", super->s_fs_info != NULL);
16438+
16439+ /* release per-super-block d_cursor resources */
16440+ reiser4_done_super_d_info(super);
16441+
16442+ /* make sure that no jnodes remain */
16443+ assert("", list_empty(&get_super_private(super)->all_jnodes));
16444+ assert("", get_current_context()->trans->atom == NULL);
16445+ reiser4_check_block_counters(super);
16446+ kfree(super->s_fs_info);
16447+ super->s_fs_info = NULL;
16448+}
16449+
16450+/* type of option parseable by parse_option() */
16451+typedef enum {
16452+ /* value of option is arbitrary string */
16453+ OPT_STRING,
16454+
16455+ /*
16456+ * option specifies a bit in a bitmask. When the option is given, the bit
16457+ * in sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
16458+ * dont_load_bitmap, atomic_write.
16459+ */
16460+ OPT_BIT,
16461+
16462+ /*
16463+ * value of option should conform to an sscanf() format. Examples are
16464+ * tmgr.atom_max_size=N, tmgr.atom_max_age=N
16465+ */
16466+ OPT_FORMAT,
16467+
16468+ /*
16469+ * option can take one of predefined values. Example is onerror=panic or
16470+ * onerror=remount-ro
16471+ */
16472+ OPT_ONEOF,
16473+} opt_type_t;
16474+
16475+#if 0
16476+struct opt_bitmask_bit {
16477+ const char *bit_name;
16478+ int bit_nr;
16479+};
16480+#endif
16481+
16482+/* description of option parseable by parse_option() */
16483+struct opt_desc {
16484+ /* option name.
16485+
16486+ parsed portion of string has a form "name=value".
16487+ */
16488+ const char *name;
16489+ /* type of option */
16490+ opt_type_t type;
16491+ union {
16492+ /* where to store value of string option (type == OPT_STRING) */
16493+ char **string;
16494+ /* description of bits for bit option (type == OPT_BIT) */
16495+ struct {
16496+ int nr;
16497+ void *addr;
16498+ } bit;
16499+ /* description of format and targets for format option (type
16500+ == OPT_FORMAT) */
16501+ struct {
16502+ const char *format;
16503+ int nr_args;
16504+ void *arg1;
16505+ void *arg2;
16506+ void *arg3;
16507+ void *arg4;
16508+ } f;
16509+ struct {
16510+ int *result;
16511+ const char *list[10];
16512+ } oneof;
16513+ struct {
16514+ void *addr;
16515+ int nr_bits;
16516+ //struct opt_bitmask_bit *bits;
16517+ } bitmask;
16518+ } u;
16519+};
16520+
16521+/**
16522+ * parse_option - parse one option
16523+ * @opt_string: starting point of parsing
16524+ * @opt: option description
16525+ *
16526+ * foo=bar,
16527+ * ^ ^ ^
16528+ * | | +-- replaced to '\0'
16529+ * | +-- val_start
16530+ * +-- opt_string
16531+ * Figures out the option type and handles the option accordingly.
16532+ */
16533+static int parse_option(char *opt_string, struct opt_desc *opt)
16534+{
16535+ char *val_start;
16536+ int result;
16537+ const char *err_msg;
16538+
16539+ /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
16540+
16541+ val_start = strchr(opt_string, '=');
16542+ if (val_start != NULL) {
16543+ *val_start = '\0';
16544+ ++val_start;
16545+ }
16546+
16547+ err_msg = NULL;
16548+ result = 0;
16549+ switch (opt->type) {
16550+ case OPT_STRING:
16551+ if (val_start == NULL) {
16552+ err_msg = "String arg missing";
16553+ result = RETERR(-EINVAL);
16554+ } else
16555+ *opt->u.string = val_start;
16556+ break;
16557+ case OPT_BIT:
16558+ if (val_start != NULL)
16559+ err_msg = "Value ignored";
16560+ else
16561+ set_bit(opt->u.bit.nr, opt->u.bit.addr);
16562+ break;
16563+ case OPT_FORMAT:
16564+ if (val_start == NULL) {
16565+ err_msg = "Formatted arg missing";
16566+ result = RETERR(-EINVAL);
16567+ break;
16568+ }
16569+ if (sscanf(val_start, opt->u.f.format,
16570+ opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
16571+ opt->u.f.arg4) != opt->u.f.nr_args) {
16572+ err_msg = "Wrong conversion";
16573+ result = RETERR(-EINVAL);
16574+ }
16575+ break;
16576+ case OPT_ONEOF:
16577+ {
16578+ int i = 0;
16579+
16580+ if (val_start == NULL) {
16581+ err_msg = "Value is missing";
16582+ result = RETERR(-EINVAL);
16583+ break;
16584+ }
16585+ err_msg = "Wrong option value";
16586+ result = RETERR(-EINVAL);
16587+ while (opt->u.oneof.list[i]) {
16588+ if (!strcmp(opt->u.oneof.list[i], val_start)) {
16589+ result = 0;
16590+ err_msg = NULL;
16591+ *opt->u.oneof.result = i;
16592+ break;
16593+ }
16594+ i++;
16595+ }
16596+ break;
16597+ }
16598+ default:
16599+ wrong_return_value("nikita-2100", "opt -> type");
16600+ break;
16601+ }
16602+ if (err_msg != NULL) {
16603+ warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
16604+ err_msg, opt->name, val_start ? "=" : "",
16605+ val_start ? : "");
16606+ }
16607+ return result;
16608+}
16609+
16610+/**
16611+ * parse_options - parse reiser4 mount options
16612+ * @opt_string: starting point
16613+ * @opts: array of option description
16614+ * @nr_opts: number of elements in @opts
16615+ *
16616+ * Parses comma separated list of reiser4 mount options.
16617+ */
16618+static int parse_options(char *opt_string, struct opt_desc *opts, int nr_opts)
16619+{
16620+ int result;
16621+
16622+ result = 0;
16623+ while ((result == 0) && opt_string && *opt_string) {
16624+ int j;
16625+ char *next;
16626+
16627+ next = strchr(opt_string, ',');
16628+ if (next != NULL) {
16629+ *next = '\0';
16630+ ++next;
16631+ }
16632+ for (j = 0; j < nr_opts; ++j) {
16633+ if (!strncmp(opt_string, opts[j].name,
16634+ strlen(opts[j].name))) {
16635+ result = parse_option(opt_string, &opts[j]);
16636+ break;
16637+ }
16638+ }
16639+ if (j == nr_opts) {
16640+ warning("nikita-2307", "Unrecognized option: \"%s\"",
16641+ opt_string);
16642+ /* traditionally, -EINVAL is returned on wrong mount
16643+ option */
16644+ result = RETERR(-EINVAL);
16645+ }
16646+ opt_string = next;
16647+ }
16648+ return result;
16649+}
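/*
 * Editor's sketch (not part of the patch): parse_option()/parse_options()
 * above tokenize "name=value,name2,name3=value3" destructively, overwriting
 * ',' and '=' with NUL bytes. A self-contained userspace equivalent of the
 * same technique (names illustrative):
 */
#include <stdio.h>
#include <string.h>

static void parse(char *s)
{
        while (s && *s) {
                char *next = strchr(s, ',');
                char *val;

                if (next)
                        *next++ = '\0';         /* terminate current option */
                val = strchr(s, '=');
                if (val)
                        *val++ = '\0';          /* split name from value */
                printf("name=%s value=%s\n", s, val ? val : "(none)");
                s = next;
        }
}

int main(void)
{
        /* input must be writable, exactly as in the kernel code above */
        char opts[] = "tmgr.atom_max_age=600,dont_load_bitmap,onerror=panic";

        parse(opts);
        return 0;
}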
16650+
16651+#define NUM_OPT( label, fmt, addr ) \
16652+ { \
16653+ .name = ( label ), \
16654+ .type = OPT_FORMAT, \
16655+ .u = { \
16656+ .f = { \
16657+ .format = ( fmt ), \
16658+ .nr_args = 1, \
16659+ .arg1 = ( addr ), \
16660+ .arg2 = NULL, \
16661+ .arg3 = NULL, \
16662+ .arg4 = NULL \
16663+ } \
16664+ } \
16665+ }
16666+
16667+#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
16668+
16669+#define BIT_OPT(label, bitnr) \
16670+ { \
16671+ .name = label, \
16672+ .type = OPT_BIT, \
16673+ .u = { \
16674+ .bit = { \
16675+ .nr = bitnr, \
16676+ .addr = &sbinfo->fs_flags \
16677+ } \
16678+ } \
16679+ }
16680+
16681+#define MAX_NR_OPTIONS (30)
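/*
 * Editor's sketch (not part of the patch): SB_FIELD_OPT above relies on the
 * preprocessor's stringification operator -- #field turns a member name into
 * its option name while &sbinfo->field supplies the sscanf() target, so one
 * macro line registers a numeric option. The same trick in miniature (all
 * names illustrative):
 */
#include <stdio.h>
#include <string.h>

struct settings { unsigned int max_age; unsigned int max_size; };

struct num_opt { const char *name; unsigned int *target; };

#define FIELD_OPT(s, f) { .name = #f, .target = &(s)->f }

int main(void)
{
        struct settings cfg = { 0, 0 };
        struct num_opt opts[] = {
                FIELD_OPT(&cfg, max_age),  /* { "max_age",  &cfg.max_age }  */
                FIELD_OPT(&cfg, max_size), /* { "max_size", &cfg.max_size } */
        };
        const char *name = "max_age", *val = "600";
        unsigned int i;

        for (i = 0; i < sizeof(opts) / sizeof(opts[0]); i++)
                if (strcmp(opts[i].name, name) == 0)
                        sscanf(val, "%u", opts[i].target);
        printf("max_age=%u max_size=%u\n", cfg.max_age, cfg.max_size);
        return 0;
}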
16682+
16683+/**
16684+ * reiser4_init_super_data - initialize reiser4 private super block
16685+ * @super: super block to initialize
16686+ * @opt_string: list of reiser4 mount options
16687+ *
16688+ * Sets various reiser4 parameters to default values. Parses mount options and
16689+ * overwrites default settings.
16690+ */
16691+int reiser4_init_super_data(struct super_block *super, char *opt_string)
16692+{
16693+ int result;
16694+ struct opt_desc *opts, *p;
16695+ reiser4_super_info_data *sbinfo = get_super_private(super);
16696+
16697+ /* initialize super, export, dentry operations */
16698+ sbinfo->ops.super = reiser4_super_operations;
16699+ sbinfo->ops.export = reiser4_export_operations;
16700+ sbinfo->ops.dentry = reiser4_dentry_operations;
16701+ super->s_op = &sbinfo->ops.super;
16702+ super->s_export_op = &sbinfo->ops.export;
16703+
16704+ /* initialize transaction manager parameters to default values */
16705+ sbinfo->tmgr.atom_max_size = totalram_pages / 4;
16706+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
16707+ sbinfo->tmgr.atom_min_size = 256;
16708+ sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
16709+
16710+ /* initialize cbk cache parameter */
16711+ sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
16712+
16713+ /* initialize flush parameters */
16714+ sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
16715+ sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
16716+ sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
16717+ sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
16718+
16719+ sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
16720+
16721+ /* preliminary tree initializations */
16722+ sbinfo->tree.super = super;
16723+ sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
16724+ sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
16725+ sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
16726+ sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
16727+ rwlock_init(&(sbinfo->tree.tree_lock));
16728+ spin_lock_init(&(sbinfo->tree.epoch_lock));
16729+
16730+ /* initialize default readahead params */
16731+ sbinfo->ra_params.max = num_physpages / 4;
16732+ sbinfo->ra_params.flags = 0;
16733+
16734+ /* allocate memory for structure describing reiser4 mount options */
16735+ opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS,
16736+ reiser4_ctx_gfp_mask_get());
16737+ if (opts == NULL)
16738+ return RETERR(-ENOMEM);
16739+
16740+ /* initialize structure describing reiser4 mount options */
16741+ p = opts;
16742+
16743+#if REISER4_DEBUG
16744+# define OPT_ARRAY_CHECK if ((p) >= (opts) + MAX_NR_OPTIONS) { \
16745+ warning ("zam-1046", "opt array is overloaded"); break; \
16746+ }
16747+#else
16748+# define OPT_ARRAY_CHECK noop
16749+#endif
16750+
16751+#define PUSH_OPT(...) \
16752+do { \
16753+ struct opt_desc o = __VA_ARGS__; \
16754+ OPT_ARRAY_CHECK; \
16755+ *p ++ = o; \
16756+} while (0)
16757+
16758+#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
16759+#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
16760+
16761+ /*
16762+ * tmgr.atom_max_size=N
16763+ * Atoms containing more than N blocks will be forced to commit. N is
16764+ * decimal.
16765+ */
16766+ PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
16767+ /*
16768+ * tmgr.atom_max_age=N
16769+ * Atoms older than N seconds will be forced to commit. N is decimal.
16770+ */
16771+ PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
16772+ /*
16773+ * tmgr.atom_min_size=N
16774+ * When committing an atom to free dirty pages, force an atom smaller
16775+ * than N blocks to fuse with another one.
16776+ */
16777+ PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
16778+ /*
16779+ * tmgr.atom_max_flushers=N
16780+ * limit of concurrent flushers for one atom. 0 means no limit.
16781+ */
16782+ PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
16783+ /*
16784+ * tree.cbk_cache_slots=N
16785+ * Number of slots in the cbk cache.
16786+ */
16787+ PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
16788+ /*
16789+ * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
16790+ * leaf-level blocks it will force them to be relocated.
16791+ */
16792+ PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
16793+ /*
16794+ * If flush can find a block allocation closer than at most
16795+ * FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that
16796+ * position.
16797+ */
16798+ PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
16799+ /*
16800+ * If we have written this many blocks or more before encountering a busy
16801+ * jnode in the flush list, abort flushing in the hope that by the next
16802+ * time we get called this jnode will already be clean, and we will save
16803+ * some seeks.
16804+ */
16805+ PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
16806+ /* The maximum number of nodes to scan left on a level during flush. */
16807+ PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
16808+ /* preferred IO size */
16809+ PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
16810+ /* carry flags used for insertion of new nodes */
16811+ PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
16812+ /* carry flags used for insertion of new extents */
16813+ PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
16814+ /* carry flags used for paste operations */
16815+ PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
16816+ /* carry flags used for insert operations */
16817+ PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
16818+
16819+#ifdef CONFIG_REISER4_BADBLOCKS
16820+ /*
16821+ * Alternative master superblock location, in case its original
16822+ * location is not writeable/accessible. This is an offset in BYTES.
16823+ */
16824+ PUSH_SB_FIELD_OPT(altsuper, "%lu");
16825+#endif
16826+
16827+ /* turn on BSD-style gid assignment */
16828+ PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
16829+ /* turn on 32 bit times */
16830+ PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
16831+ /*
16832+ * Don't load all bitmap blocks at mount time, it is useful for
16833+ * machines with tiny RAM and large disks.
16834+ */
16835+ PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
16836+ /* disable transaction commits during write() */
16837+ PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
16838+ /* disable use of write barriers in the reiser4 log writer. */
16839+ PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
16840+
16841+ PUSH_OPT(
16842+ {
16843+ /*
16844+ * tree traversal readahead parameters:
16845+ * -o readahead:MAXNUM:FLAGS
16846+ * MAXNUM - max number of nodes to request readahead for: -1UL
16847+ * will set it to max_sane_readahead()
16848+ * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
16849+ * CONTINUE_ON_PRESENT
16850+ */
16851+ .name = "readahead",
16852+ .type = OPT_FORMAT,
16853+ .u = {
16854+ .f = {
16855+ .format = "%u:%u",
16856+ .nr_args = 2,
16857+ .arg1 = &sbinfo->ra_params.max,
16858+ .arg2 = &sbinfo->ra_params.flags,
16859+ .arg3 = NULL,
16860+ .arg4 = NULL
16861+ }
16862+ }
16863+ }
16864+ );
16865+
16866+ /* What to do in case of fs error */
16867+ PUSH_OPT(
16868+ {
16869+ .name = "onerror",
16870+ .type = OPT_ONEOF,
16871+ .u = {
16872+ .oneof = {
16873+ .result = &sbinfo->onerror,
16874+ .list = {
16875+ "panic", "remount-ro", NULL
16876+ },
16877+ }
16878+ }
16879+ }
16880+ );
16881+
16882+ /* modify default settings to values set by mount options */
16883+ result = parse_options(opt_string, opts, p - opts);
16884+ kfree(opts);
16885+ if (result != 0)
16886+ return result;
16887+
16888+ /* correct settings to sane values */
16889+ sbinfo->tmgr.atom_max_age *= HZ;
16890+ if (sbinfo->tmgr.atom_max_age <= 0)
16891+ /* overflow */
16892+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
16893+
16894+ /* round optimal io size down to a multiple of 512 bytes */
16895+ sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
16896+ sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
16897+ if (sbinfo->optimal_io_size == 0) {
16898+ warning("nikita-2497", "optimal_io_size is too small");
16899+ return RETERR(-EINVAL);
16900+ }
16901+ return result;
16902+}
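/*
 * Editor's sketch (not part of the patch): the shift pair at the end of
 * reiser4_init_super_data() (x >>= b; x <<= b) clears the low b bits, i.e.
 * rounds x DOWN to a multiple of 2^b. Assuming VFS_BLKSIZE_BITS is 9
 * (512-byte blocks):
 */
#include <stdio.h>

int main(void)
{
        unsigned int x = 1300;

        x >>= 9;
        x <<= 9;           /* equivalent to x &= ~511u */
        printf("%u\n", x); /* prints 1024 -- rounded down, not up */
        return 0;
}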
16903+
16904+/**
16905+ * reiser4_init_read_super - read reiser4 master super block
16906+ * @super: super block to fill
16907+ * @silent: if 0 - print warnings
16908+ *
16909+ * Reads reiser4 master super block either from predefined location or from
16910+ * location specified by altsuper mount option, initializes disk format plugin.
16911+ */
16912+int reiser4_init_read_super(struct super_block *super, int silent)
16913+{
16914+ struct buffer_head *super_bh;
16915+ struct reiser4_master_sb *master_sb;
16916+ reiser4_super_info_data *sbinfo = get_super_private(super);
16917+ unsigned long blocksize;
16918+
16919+ read_super_block:
16920+#ifdef CONFIG_REISER4_BADBLOCKS
16921+ if (sbinfo->altsuper)
16922+ /*
16923+ * read reiser4 master super block at position specified by
16924+ * mount option
16925+ */
16926+ super_bh = sb_bread(super,
16927+ (sector_t)(sbinfo->altsuper / super->s_blocksize));
16928+ else
16929+#endif
16930+ /* read reiser4 master super block at the 16th 4096-byte block */
16931+ super_bh = sb_bread(super,
16932+ (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
16933+ if (!super_bh)
16934+ return RETERR(-EIO);
16935+
16936+ master_sb = (struct reiser4_master_sb *)super_bh->b_data;
16937+ /* check reiser4 magic string */
16938+ if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
16939+ sizeof(REISER4_SUPER_MAGIC_STRING))) {
16940+ /* reiser4 master super block contains filesystem blocksize */
16941+ blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
16942+
16943+ if (blocksize != PAGE_CACHE_SIZE) {
16944+ /*
16945+ * currently reiser4's blocksize must be equal to
16946+ * pagesize
16947+ */
16948+ if (!silent)
16949+ warning("nikita-2609",
16950+ "%s: wrong block size %ld\n", super->s_id,
16951+ blocksize);
16952+ brelse(super_bh);
16953+ return RETERR(-EINVAL);
16954+ }
16955+ if (blocksize != super->s_blocksize) {
16956+ /*
16957+ * filesystem uses different blocksize. Reread master
16958+ * super block with correct blocksize
16959+ */
16960+ brelse(super_bh);
16961+ if (!sb_set_blocksize(super, (int)blocksize))
16962+ return RETERR(-EINVAL);
16963+ goto read_super_block;
16964+ }
16965+
16966+ sbinfo->df_plug =
16967+ disk_format_plugin_by_id(
16968+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16969+ if (sbinfo->df_plug == NULL) {
16970+ if (!silent)
16971+ warning("nikita-26091",
16972+ "%s: unknown disk format plugin %d\n",
16973+ super->s_id,
16974+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16975+ brelse(super_bh);
16976+ return RETERR(-EINVAL);
16977+ }
16978+ sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
16979+ brelse(super_bh);
16980+ return 0;
16981+ }
16982+
16983+ /* there is no reiser4 on the device */
16984+ if (!silent)
16985+ warning("nikita-2608",
16986+ "%s: wrong master super block magic", super->s_id);
16987+ brelse(super_bh);
16988+ return RETERR(-EINVAL);
16989+}
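/*
 * Editor's sketch (not part of the patch): probing for an on-disk magic
 * string at a fixed byte offset, as reiser4_init_read_super() does via
 * sb_bread(). Userspace stand-in using pread(); offset and magic mirror the
 * patch but are illustrative here.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define MAGIC "ReIsEr4"
#define MAGIC_OFFSET 65536 /* the 16th 4096-byte block */

int main(int argc, char **argv)
{
        char buf[sizeof(MAGIC)];
        int fd;

        if (argc != 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0)
                return 1;
        if (pread(fd, buf, sizeof(buf), MAGIC_OFFSET) != sizeof(buf)) {
                close(fd);
                return 1;
        }
        close(fd);
        /* like the kernel code, compare sizeof(MAGIC) bytes, NUL included */
        if (memcmp(buf, MAGIC, sizeof(MAGIC)) == 0)
                printf("found a reiser4 master super block\n");
        else
                printf("no reiser4 magic at offset %d\n", MAGIC_OFFSET);
        return 0;
}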
16990+
16991+static struct {
16992+ reiser4_plugin_type type;
16993+ reiser4_plugin_id id;
16994+} default_plugins[PSET_LAST] = {
16995+ [PSET_FILE] = {
16996+ .type = REISER4_FILE_PLUGIN_TYPE,
16997+ .id = UNIX_FILE_PLUGIN_ID
16998+ },
16999+ [PSET_DIR] = {
17000+ .type = REISER4_DIR_PLUGIN_TYPE,
17001+ .id = HASHED_DIR_PLUGIN_ID
17002+ },
17003+ [PSET_HASH] = {
17004+ .type = REISER4_HASH_PLUGIN_TYPE,
17005+ .id = R5_HASH_ID
17006+ },
17007+ [PSET_FIBRATION] = {
17008+ .type = REISER4_FIBRATION_PLUGIN_TYPE,
17009+ .id = FIBRATION_DOT_O
17010+ },
17011+ [PSET_PERM] = {
17012+ .type = REISER4_PERM_PLUGIN_TYPE,
17013+ .id = NULL_PERM_ID
17014+ },
17015+ [PSET_FORMATTING] = {
17016+ .type = REISER4_FORMATTING_PLUGIN_TYPE,
17017+ .id = SMALL_FILE_FORMATTING_ID
17018+ },
17019+ [PSET_SD] = {
17020+ .type = REISER4_ITEM_PLUGIN_TYPE,
17021+ .id = STATIC_STAT_DATA_ID
17022+ },
17023+ [PSET_DIR_ITEM] = {
17024+ .type = REISER4_ITEM_PLUGIN_TYPE,
17025+ .id = COMPOUND_DIR_ID
17026+ },
17027+ [PSET_CIPHER] = {
17028+ .type = REISER4_CIPHER_PLUGIN_TYPE,
17029+ .id = NONE_CIPHER_ID
17030+ },
17031+ [PSET_DIGEST] = {
17032+ .type = REISER4_DIGEST_PLUGIN_TYPE,
17033+ .id = SHA256_32_DIGEST_ID
17034+ },
17035+ [PSET_COMPRESSION] = {
17036+ .type = REISER4_COMPRESSION_PLUGIN_TYPE,
17037+ .id = LZO1_COMPRESSION_ID
17038+ },
17039+ [PSET_COMPRESSION_MODE] = {
17040+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
17041+ .id = CONVX_COMPRESSION_MODE_ID
17042+ },
17043+ [PSET_CLUSTER] = {
17044+ .type = REISER4_CLUSTER_PLUGIN_TYPE,
17045+ .id = CLUSTER_64K_ID
17046+ },
17047+ [PSET_CREATE] = {
17048+ .type = REISER4_FILE_PLUGIN_TYPE,
17049+ .id = UNIX_FILE_PLUGIN_ID
17050+ }
17051+};
17052+
17053+/* access to default plugin table */
17054+reiser4_plugin *get_default_plugin(pset_member memb)
17055+{
17056+ return plugin_by_id(default_plugins[memb].type,
17057+ default_plugins[memb].id);
17058+}
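/*
 * Editor's sketch (not part of the patch): default_plugins[] above uses C99
 * designated array initializers indexed by an enum, so each slot is tied to
 * its symbolic index and the table survives enum reordering. Standalone
 * miniature (names illustrative):
 */
#include <stdio.h>

enum member { M_FILE, M_DIR, M_HASH, M_LAST };

static const char * const defaults[M_LAST] = {
        [M_FILE] = "unix_file",
        [M_DIR]  = "hashed_dir",
        [M_HASH] = "r5",
};

int main(void)
{
        int m;

        for (m = 0; m < M_LAST; m++)
                printf("member %d -> default \"%s\"\n", m, defaults[m]);
        return 0;
}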
17059+
17060+/**
17061+ * reiser4_init_root_inode - obtain inode of root directory
17062+ * @super: super block of filesystem
17063+ *
17064+ * Obtains the inode of the root directory (reading it from disk), and
17065+ * initializes its plugin set if it was not initialized.
17066+ */
17067+int reiser4_init_root_inode(struct super_block *super)
17068+{
17069+ reiser4_super_info_data *sbinfo = get_super_private(super);
17070+ struct inode *inode;
17071+ int result = 0;
17072+
17073+ inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17074+ if (IS_ERR(inode))
17075+ return RETERR(PTR_ERR(inode));
17076+
17077+ super->s_root = d_alloc_root(inode);
17078+ if (!super->s_root) {
17079+ iput(inode);
17080+ return RETERR(-ENOMEM);
17081+ }
17082+
17083+ super->s_root->d_op = &sbinfo->ops.dentry;
17084+
17085+ if (!is_inode_loaded(inode)) {
17086+ pset_member memb;
17087+ plugin_set *pset;
17088+
17089+ pset = reiser4_inode_data(inode)->pset;
17090+ for (memb = 0; memb < PSET_LAST; ++memb) {
17091+
17092+ if (aset_get(pset, memb) != NULL)
17093+ continue;
17094+
17095+ result = grab_plugin_pset(inode, NULL, memb);
17096+ if (result != 0)
17097+ break;
17098+
17099+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17100+ }
17101+
17102+ if (result == 0) {
17103+ if (REISER4_DEBUG) {
17104+ for (memb = 0; memb < PSET_LAST; ++memb)
17105+ assert("nikita-3500",
17106+ aset_get(pset, memb) != NULL);
17107+ }
17108+ } else
17109+ warning("nikita-3448", "Cannot set plugins of root: %i",
17110+ result);
17111+ reiser4_iget_complete(inode);
17112+
17113+ /* As the default pset kept in the root dir may have been changed
17114+ (length is unknown), call update_sd. */
17115+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
17116+ result = reiser4_grab_space(
17117+ inode_file_plugin(inode)->estimate.update(inode),
17118+ BA_CAN_COMMIT);
17119+
17120+ if (result == 0)
17121+ result = reiser4_update_sd(inode);
17122+
17123+ all_grabbed2free();
17124+ }
17125+ }
17126+
17127+ super->s_maxbytes = MAX_LFS_FILESIZE;
17128+ return result;
17129+}
17130+
17131+/*
17132+ * Local variables:
17133+ * c-indentation-style: "K&R"
17134+ * mode-name: "LC"
17135+ * c-basic-offset: 8
17136+ * tab-width: 8
17137+ * fill-column: 79
17138+ * End:
17139+ */
17140diff -urN linux-2.6.23.orig/fs/reiser4/inode.c linux-2.6.23/fs/reiser4/inode.c
17141--- linux-2.6.23.orig/fs/reiser4/inode.c 1970-01-01 03:00:00.000000000 +0300
17142+++ linux-2.6.23/fs/reiser4/inode.c 2007-12-04 16:49:30.000000000 +0300
17143@@ -0,0 +1,709 @@
17144+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17145+
17146+/* Inode specific operations. */
17147+
17148+#include "forward.h"
17149+#include "debug.h"
17150+#include "key.h"
17151+#include "kassign.h"
17152+#include "coord.h"
17153+#include "seal.h"
17154+#include "dscale.h"
17155+#include "plugin/item/item.h"
17156+#include "plugin/security/perm.h"
17157+#include "plugin/plugin.h"
17158+#include "plugin/object.h"
17159+#include "znode.h"
17160+#include "vfs_ops.h"
17161+#include "inode.h"
17162+#include "super.h"
17163+#include "reiser4.h"
17164+
17165+#include <linux/fs.h> /* for struct super_block, address_space */
17166+
17167+/* return reiser4 internal tree which inode belongs to */
17168+/* Audited by: green(2002.06.17) */
17169+reiser4_tree *reiser4_tree_by_inode(const struct inode *inode /* inode queried */ )
17170+{
17171+ assert("nikita-256", inode != NULL);
17172+ assert("nikita-257", inode->i_sb != NULL);
17173+ return reiser4_get_tree(inode->i_sb);
17174+}
17175+
17176+/* return reiser4-specific inode flags */
17177+static inline unsigned long *inode_flags(const struct inode *const inode)
17178+{
17179+ assert("nikita-2842", inode != NULL);
17180+ return &reiser4_inode_data(inode)->flags;
17181+}
17182+
17183+/* set reiser4-specific flag @f in @inode */
17184+void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17185+{
17186+ assert("nikita-2248", inode != NULL);
17187+ set_bit((int)f, inode_flags(inode));
17188+}
17189+
17190+/* clear reiser4-specific flag @f in @inode */
17191+void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17192+{
17193+ assert("nikita-2250", inode != NULL);
17194+ clear_bit((int)f, inode_flags(inode));
17195+}
17196+
17197+/* true if reiser4-specific flag @f is set in @inode */
17198+int reiser4_inode_get_flag(const struct inode *inode,
17199+ reiser4_file_plugin_flags f)
17200+{
17201+ assert("nikita-2251", inode != NULL);
17202+ return test_bit((int)f, inode_flags(inode));
17203+}
17204+
17205+/* convert oid to inode number */
17206+ino_t oid_to_ino(oid_t oid)
17207+{
17208+ return (ino_t) oid;
17209+}
17210+
17211+/* convert oid to user visible inode number */
17212+ino_t oid_to_uino(oid_t oid)
17213+{
17214+ /* a reiser4 object is uniquely identified by its oid, which is a 64 bit
17215+ quantity. The kernel in-memory inode is indexed (in the hash table) by
17216+ 32 bit i_ino field, but this is not a problem, because there is a
17217+ way to further distinguish inodes with identical inode numbers
17218+ (find_actor supplied to iget()).
17219+
17220+ But user space expects a unique 32 bit inode number. Obviously this
17221+ is impossible. The work-around is to somehow hash the oid into a user
17222+ visible inode number.
17223+ */
17224+ oid_t max_ino = (ino_t) ~ 0;
17225+
17226+ if (REISER4_INO_IS_OID || (oid <= max_ino))
17227+ return oid;
17228+ else
17229+ /* this is remotely similar to the algorithm used to find the next
17230+ pid for a process: after wrap-around, start from some offset
17231+ rather than from 0. The idea is that there are some long-living
17232+ objects with which we don't want to collide.
17233+ */
17234+ return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17235+}
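/*
 * Editor's sketch (not part of the patch): the wrap-around mapping of
 * oid_to_uino() in miniature, with 8-bit "inode numbers" so the arithmetic
 * is easy to follow. The shift reserves a low range for long-lived objects,
 * as the comment above describes; constants are illustrative.
 */
#include <stdio.h>

#define MAX_INO    255u /* stand-in for (ino_t)~0 with a tiny ino_t */
#define UINO_SHIFT 16u  /* stand-in for REISER4_UINO_SHIFT */

static unsigned int tiny_oid_to_uino(unsigned long long oid)
{
        if (oid <= MAX_INO)
                return (unsigned int)oid; /* small oids map 1:1 */
        /* large oids wrap into a range starting at UINO_SHIFT */
        return UINO_SHIFT + (unsigned int)((oid - MAX_INO) & (MAX_INO >> 1));
}

int main(void)
{
        printf("%u\n", tiny_oid_to_uino(200));  /* 200: fits, unchanged */
        printf("%u\n", tiny_oid_to_uino(300));  /* 16 + (45 & 127)  = 61 */
        printf("%u\n", tiny_oid_to_uino(1000)); /* 16 + (745 & 127) = 121 */
        return 0;
}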
17236+
17237+/* check that "inode" is on reiser4 file-system */
17238+int is_reiser4_inode(const struct inode *inode /* inode queried */ )
17239+{
17240+ return inode != NULL && is_reiser4_super(inode->i_sb);
17241+}
17242+
17243+/* Maximal length of a name that can be stored in directory @inode.
17244+
17245+ This is used in checks during file creation and lookup. */
17246+int reiser4_max_filename_len(const struct inode *inode /* inode queried */ )
17247+{
17248+ assert("nikita-287", is_reiser4_inode(inode));
17249+ assert("nikita-1710", inode_dir_item_plugin(inode));
17250+ if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17251+ return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17252+ else
17253+ return 255;
17254+}
17255+
17256+#if REISER4_USE_COLLISION_LIMIT
17257+/* Maximal number of hash collisions for this directory. */
17258+int max_hash_collisions(const struct inode *dir /* inode queried */ )
17259+{
17260+ assert("nikita-1711", dir != NULL);
17261+ return reiser4_inode_data(dir)->plugin.max_collisions;
17262+}
17263+#endif /* REISER4_USE_COLLISION_LIMIT */
17264+
17265+/* Install file, inode, and address_space operation on @inode, depending on
17266+ its mode. */
17267+int setup_inode_ops(struct inode *inode /* inode to initialize */ ,
17268+ reiser4_object_create_data * data /* parameters to create
17269+ * object */ )
17270+{
17271+ reiser4_super_info_data *sinfo;
17272+ file_plugin *fplug;
17273+ dir_plugin *dplug;
17274+
17275+ fplug = inode_file_plugin(inode);
17276+ dplug = inode_dir_plugin(inode);
17277+
17278+ sinfo = get_super_private(inode->i_sb);
17279+
17280+ switch (inode->i_mode & S_IFMT) {
17281+ case S_IFSOCK:
17282+ case S_IFBLK:
17283+ case S_IFCHR:
17284+ case S_IFIFO:
17285+ {
17286+ dev_t rdev; /* to keep gcc happy */
17287+
17288+ assert("vs-46", fplug != NULL);
17289+ /* ugly hack with rdev */
17290+ if (data == NULL) {
17291+ rdev = inode->i_rdev;
17292+ inode->i_rdev = 0;
17293+ } else
17294+ rdev = data->rdev;
17295+ inode->i_blocks = 0;
17296+ assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17297+ inode->i_op = file_plugins[fplug->h.id].inode_ops;
17298+ /* initialize inode->i_fop and inode->i_rdev for block and char
17299+ devices */
17300+ init_special_inode(inode, inode->i_mode, rdev);
17301+ /* all address space operations are null */
17302+ inode->i_mapping->a_ops =
17303+ file_plugins[fplug->h.id].as_ops;
17304+ break;
17305+ }
17306+ case S_IFLNK:
17307+ assert("vs-46", fplug != NULL);
17308+ assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17309+ inode->i_op = file_plugins[fplug->h.id].inode_ops;
17310+ inode->i_fop = NULL;
17311+ /* all address space operations are null */
17312+ inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
17313+ break;
17314+ case S_IFDIR:
17315+ assert("vs-46", dplug != NULL);
17316+ assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17317+ dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17318+ inode->i_op = dir_plugins[dplug->h.id].inode_ops;
17319+ inode->i_fop = dir_plugins[dplug->h.id].file_ops;
17320+ inode->i_mapping->a_ops = dir_plugins[dplug->h.id].as_ops;
17321+ break;
17322+ case S_IFREG:
17323+ assert("vs-46", fplug != NULL);
17324+ assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17325+ fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID));
17326+ inode->i_op = file_plugins[fplug->h.id].inode_ops;
17327+ inode->i_fop = file_plugins[fplug->h.id].file_ops;
17328+ inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
17329+ break;
17330+ default:
17331+ warning("nikita-291", "wrong file mode: %o for %llu",
17332+ inode->i_mode,
17333+ (unsigned long long)get_inode_oid(inode));
17334+ reiser4_make_bad_inode(inode);
17335+ return RETERR(-EINVAL);
17336+ }
17337+ return 0;
17338+}
17339+
17340+/* Initialize inode from disk data. Called with inode locked.
17341+ Return inode locked. */
17342+static int init_inode(struct inode *inode /* inode to initialise */ ,
17343+ coord_t * coord /* coord of stat data */ )
17344+{
17345+ int result;
17346+ item_plugin *iplug;
17347+ void *body;
17348+ int length;
17349+ reiser4_inode *state;
17350+
17351+ assert("nikita-292", coord != NULL);
17352+ assert("nikita-293", inode != NULL);
17353+
17354+ coord_clear_iplug(coord);
17355+ result = zload(coord->node);
17356+ if (result)
17357+ return result;
17358+ iplug = item_plugin_by_coord(coord);
17359+ body = item_body_by_coord(coord);
17360+ length = item_length_by_coord(coord);
17361+
17362+ assert("nikita-295", iplug != NULL);
17363+ assert("nikita-296", body != NULL);
17364+ assert("nikita-297", length > 0);
17365+
17366+ /* inode is under I_LOCK now */
17367+
17368+ state = reiser4_inode_data(inode);
17369+ /* call stat-data plugin method to load sd content into inode */
17370+ result = iplug->s.sd.init_inode(inode, body, length);
17371+ set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug));
17372+ if (result == 0) {
17373+ result = setup_inode_ops(inode, NULL);
17374+ if (result == 0 && inode->i_sb->s_root &&
17375+ inode->i_sb->s_root->d_inode)
17376+ result = finish_pset(inode);
17377+ }
17378+ zrelse(coord->node);
17379+ return result;
17380+}
17381+
17382+/* read `inode' from the disk. This is what was previously in
17383+ reiserfs_read_inode2().
17384+
17385+ Must be called with inode locked. Return inode still locked.
17386+*/
17387+static int read_inode(struct inode *inode /* inode to read from disk */ ,
17388+ const reiser4_key * key /* key of stat data */ ,
17389+ int silent)
17390+{
17391+ int result;
17392+ lock_handle lh;
17393+ reiser4_inode *info;
17394+ coord_t coord;
17395+
17396+ assert("nikita-298", inode != NULL);
17397+ assert("nikita-1945", !is_inode_loaded(inode));
17398+
17399+ info = reiser4_inode_data(inode);
17400+ assert("nikita-300", info->locality_id != 0);
17401+
17402+ coord_init_zero(&coord);
17403+ init_lh(&lh);
17404+ /* locate stat-data in a tree and return znode locked */
17405+ result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17406+ assert("nikita-301", !is_inode_loaded(inode));
17407+ if (result == 0) {
17408+ /* use stat-data plugin to load sd into inode. */
17409+ result = init_inode(inode, &coord);
17410+ if (result == 0) {
17411+ /* initialize stat-data seal */
17412+ spin_lock_inode(inode);
17413+ reiser4_seal_init(&info->sd_seal, &coord, key);
17414+ info->sd_coord = coord;
17415+ spin_unlock_inode(inode);
17416+
17417+ /* call file plugin's method to initialize plugin
17418+ * specific part of inode */
17419+ if (inode_file_plugin(inode)->init_inode_data)
17420+ inode_file_plugin(inode)->init_inode_data(inode,
17421+ NULL,
17422+ 0);
17423+ /* load detached directory cursors for stateless
17424+ * directory readers (NFS). */
17425+ reiser4_load_cursors(inode);
17426+
17427+ /* Check the opened inode for consistency. */
17428+ result =
17429+ get_super_private(inode->i_sb)->df_plug->
17430+ check_open(inode);
17431+ }
17432+ }
17433+ /* lookup_sd() doesn't release coord because we want znode
17434+ stay read-locked while stat-data fields are accessed in
17435+ init_inode() */
17436+ done_lh(&lh);
17437+
17438+ if (result != 0)
17439+ reiser4_make_bad_inode(inode);
17440+ return result;
17441+}
17442+
17443+/* initialise new reiser4 inode being inserted into hash table. */
17444+static int init_locked_inode(struct inode *inode /* new inode */ ,
17445+ void *opaque /* key of stat data passed to the
17446+ * iget5_locked as cookie */ )
17447+{
17448+ reiser4_key *key;
17449+
17450+ assert("nikita-1995", inode != NULL);
17451+ assert("nikita-1996", opaque != NULL);
17452+ key = opaque;
17453+ set_inode_oid(inode, get_key_objectid(key));
17454+ reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17455+ return 0;
17456+}
17457+
17458+/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked().
17459+
17460+ This function is called by iget5_locked() to distinguish reiser4 inodes
17461+ having the same inode numbers. Such inodes can only exist due to some error
17462+ condition. One of them should be bad. Inodes with identical inode numbers
17463+ (objectids) are distinguished by their packing locality.
17464+
17465+*/
17466+static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table to
17467+ * check */ ,
17468+ void *opaque /* "cookie" passed to
17469+ * iget5_locked(). This is stat data
17470+ * key */ )
17471+{
17472+ reiser4_key *key;
17473+
17474+ key = opaque;
17475+ return
17476+ /* oid is unique, so first term is enough, actually. */
17477+ get_inode_oid(inode) == get_key_objectid(key) &&
17478+ /*
17479+ * also, locality should be checked, but locality is stored in
17480+ * the reiser4-specific part of the inode, and actor can be
17481+ * called against arbitrary inode that happened to be in this
17482+ * hash chain. Hence we first have to check that this is
17483+ * reiser4 inode at least. is_reiser4_inode() is probably too
17484+ * early to call, as inode may have ->i_op not yet
17485+ * initialised.
17486+ */
17487+ is_reiser4_super(inode->i_sb) &&
17488+ /*
17489+ * usually objectid is unique, but pseudo files use counter to
17490+ * generate objectid. All pseudo files are placed into special
17491+ * (otherwise unused) locality.
17492+ */
17493+ reiser4_inode_data(inode)->locality_id == get_key_locality(key);
17494+}
17495+
17496+/* hook for kmem_cache_create */
17497+void loading_init_once(reiser4_inode * info)
17498+{
17499+ mutex_init(&info->loading);
17500+}
17501+
17502+/* for reiser4_alloc_inode */
17503+void loading_alloc(reiser4_inode * info)
17504+{
17505+ assert("vs-1717", !mutex_is_locked(&info->loading));
17506+}
17507+
17508+/* for reiser4_destroy */
17509+void loading_destroy(reiser4_inode * info)
17510+{
17511+ assert("vs-1717a", !mutex_is_locked(&info->loading));
17512+}
17513+
17514+static void loading_begin(reiser4_inode * info)
17515+{
17516+ mutex_lock(&info->loading);
17517+}
17518+
17519+static void loading_end(reiser4_inode * info)
17520+{
17521+ mutex_unlock(&info->loading);
17522+}
17523+
17524+/**
17525+ * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
17526+ * @super: super block of filesystem
17527+ * @key: key of inode's stat-data
17528+ * @silent:
17529+ *
17530+ * This is our helper function a la iget(). It is called by
17531+ * lookup_common() and reiser4_read_super(). Returns the inode locked or the error
17532+ * encountered.
17533+ */
17534+struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
17535+ int silent)
17536+{
17537+ struct inode *inode;
17538+ int result;
17539+ reiser4_inode *info;
17540+
17541+ assert("nikita-302", super != NULL);
17542+ assert("nikita-303", key != NULL);
17543+
17544+ result = 0;
17545+
17546+ /* call iget(). Our ->read_inode() is dummy, so this will either
17547+ find inode in cache or return uninitialised inode */
17548+ inode = iget5_locked(super,
17549+ (unsigned long)get_key_objectid(key),
17550+ reiser4_inode_find_actor,
17551+ init_locked_inode, (reiser4_key *) key);
17552+ if (inode == NULL)
17553+ return ERR_PTR(RETERR(-ENOMEM));
17554+ if (is_bad_inode(inode)) {
17555+ warning("nikita-304", "Bad inode found");
17556+ reiser4_print_key("key", key);
17557+ iput(inode);
17558+ return ERR_PTR(RETERR(-EIO));
17559+ }
17560+
17561+ info = reiser4_inode_data(inode);
17562+
17563+ /* The reiser4 inode state bit REISER4_LOADED is used to distinguish a fully
17564+ loaded and initialized inode from a just allocated inode. If the
17565+ REISER4_LOADED bit is not set, reiser4_iget() completes loading under
17566+ info->loading. The place in reiser4 which uses a not yet initialized inode
17567+ is the reiser4 repacker, see repacker-related functions in
17568+ plugin/item/extent.c */
17569+ if (!is_inode_loaded(inode)) {
17570+ loading_begin(info);
17571+ if (!is_inode_loaded(inode)) {
17572+ /* locking: iget5_locked returns locked inode */
17573+ assert("nikita-1941", !is_inode_loaded(inode));
17574+ assert("nikita-1949",
17575+ reiser4_inode_find_actor(inode,
17576+ (reiser4_key *) key));
17577+ /* now, inode has objectid as ->i_ino and locality in
17578+ reiser4-specific part. This is enough for
17579+ read_inode() to read stat data from the disk */
17580+ result = read_inode(inode, key, silent);
17581+ } else
17582+ loading_end(info);
17583+ }
17584+
17585+ if (inode->i_state & I_NEW)
17586+ unlock_new_inode(inode);
17587+
17588+ if (is_bad_inode(inode)) {
17589+ assert("vs-1717", result != 0);
17590+ loading_end(info);
17591+ iput(inode);
17592+ inode = ERR_PTR(result);
17593+ } else if (REISER4_DEBUG) {
17594+ reiser4_key found_key;
17595+
17596+ assert("vs-1717", result == 0);
17597+ build_sd_key(inode, &found_key);
17598+ if (!keyeq(&found_key, key)) {
17599+ warning("nikita-305", "Wrong key in sd");
17600+ reiser4_print_key("sought for", key);
17601+ reiser4_print_key("found", &found_key);
17602+ }
17603+ if (inode->i_nlink == 0) {
17604+ warning("nikita-3559", "Unlinked inode found: %llu\n",
17605+ (unsigned long long)get_inode_oid(inode));
17606+ }
17607+ }
17608+ return inode;
17609+}
17610+
17611+/* reiser4_iget() may return a not fully initialized inode; this function should
17612+ * be called once reiser4 inode initialization completes. */
17613+void reiser4_iget_complete(struct inode *inode)
17614+{
17615+ assert("zam-988", is_reiser4_inode(inode));
17616+
17617+ if (!is_inode_loaded(inode)) {
17618+ reiser4_inode_set_flag(inode, REISER4_LOADED);
17619+ loading_end(reiser4_inode_data(inode));
17620+ }
17621+}
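/*
 * Editor's sketch (not part of the patch): reiser4_iget() performs a
 * check / lock / re-check sequence around info->loading so at most one
 * thread does the expensive load while late arrivals see a finished inode.
 * Generic pthreads rendition (names illustrative; the kernel version pairs
 * the unlocked check with atomic test_bit/set_bit, which this sketch
 * simplifies):
 */
#include <pthread.h>
#include <stdio.h>

struct object {
        int loaded;                 /* analogous to REISER4_LOADED */
        int payload;
        pthread_mutex_t loading;    /* analogous to reiser4_inode->loading */
};

static void load_object(struct object *o)
{
        if (!o->loaded) {                       /* cheap unlocked check */
                pthread_mutex_lock(&o->loading);
                if (!o->loaded) {               /* re-check under the lock */
                        o->payload = 42;        /* expensive "disk read" */
                        o->loaded = 1;
                }
                pthread_mutex_unlock(&o->loading);
        }
}

static void *worker(void *arg)
{
        load_object(arg);
        return NULL;
}

int main(void)
{
        struct object o = { 0, 0, PTHREAD_MUTEX_INITIALIZER };
        pthread_t a, b;

        pthread_create(&a, NULL, worker, &o);
        pthread_create(&b, NULL, worker, &o);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        printf("payload=%d\n", o.payload);      /* loaded exactly once */
        return 0;
}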
17622+
17623+void reiser4_make_bad_inode(struct inode *inode)
17624+{
17625+ assert("nikita-1934", inode != NULL);
17626+
17627+ /* clear LOADED bit */
17628+ reiser4_inode_clr_flag(inode, REISER4_LOADED);
17629+ make_bad_inode(inode);
17630+ return;
17631+}
17632+
17633+file_plugin *inode_file_plugin(const struct inode * inode)
17634+{
17635+ assert("nikita-1997", inode != NULL);
17636+ return reiser4_inode_data(inode)->pset->file;
17637+}
17638+
17639+dir_plugin *inode_dir_plugin(const struct inode * inode)
17640+{
17641+ assert("nikita-1998", inode != NULL);
17642+ return reiser4_inode_data(inode)->pset->dir;
17643+}
17644+
17645+formatting_plugin *inode_formatting_plugin(const struct inode * inode)
17646+{
17647+ assert("nikita-2000", inode != NULL);
17648+ return reiser4_inode_data(inode)->pset->formatting;
17649+}
17650+
17651+hash_plugin *inode_hash_plugin(const struct inode * inode)
17652+{
17653+ assert("nikita-2001", inode != NULL);
17654+ return reiser4_inode_data(inode)->pset->hash;
17655+}
17656+
17657+fibration_plugin *inode_fibration_plugin(const struct inode * inode)
17658+{
17659+ assert("nikita-2001", inode != NULL);
17660+ return reiser4_inode_data(inode)->pset->fibration;
17661+}
17662+
17663+cipher_plugin *inode_cipher_plugin(const struct inode * inode)
17664+{
17665+ assert("edward-36", inode != NULL);
17666+ return reiser4_inode_data(inode)->pset->cipher;
17667+}
17668+
17669+compression_plugin *inode_compression_plugin(const struct inode * inode)
17670+{
17671+ assert("edward-37", inode != NULL);
17672+ return reiser4_inode_data(inode)->pset->compression;
17673+}
17674+
17675+compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
17676+ inode)
17677+{
17678+ assert("edward-1330", inode != NULL);
17679+ return reiser4_inode_data(inode)->pset->compression_mode;
17680+}
17681+
17682+cluster_plugin *inode_cluster_plugin(const struct inode * inode)
17683+{
17684+ assert("edward-1328", inode != NULL);
17685+ return reiser4_inode_data(inode)->pset->cluster;
17686+}
17687+
17688+file_plugin *inode_create_plugin(const struct inode * inode)
17689+{
17690+ assert("edward-1329", inode != NULL);
17691+ return reiser4_inode_data(inode)->pset->create;
17692+}
17693+
17694+digest_plugin *inode_digest_plugin(const struct inode * inode)
17695+{
17696+ assert("edward-86", inode != NULL);
17697+ return reiser4_inode_data(inode)->pset->digest;
17698+}
17699+
17700+item_plugin *inode_sd_plugin(const struct inode * inode)
17701+{
17702+ assert("vs-534", inode != NULL);
17703+ return reiser4_inode_data(inode)->pset->sd;
17704+}
17705+
17706+item_plugin *inode_dir_item_plugin(const struct inode * inode)
17707+{
17708+ assert("vs-534", inode != NULL);
17709+ return reiser4_inode_data(inode)->pset->dir_item;
17710+}
17711+
17712+file_plugin *child_create_plugin(const struct inode * inode)
17713+{
17714+ assert("edward-1329", inode != NULL);
17715+ return reiser4_inode_data(inode)->hset->create;
17716+}
17717+
17718+void inode_set_extension(struct inode *inode, sd_ext_bits ext)
17719+{
17720+ reiser4_inode *state;
17721+
17722+ assert("nikita-2716", inode != NULL);
17723+ assert("nikita-2717", ext < LAST_SD_EXTENSION);
17724+ assert("nikita-3491", spin_inode_is_locked(inode));
17725+
17726+ state = reiser4_inode_data(inode);
17727+ state->extmask |= 1 << ext;
17728+ /* force re-calculation of stat-data length on next call to
17729+ update_sd(). */
17730+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17731+}
17732+
17733+void inode_clr_extension(struct inode *inode, sd_ext_bits ext)
17734+{
17735+ reiser4_inode *state;
17736+
17737+ assert("vpf-1926", inode != NULL);
17738+ assert("vpf-1927", ext < LAST_SD_EXTENSION);
17739+ assert("vpf-1928", spin_inode_is_locked(inode));
17740+
17741+ state = reiser4_inode_data(inode);
17742+ state->extmask &= ~(1 << ext);
17743+ /* force re-calculation of stat-data length on next call to
17744+ update_sd(). */
17745+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17746+}
17747+
17748+void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
17749+{
17750+ assert("edward-1287", inode != NULL);
17751+ if (!dscale_fit(old, new))
17752+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17753+ return;
17754+}
17755+
17756+void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
17757+{
17758+ assert("nikita-2875", inode != NULL);
17759+ spin_lock_inode(inode);
17760+ inode_check_scale_nolock(inode, old, new);
17761+ spin_unlock_inode(inode);
17762+}
17763+
17764+/*
17765+ * initialize ->ordering field of inode. This field defines how file stat-data
17766+ * and body is ordered within a tree with respect to other objects within the
17767+ * same parent directory.
17768+ */
17769+void
17770+init_inode_ordering(struct inode *inode,
17771+ reiser4_object_create_data * crd, int create)
17772+{
17773+ reiser4_key key;
17774+
17775+ if (create) {
17776+ struct inode *parent;
17777+
17778+ parent = crd->parent;
17779+ assert("nikita-3224", inode_dir_plugin(parent) != NULL);
17780+ inode_dir_plugin(parent)->build_entry_key(parent,
17781+ &crd->dentry->d_name,
17782+ &key);
17783+ } else {
17784+ coord_t *coord;
17785+
17786+ coord = &reiser4_inode_data(inode)->sd_coord;
17787+ coord_clear_iplug(coord);
17788+ /* safe to use ->sd_coord, because node is under long term
17789+ * lock */
17790+ WITH_DATA(coord->node, item_key_by_coord(coord, &key));
17791+ }
17792+
17793+ set_inode_ordering(inode, get_key_ordering(&key));
17794+}
17795+
17796+znode *inode_get_vroot(struct inode *inode)
17797+{
17798+ reiser4_block_nr blk;
17799+ znode *result;
17800+
17801+ spin_lock_inode(inode);
17802+ blk = reiser4_inode_data(inode)->vroot;
17803+ spin_unlock_inode(inode);
17804+ if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
17805+ result = zlook(reiser4_tree_by_inode(inode), &blk);
17806+ else
17807+ result = NULL;
17808+ return result;
17809+}
17810+
17811+void inode_set_vroot(struct inode *inode, znode *vroot)
17812+{
17813+ spin_lock_inode(inode);
17814+ reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
17815+ spin_unlock_inode(inode);
17816+}
17817+
17818+#if REISER4_DEBUG
17819+
17820+void reiser4_inode_invariant(const struct inode *inode)
17821+{
17822+ assert("nikita-3077", spin_inode_is_locked(inode));
17823+}
17824+
17825+int inode_has_no_jnodes(reiser4_inode * r4_inode)
17826+{
17827+ return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
17828+ r4_inode->nr_jnodes == 0;
17829+}
17830+
17831+#endif
17832+
17833+/* true if directory is empty (only contains dot and dotdot) */
17834+/* FIXME: shouldn't it be dir plugin method? */
17835+int is_dir_empty(const struct inode *dir)
17836+{
17837+ assert("nikita-1976", dir != NULL);
17838+
17839+ /* rely on our method to maintain directory i_size being equal to the
17840+ number of entries. */
17841+ return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
17842+}
17843+
17844+/* Make Linus happy.
17845+ Local variables:
17846+ c-indentation-style: "K&R"
17847+ mode-name: "LC"
17848+ c-basic-offset: 8
17849+ tab-width: 8
17850+ fill-column: 120
17851+ End:
17852+*/
17853diff -urN linux-2.6.23.orig/fs/reiser4/inode.h linux-2.6.23/fs/reiser4/inode.h
17854--- linux-2.6.23.orig/fs/reiser4/inode.h 1970-01-01 03:00:00.000000000 +0300
17855+++ linux-2.6.23/fs/reiser4/inode.h 2007-12-04 16:49:30.000000000 +0300
17856@@ -0,0 +1,449 @@
17857+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17858+
17859+/* Inode functions. */
17860+
17861+#if !defined( __REISER4_INODE_H__ )
17862+#define __REISER4_INODE_H__
17863+
17864+#include "forward.h"
17865+#include "debug.h"
17866+#include "key.h"
17867+#include "seal.h"
17868+#include "plugin/plugin.h"
17869+#include "plugin/file/cryptcompress.h"
17870+#include "plugin/file/file.h"
17871+#include "plugin/dir/dir.h"
17872+#include "plugin/plugin_set.h"
17873+#include "plugin/security/perm.h"
17874+#include "vfs_ops.h"
17875+#include "jnode.h"
17876+#include "fsdata.h"
17877+
17878+#include <linux/types.h> /* for __u?? , ino_t */
17879+#include <linux/fs.h> /* for struct super_block, struct
17880+ * rw_semaphore, etc */
17881+#include <linux/spinlock.h>
17882+#include <asm/types.h>
17883+
17884+/* reiser4-specific inode flags. They are "transient" and are not
17885+ supposed to be stored on disk. Used to trace "state" of
17886+ inode
17887+*/
17888+typedef enum {
17889+ /* this is light-weight inode, inheriting some state from its
17890+ parent */
17891+ REISER4_LIGHT_WEIGHT = 0,
17892+ /* stat data wasn't yet created */
17893+ REISER4_NO_SD = 1,
17894+ /* internal immutable flag. Currently is only used
17895+ to avoid race condition during file creation.
17896+ See comment in create_object(). */
17897+ REISER4_IMMUTABLE = 2,
17898+ /* inode was read from storage */
17899+ REISER4_LOADED = 3,
17900+ /* this bit is set for symlinks. inode->i_private points to target
17901+ name of symlink. */
17902+ REISER4_GENERIC_PTR_USED = 4,
17903+ /* set if size of stat-data item for this inode is known. If this is
17904+ * set we can avoid recalculating size of stat-data on each update. */
17905+ REISER4_SDLEN_KNOWN = 5,
17906+ /* reiser4_inode->crypt points to the crypto stat */
17907+ REISER4_CRYPTO_STAT_LOADED = 6,
17908+ /* cryptcompress_inode_data points to the secret key */
17909+ REISER4_SECRET_KEY_INSTALLED = 7,
17910+ /* File (possibly) has pages corresponding to the tail items, that
17911+ * were created by ->readpage. It is set by mmap_unix_file() and
17912+ * sendfile_unix_file(). This bit is inspected by write_unix_file and
17913+ * kill-hook of tail items. It is never cleared once set. This bit is
17914+ * modified and inspected under i_mutex. */
17915+ REISER4_HAS_MMAP = 8,
17916+ REISER4_PART_MIXED = 9,
17917+ REISER4_PART_IN_CONV = 10,
17918+ /* This flag indicates that file plugin conversion is in progress */
17919+ REISER4_FILE_CONV_IN_PROGRESS = 11
17920+} reiser4_file_plugin_flags;
17921+
17922+/* state associated with each inode.
17923+ reiser4 inode.
17924+
17925+ NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
17926+ be of the same size. File-system allocates inodes by itself through
17927+ s_op->allocate_inode() method. So, it is possible to adjust size of inode
17928+ at the time of its creation.
17929+
17930+ Invariants involving parts of this data-type:
17931+
17932+ [inode->eflushed]
17933+
17934+*/
17935+
17936+typedef struct reiser4_inode reiser4_inode;
17937+/* return pointer to reiser4-specific part of inode */
17938+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
17939+ /* inode queried */ );
17940+
17941+#if BITS_PER_LONG == 64
17942+
17943+#define REISER4_INO_IS_OID (1)
17944+typedef struct {
17945+} oid_hi_t;
17946+
17947+/* BITS_PER_LONG == 64 */
17948+#else
17949+
17950+#define REISER4_INO_IS_OID (0)
17951+typedef __u32 oid_hi_t;
17952+
17953+/* BITS_PER_LONG == 64 */
17954+#endif
17955+
17956+struct reiser4_inode {
17957+ /* spin lock protecting fields of this structure. */
17958+ spinlock_t guard;
17959+ /* main plugin set that controls the file
17960+ (see comments in plugin/plugin_set.c) */
17961+ plugin_set *pset;
17962+ /* plugin set for inheritance
17963+ (see comments in plugin/plugin_set.c) */
17964+ plugin_set *hset;
17965+ /* high 32 bits of object id */
17966+ oid_hi_t oid_hi;
17967+ /* seal for stat-data */
17968+ seal_t sd_seal;
17969+ /* locality id for this file */
17970+ oid_t locality_id;
17971+#if REISER4_LARGE_KEY
17972+ __u64 ordering;
17973+#endif
17974+ /* coord of stat-data in sealed node */
17975+ coord_t sd_coord;
17976+ /* bit-mask of stat-data extensions used by this file */
17977+ __u64 extmask;
17978+ /* bitmask of non-default plugins for this inode */
17979+ __u16 plugin_mask;
17980+ /* bitmask of set heir plugins for this inode. */
17981+ __u16 heir_mask;
17982+ union {
17983+ struct list_head readdir_list;
17984+ struct list_head not_used;
17985+ } lists;
17986+ /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
17987+ unsigned long flags;
17988+ union {
17989+ /* fields specific to unix_file plugin */
17990+ struct unix_file_info unix_file_info;
17991+ /* fields specific to cryptcompress file plugin */
17992+ struct cryptcompress_info cryptcompress_info;
17993+ } file_plugin_data;
17994+
17995+ /* this semaphore is to serialize readers and writers of @pset->file
17996+ * when file plugin conversion is enabled
17997+ */
17998+ struct rw_semaphore conv_sem;
17999+
18000+ /* tree of jnodes. Phantom jnodes (ones not attached to any atom) are
18001+ tagged in that tree by EFLUSH_TAG_ANONYMOUS */
18002+ struct radix_tree_root jnodes_tree;
18003+#if REISER4_DEBUG
18004+ /* number of unformatted node jnodes of this file in jnode hash table */
18005+ unsigned long nr_jnodes;
18006+#endif
18007+
18008+ /* block number of virtual root for this object. See comment above
18009+ * fs/reiser4/search.c:handle_vroot() */
18010+ reiser4_block_nr vroot;
18011+ struct mutex loading;
18012+};
18013+
18014+void loading_init_once(reiser4_inode *);
18015+void loading_alloc(reiser4_inode *);
18016+void loading_destroy(reiser4_inode *);
18017+
18018+struct reiser4_inode_object {
18019+ /* private part */
18020+ reiser4_inode p;
18021+ /* generic fields not specific to reiser4, but used by VFS */
18022+ struct inode vfs_inode;
18023+};
18024+
18025+/* return pointer to the reiser4 specific portion of @inode */
18026+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18027+ /* inode queried */ )
18028+{
18029+ assert("nikita-254", inode != NULL);
18030+ return &container_of(inode, struct reiser4_inode_object, vfs_inode)->p;
18031+}
18032+
18033+static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
18034+ r4_inode /* inode queried */
18035+ )
18036+{
18037+ return &container_of(r4_inode, struct reiser4_inode_object, p)->vfs_inode;
18038+}
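/*
 * Editor's sketch (not part of the patch): reiser4_inode_data() and
 * inode_by_reiser4_inode() above are the two directions of the container_of
 * idiom -- embed the generic struct inside the private one and convert
 * between them with pointer arithmetic. Standalone version with a local
 * container_of (names illustrative):
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct vfs_inode { unsigned long ino; };

struct private_inode {
        int private_state;
        struct vfs_inode vfs;       /* embedded generic part */
};

static struct private_inode *to_private(struct vfs_inode *inode)
{
        return container_of(inode, struct private_inode, vfs);
}

int main(void)
{
        struct private_inode obj = { .private_state = 7, .vfs = { .ino = 9 } };
        struct vfs_inode *generic = &obj.vfs; /* what the VFS would hand us */

        printf("state=%d ino=%lu\n",
               to_private(generic)->private_state, generic->ino);
        return 0;
}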
18039+
18040+/*
18041+ * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
18042+ * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
18043+ * bits.
18044+ *
18045+ * If ->i_ino is 32 bits we store the remaining 32 bits in the reiser4 specific
18046+ * part of the inode; otherwise the whole oid is stored in i_ino.
18047+ *
18048+ * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
18049+ */
18050+
18051+#define OID_HI_SHIFT (sizeof(ino_t) * 8)
18052+
18053+#if REISER4_INO_IS_OID
18054+
18055+static inline oid_t get_inode_oid(const struct inode *inode)
18056+{
18057+ return inode->i_ino;
18058+}
18059+
18060+static inline void set_inode_oid(struct inode *inode, oid_t oid)
18061+{
18062+ inode->i_ino = oid;
18063+}
18064+
18065+/* REISER4_INO_IS_OID */
18066+#else
18067+
18068+static inline oid_t get_inode_oid(const struct inode *inode)
18069+{
18070+ return
18071+ ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18072+ inode->i_ino;
18073+}
18074+
18075+static inline void set_inode_oid(struct inode *inode, oid_t oid)
18076+{
18077+ assert("nikita-2519", inode != NULL);
18078+ inode->i_ino = (ino_t) (oid);
18079+ reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
18080+ assert("nikita-2521", get_inode_oid(inode) == (oid));
18081+}
18082+
18083+/* REISER4_INO_IS_OID */
18084+#endif
18085+
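+/*
+ * Illustrative sketch (not part of the original patch): on a kernel where
+ * ino_t is 32 bits wide, a 64-bit oid round-trips losslessly through the
+ * wrappers above -- the low half lands in ->i_ino and the high half in
+ * ->oid_hi:
+ *
+ *	oid_t oid = ((oid_t)0x12345678 << OID_HI_SHIFT) | 0x9abcdef0;
+ *
+ *	set_inode_oid(inode, oid);
+ *	assert("", get_inode_oid(inode) == oid);
+ */
+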
18086+static inline oid_t get_inode_locality(const struct inode *inode)
18087+{
18088+ return reiser4_inode_data(inode)->locality_id;
18089+}
18090+
18091+#if REISER4_LARGE_KEY
18092+static inline __u64 get_inode_ordering(const struct inode *inode)
18093+{
18094+ return reiser4_inode_data(inode)->ordering;
18095+}
18096+
18097+static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18098+{
18099+ reiser4_inode_data(inode)->ordering = ordering;
18100+}
18101+
18102+#else
18103+
18104+#define get_inode_ordering(inode) (0)
18105+#define set_inode_ordering(inode, val) noop
18106+
18107+#endif
18108+
18109+/* return inode in which @uf_info is embedded */
18110+static inline struct inode *
18111+unix_file_info_to_inode(const struct unix_file_info * uf_info)
18112+{
18113+ return &container_of(uf_info, struct reiser4_inode_object,
18114+ p.file_plugin_data.unix_file_info)->vfs_inode;
18115+}
18116+
18117+extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18118+extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18119+
18120+extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode);
18121+
18122+#if REISER4_DEBUG
18123+extern void reiser4_inode_invariant(const struct inode *inode);
18124+extern int inode_has_no_jnodes(reiser4_inode *);
18125+#else
18126+#define reiser4_inode_invariant(inode) noop
18127+#endif
18128+
18129+static inline int spin_inode_is_locked(const struct inode *inode)
18130+{
18131+ assert_spin_locked(&reiser4_inode_data(inode)->guard);
18132+ return 1;
18133+}
18134+
18135+/**
18136+ * spin_lock_inode - lock reiser4_inode's embedded spinlock
18137+ * @inode: inode to lock
18138+ *
18139+ * In debug mode it checks that lower priority locks are not held and
18140+ * increments reiser4_context's lock counters on which lock ordering checking
18141+ * is based.
18142+ */
18143+static inline void spin_lock_inode(struct inode *inode)
18144+{
18145+ assert("", LOCK_CNT_NIL(spin_locked));
18146+ /* check lock ordering */
18147+ assert_spin_not_locked(&d_lock);
18148+
18149+ spin_lock(&reiser4_inode_data(inode)->guard);
18150+
18151+ LOCK_CNT_INC(spin_locked_inode);
18152+ LOCK_CNT_INC(spin_locked);
18153+
18154+ reiser4_inode_invariant(inode);
18155+}
18156+
18157+/**
18158+ * spin_unlock_inode - unlock reiser4_inode's embedded spinlock
18159+ * @inode: inode to unlock
18160+ *
18161+ * In debug mode it checks that spinlock is held and decrements
18162+ * reiser4_context's lock counters on which lock ordering checking is based.
18163+ */
18164+static inline void spin_unlock_inode(struct inode *inode)
18165+{
18166+ assert_spin_locked(&reiser4_inode_data(inode)->guard);
18167+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18168+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18169+
18170+ reiser4_inode_invariant(inode);
18171+
18172+ LOCK_CNT_DEC(spin_locked_inode);
18173+ LOCK_CNT_DEC(spin_locked);
18174+
18175+ spin_unlock(&reiser4_inode_data(inode)->guard);
18176+}
18177+
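+/*
+ * Usage sketch (illustration only, not from the original patch): a field
+ * guarded by the inode spinlock is updated inside a
+ * spin_lock_inode()/spin_unlock_inode() pair, e.g.:
+ *
+ *	spin_lock_inode(inode);
+ *	reiser4_inode_data(inode)->extmask |= 1ULL << ext;
+ *	spin_unlock_inode(inode);
+ *
+ * (inode_set_extension(), declared below, is the real helper for this
+ * particular update; the lines above only show the locking discipline.)
+ */
+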
18178+extern znode *inode_get_vroot(struct inode *inode);
18179+extern void inode_set_vroot(struct inode *inode, znode * vroot);
18180+
18181+extern int reiser4_max_filename_len(const struct inode *inode);
18182+extern int max_hash_collisions(const struct inode *dir);
18183+extern void reiser4_unlock_inode(struct inode *inode);
18184+extern int is_reiser4_inode(const struct inode *inode);
18185+extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18186+extern struct inode *reiser4_iget(struct super_block *super,
18187+ const reiser4_key * key, int silent);
18188+extern void reiser4_iget_complete(struct inode *inode);
18189+extern void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
18190+extern void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
18191+extern int reiser4_inode_get_flag(const struct inode *inode,
18192+ reiser4_file_plugin_flags f);
18193+
18194+/* has inode been initialized? */
18195+static inline int
18196+is_inode_loaded(const struct inode *inode /* inode queried */ )
18197+{
18198+ assert("nikita-1120", inode != NULL);
18199+ return reiser4_inode_get_flag(inode, REISER4_LOADED);
18200+}
18201+
18202+extern file_plugin *inode_file_plugin(const struct inode *inode);
18203+extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18204+extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18205+extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18206+extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18207+extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18208+extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18209+extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18210+extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18211+ *inode);
18212+extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18213+extern file_plugin *inode_create_plugin(const struct inode *inode);
18214+extern item_plugin *inode_sd_plugin(const struct inode *inode);
18215+extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18216+extern file_plugin *child_create_plugin(const struct inode *inode);
18217+
18218+extern void reiser4_make_bad_inode(struct inode *inode);
18219+
18220+extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18221+extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext);
18222+extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18223+extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new);
18224+
18225+#define INODE_SET_SIZE(i, value) \
18226+({ \
18227+ struct inode *__i; \
18228+ typeof(value) __v; \
18229+ \
18230+ __i = (i); \
18231+ __v = (value); \
18232+ inode_check_scale(__i, __i->i_size, __v); \
18233+ i_size_write(__i, __v); \
18234+})
18235+
18236+/*
18237+ * update field @field in inode @i to contain value @value.
18238+ */
18239+#define INODE_SET_FIELD(i, field, value) \
18240+({ \
18241+ struct inode *__i; \
18242+ typeof(value) __v; \
18243+ \
18244+ __i = (i); \
18245+ __v = (value); \
18246+ inode_check_scale(__i, __i->field, __v); \
18247+ __i->field = __v; \
18248+})
18249+
18250+#define INODE_INC_FIELD(i, field) \
18251+({ \
18252+ struct inode *__i; \
18253+ \
18254+ __i = (i); \
18255+ inode_check_scale(__i, __i->field, __i->field + 1); \
18256+ ++ __i->field; \
18257+})
18258+
18259+#define INODE_DEC_FIELD(i, field) \
18260+({ \
18261+ struct inode *__i; \
18262+ \
18263+ __i = (i); \
18264+ inode_check_scale(__i, __i->field, __i->field - 1); \
18265+ -- __i->field; \
18266+})
18267+
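+/*
+ * Usage sketch (illustration, not part of the original patch): the macros
+ * above wrap a plain field update with an inode_check_scale() call on the
+ * old and new values, e.g.:
+ *
+ *	INODE_SET_SIZE(inode, new_size);
+ *	INODE_SET_FIELD(inode, i_blocks, blocks);
+ *	INODE_INC_FIELD(inode, i_nlink);
+ */
+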
18268+/* See comment before reiser4_readdir_common() for description. */
18269+static inline struct list_head *get_readdir_list(const struct inode *inode)
18270+{
18271+ return &reiser4_inode_data(inode)->lists.readdir_list;
18272+}
18273+
18274+extern void init_inode_ordering(struct inode *inode,
18275+ reiser4_object_create_data * crd, int create);
18276+
18277+static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18278+{
18279+ return &reiser4_inode_data(inode)->jnodes_tree;
18280+}
18281+
18282+static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
18283+ * r4_inode)
18284+{
18285+ return &r4_inode->jnodes_tree;
18286+}
18287+
18288+#if REISER4_DEBUG
18289+extern void print_inode(const char *prefix, const struct inode *i);
18290+#endif
18291+
18292+int is_dir_empty(const struct inode *);
18293+
18294+/* __REISER4_INODE_H__ */
18295+#endif
18296+
18297+/* Make Linus happy.
18298+ Local variables:
18299+ c-indentation-style: "K&R"
18300+ mode-name: "LC"
18301+ c-basic-offset: 8
18302+ tab-width: 8
18303+ fill-column: 120
18304+ End:
18305+*/
18306diff -urN linux-2.6.23.orig/fs/reiser4/ioctl.h linux-2.6.23/fs/reiser4/ioctl.h
18307--- linux-2.6.23.orig/fs/reiser4/ioctl.h 1970-01-01 03:00:00.000000000 +0300
18308+++ linux-2.6.23/fs/reiser4/ioctl.h 2007-12-04 16:49:30.000000000 +0300
18309@@ -0,0 +1,41 @@
18310+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18311+ * reiser4/README */
18312+
18313+#if !defined( __REISER4_IOCTL_H__ )
18314+#define __REISER4_IOCTL_H__
18315+
18316+#include <linux/fs.h>
18317+
18318+/*
18319+ * ioctl(2) command used to "unpack" a reiser4 file, that is, to convert it
18320+ * into extents and fix it in that state. This is used by applications that rely on
18321+ *
18322+ * . files being block aligned, and
18323+ *
18324+ * . files never migrating on disk
18325+ *
18326+ * for example, boot loaders (LILO) need this.
18327+ *
18328+ * This ioctl should be used as
18329+ *
18330+ * result = ioctl(fd, REISER4_IOC_UNPACK);
18331+ *
18332+ * The file behind the fd descriptor will be converted to extents (if
18333+ * necessary), and its stat-data will be updated so that it will never be
18334+ * converted back into tails again.
18335+ */
18336+#define REISER4_IOC_UNPACK _IOW(0xCD,1,long)
18337+
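+/*
+ * Expanded userland sketch (illustration only, not from the original patch;
+ * the path is a hypothetical example and assumes a process with permission
+ * to open a file on a reiser4 volume, e.g. a boot loader installer):
+ *
+ *	int fd = open("/boot/vmlinuz", O_RDONLY);
+ *	if (fd >= 0) {
+ *		if (ioctl(fd, REISER4_IOC_UNPACK) != 0)
+ *			perror("REISER4_IOC_UNPACK");
+ *		close(fd);
+ *	}
+ */
+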
18338+/* __REISER4_IOCTL_H__ */
18339+#endif
18340+
18341+/* Make Linus happy.
18342+ Local variables:
18343+ c-indentation-style: "K&R"
18344+ mode-name: "LC"
18345+ c-basic-offset: 8
18346+ tab-width: 8
18347+ fill-column: 120
18348+ scroll-step: 1
18349+ End:
18350+*/
18351diff -urN linux-2.6.23.orig/fs/reiser4/jnode.c linux-2.6.23/fs/reiser4/jnode.c
18352--- linux-2.6.23.orig/fs/reiser4/jnode.c 1970-01-01 03:00:00.000000000 +0300
18353+++ linux-2.6.23/fs/reiser4/jnode.c 2007-12-04 16:49:30.000000000 +0300
18354@@ -0,0 +1,1924 @@
18355+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18356+ * reiser4/README */
18357+/* Jnode manipulation functions. */
18358+/* Jnode is an entity used to track blocks with data and meta-data in reiser4.
18359+
18360+ In particular, jnodes are used to track transactional information
18361+   associated with each block. Each znode contains a jnode as its ->zjnode field.
18362+
18363+ Jnode stands for either Josh or Journal node.
18364+*/
18365+
18366+/*
18367+ * Taxonomy.
18368+ *
18369+ * Jnode represents block containing data or meta-data. There are jnodes
18370+ * for:
18371+ *
18372+ *     unformatted blocks (jnodes proper). There are plans, however, to
18373+ *     have a handle per extent unit rather than one per unformatted
18374+ *     block, because there are so many of them.
18375+ *
18376+ *     For bitmaps. Each bitmap is actually represented by two jnodes--one
18377+ *     for working data and another for "commit" data, together forming a bnode.
18378+ *
18379+ * For io-heads. These are used by log writer.
18380+ *
18381+ * For formatted nodes (znode). See comment at the top of znode.c for
18382+ * details specific to the formatted nodes (znodes).
18383+ *
18384+ * Node data.
18385+ *
18386+ * Jnode provides access to the data of node it represents. Data are
18387+ *    stored in a page. Page is kept in the page cache. This means that jnodes
18388+ * are highly interconnected with page cache and VM internals.
18389+ *
18390+ * jnode has a pointer to page (->pg) containing its data. Pointer to data
18391+ * themselves is cached in ->data field to avoid frequent calls to
18392+ * page_address().
18393+ *
18394+ * jnode and page are attached to each other by jnode_attach_page(). This
18395+ *    function stores a pointer to the jnode via set_page_private(), sets the
18396+ *    PG_private flag and increments the page reference counter.
18397+ *
18398+ * Opposite operation is performed by page_clear_jnode().
18399+ *
18400+ * jnode->pg is protected by jnode spin lock, and page->private is
18401+ * protected by page lock. See comment at the top of page_cache.c for
18402+ * more.
18403+ *
18404+ * page can be detached from jnode for two reasons:
18405+ *
18406+ *    . jnode is removed from a tree (file is truncated, or formatted
18407+ * node is removed by balancing).
18408+ *
18409+ * . during memory pressure, VM calls ->releasepage() method
18410+ * (reiser4_releasepage()) to evict page from memory.
18411+ *
18412+ *    (there is, of course, also umount, but this is a special case we are not
18413+ * concerned with here).
18414+ *
18415+ * To protect jnode page from eviction, one calls jload() function that
18416+ * "pins" page in memory (loading it if necessary), increments
18417+ * jnode->d_count, and kmap()s page. Page is unpinned through call to
18418+ * jrelse().
18419+ *
18420+ * Jnode life cycle.
18421+ *
18422+ * jnode is created, placed in hash table, and, optionally, in per-inode
18423+ * radix tree. Page can be attached to jnode, pinned, released, etc.
18424+ *
18425+ * When jnode is captured into atom its reference counter is
18426+ * increased. While being part of an atom, jnode can be "early
18427+ * flushed". This means that as part of flush procedure, jnode is placed
18428+ * into "relocate set", and its page is submitted to the disk. After io
18429+ * completes, page can be detached, then loaded again, re-dirtied, etc.
18430+ *
18431+ *    A thread acquires a reference to a jnode by calling jref() and releases it by
18432+ * jput(). When last reference is removed, jnode is still retained in
18433+ * memory (cached) if it has page attached, _unless_ it is scheduled for
18434+ * destruction (has JNODE_HEARD_BANSHEE bit set).
18435+ *
18436+ * Tree read-write lock was used as "existential" lock for jnodes. That is,
18437+ * jnode->x_count could be changed from 0 to 1 only under tree write lock,
18438+ *    i.e., the tree lock protected unreferenced jnodes stored in the hash
18439+ *    table from recycling.
18440+ *
18441+ * This resulted in high contention on tree lock, because jref()/jput() is
18442+ *    a frequent operation. To ameliorate this problem, RCU is used: when jput()
18443+ * is just about to release last reference on jnode it sets JNODE_RIP bit
18444+ *    on it, and then proceeds with jnode destruction (removing jnode from hash
18445+ * table, cbk_cache, detaching page, etc.). All places that change jnode
18446+ * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
18447+ * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
18448+ * jnode_rip_check() function), and pretend that nothing was found in hash
18449+ * table if bit is set.
18450+ *
18451+ * jput defers actual return of jnode into slab cache to some later time
18452+ *    (by call_rcu()); this guarantees that other threads can safely continue
18453+ * working with JNODE_RIP-ped jnode.
18454+ *
18455+ */
18456+
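+/*
+ * Canonical usage sketch (illustration, not from the original patch):
+ * pinning a jnode's data and releasing everything again. jlookup() takes an
+ * x-reference, jload() pins (and kmaps) the data, jrelse() unpins it, and
+ * jput() drops the x-reference:
+ *
+ *	jnode *node = jlookup(tree, oid, index);
+ *	if (node != NULL) {
+ *		if (jload(node) == 0) {
+ *			... examine node->data ...
+ *			jrelse(node);
+ *		}
+ *		jput(node);
+ *	}
+ */
+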
18457+#include "reiser4.h"
18458+#include "debug.h"
18459+#include "dformat.h"
18460+#include "jnode.h"
18461+#include "plugin/plugin_header.h"
18462+#include "plugin/plugin.h"
18463+#include "txnmgr.h"
18464+/*#include "jnode.h"*/
18465+#include "znode.h"
18466+#include "tree.h"
18467+#include "tree_walk.h"
18468+#include "super.h"
18469+#include "inode.h"
18470+#include "page_cache.h"
18471+
18472+#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */
18473+#include <linux/types.h>
18474+#include <linux/slab.h>
18475+#include <linux/pagemap.h>
18476+#include <linux/swap.h>
18477+#include <linux/fs.h> /* for struct address_space */
18478+#include <linux/writeback.h> /* for inode_lock */
18479+
18480+static struct kmem_cache *_jnode_slab = NULL;
18481+
18482+static void jnode_set_type(jnode * node, jnode_type type);
18483+static int jdelete(jnode * node);
18484+static int jnode_try_drop(jnode * node);
18485+
18486+#if REISER4_DEBUG
18487+static int jnode_invariant(const jnode * node, int tlocked, int jlocked);
18488+#endif
18489+
18490+/* true if valid page is attached to jnode */
18491+static inline int jnode_is_parsed(jnode * node)
18492+{
18493+ return JF_ISSET(node, JNODE_PARSED);
18494+}
18495+
18496+/* hash table support */
18497+
18498+/* compare two jnode keys for equality. Used by hash-table macros */
18499+static inline int jnode_key_eq(const struct jnode_key * k1,
18500+ const struct jnode_key * k2)
18501+{
18502+ assert("nikita-2350", k1 != NULL);
18503+ assert("nikita-2351", k2 != NULL);
18504+
18505+ return (k1->index == k2->index && k1->objectid == k2->objectid);
18506+}
18507+
18508+/* Hash jnode by its key (inode plus offset). Used by hash-table macros */
18509+static inline __u32 jnode_key_hashfn(j_hash_table * table,
18510+ const struct jnode_key * key)
18511+{
18512+ assert("nikita-2352", key != NULL);
18513+ assert("nikita-3346", IS_POW(table->_buckets));
18514+
18515+	/* yes, this is a remarkably simple (if not stupid) hash function. */
18516+ return (key->objectid + key->index) & (table->_buckets - 1);
18517+}
18518+
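+/*
+ * E.g. (illustration, not part of the original patch): with the 16384
+ * buckets used below the mask is 0x3fff, so objectid 42, index 7 hashes
+ * to (42 + 7) & 0x3fff == 49.
+ */
+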
18519+/* The hash table definition */
18520+#define KMALLOC(size) reiser4_vmalloc(size)
18521+#define KFREE(ptr, size) vfree(ptr)
18522+TYPE_SAFE_HASH_DEFINE(j, jnode, struct jnode_key, key.j, link.j,
18523+ jnode_key_hashfn, jnode_key_eq);
18524+#undef KFREE
18525+#undef KMALLOC
18526+
18527+/* call this to initialise jnode hash table */
18528+int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
18529+{
18530+ assert("nikita-2359", tree != NULL);
18531+ return j_hash_init(&tree->jhash_table, 16384);
18532+}
18533+
18534+/* call this to destroy jnode hash table. This is called during umount. */
18535+int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ )
18536+{
18537+ j_hash_table *jtable;
18538+ jnode *node;
18539+ jnode *next;
18540+
18541+ assert("nikita-2360", tree != NULL);
18542+
18543+ /*
18544+ * Scan hash table and free all jnodes.
18545+ */
18546+ jtable = &tree->jhash_table;
18547+ if (jtable->_table) {
18548+ for_all_in_htable(jtable, j, node, next) {
18549+ assert("nikita-2361", !atomic_read(&node->x_count));
18550+ jdrop(node);
18551+ }
18552+
18553+ j_hash_done(&tree->jhash_table);
18554+ }
18555+ return 0;
18556+}
18557+
18558+/**
18559+ * init_jnodes - create jnode cache
18560+ *
18561+ * Initializes the jnode slab cache. It is part of reiser4 module initialization.
18562+ */
18563+int init_jnodes(void)
18564+{
18565+ assert("umka-168", _jnode_slab == NULL);
18566+
18567+ _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
18568+ SLAB_HWCACHE_ALIGN |
18569+ SLAB_RECLAIM_ACCOUNT, NULL);
18570+ if (_jnode_slab == NULL)
18571+ return RETERR(-ENOMEM);
18572+
18573+ return 0;
18574+}
18575+
18576+/**
18577+ * done_jnodes - delete jnode cache
18578+ *
18579+ * This is called on reiser4 module unloading or system shutdown.
18580+ */
18581+void done_jnodes(void)
18582+{
18583+ destroy_reiser4_cache(&_jnode_slab);
18584+}
18585+
18586+/* Initialize a jnode. */
18587+void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
18588+{
18589+ assert("umka-175", node != NULL);
18590+
18591+ memset(node, 0, sizeof(jnode));
18592+ ON_DEBUG(node->magic = JMAGIC);
18593+ jnode_set_type(node, type);
18594+ atomic_set(&node->d_count, 0);
18595+ atomic_set(&node->x_count, 0);
18596+ spin_lock_init(&node->guard);
18597+ spin_lock_init(&node->load);
18598+ node->atom = NULL;
18599+ node->tree = tree;
18600+ INIT_LIST_HEAD(&node->capture_link);
18601+
18602+ ASSIGN_NODE_LIST(node, NOT_CAPTURED);
18603+
18604+ INIT_RCU_HEAD(&node->rcu);
18605+
18606+#if REISER4_DEBUG
18607+ {
18608+ reiser4_super_info_data *sbinfo;
18609+
18610+ sbinfo = get_super_private(tree->super);
18611+ spin_lock_irq(&sbinfo->all_guard);
18612+ list_add(&node->jnodes, &sbinfo->all_jnodes);
18613+ spin_unlock_irq(&sbinfo->all_guard);
18614+ }
18615+#endif
18616+}
18617+
18618+#if REISER4_DEBUG
18619+/*
18620+ * Remove jnode from ->all_jnodes list.
18621+ */
18622+static void jnode_done(jnode * node, reiser4_tree * tree)
18623+{
18624+ reiser4_super_info_data *sbinfo;
18625+
18626+ sbinfo = get_super_private(tree->super);
18627+
18628+ spin_lock_irq(&sbinfo->all_guard);
18629+ assert("nikita-2422", !list_empty(&node->jnodes));
18630+ list_del_init(&node->jnodes);
18631+ spin_unlock_irq(&sbinfo->all_guard);
18632+}
18633+#endif
18634+
18635+/* return already existing jnode of page */
18636+jnode *jnode_by_page(struct page *pg)
18637+{
18638+ assert("nikita-2066", pg != NULL);
18639+ assert("nikita-2400", PageLocked(pg));
18640+ assert("nikita-2068", PagePrivate(pg));
18641+ assert("nikita-2067", jprivate(pg) != NULL);
18642+ return jprivate(pg);
18643+}
18644+
18645+/* exported functions to allocate/free jnode objects outside this file */
18646+jnode *jalloc(void)
18647+{
18648+ jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get());
18649+ return jal;
18650+}
18651+
18652+/* return jnode back to the slab allocator */
18653+inline void jfree(jnode * node)
18654+{
18655+ assert("zam-449", node != NULL);
18656+
18657+ assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
18658+ NODE_LIST(node) == NOT_CAPTURED));
18659+ assert("nikita-3222", list_empty(&node->jnodes));
18660+ assert("nikita-3221", jnode_page(node) == NULL);
18661+
18662+ /* not yet phash_jnode_destroy(node); */
18663+
18664+ kmem_cache_free(_jnode_slab, node);
18665+}
18666+
18667+/*
18668+ * This function is supplied as RCU callback. It actually frees jnode when
18669+ * last reference to it is gone.
18670+ */
18671+static void jnode_free_actor(struct rcu_head *head)
18672+{
18673+ jnode *node;
18674+ jnode_type jtype;
18675+
18676+ node = container_of(head, jnode, rcu);
18677+ jtype = jnode_get_type(node);
18678+
18679+ ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
18680+
18681+ switch (jtype) {
18682+ case JNODE_IO_HEAD:
18683+ case JNODE_BITMAP:
18684+ case JNODE_UNFORMATTED_BLOCK:
18685+ jfree(node);
18686+ break;
18687+ case JNODE_FORMATTED_BLOCK:
18688+ zfree(JZNODE(node));
18689+ break;
18690+ case JNODE_INODE:
18691+ default:
18692+ wrong_return_value("nikita-3197", "Wrong jnode type");
18693+ }
18694+}
18695+
18696+/*
18697+ * Free a jnode. Post a callback to be executed later through RCU when all
18698+ * references to @node are released.
18699+ */
18700+static inline void jnode_free(jnode * node, jnode_type jtype)
18701+{
18702+ if (jtype != JNODE_INODE) {
18703+ /*assert("nikita-3219", list_empty(&node->rcu.list)); */
18704+ call_rcu(&node->rcu, jnode_free_actor);
18705+ } else
18706+ jnode_list_remove(node);
18707+}
18708+
18709+/* allocate new unformatted jnode */
18710+static jnode *jnew_unformatted(void)
18711+{
18712+ jnode *jal;
18713+
18714+ jal = jalloc();
18715+ if (jal == NULL)
18716+ return NULL;
18717+
18718+ jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
18719+ jal->key.j.mapping = NULL;
18720+ jal->key.j.index = (unsigned long)-1;
18721+ jal->key.j.objectid = 0;
18722+ return jal;
18723+}
18724+
18725+/* look for jnode with given mapping and offset within hash table */
18726+jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
18727+{
18728+ struct jnode_key jkey;
18729+ jnode *node;
18730+
18731+ assert("nikita-2353", tree != NULL);
18732+
18733+ jkey.objectid = objectid;
18734+ jkey.index = index;
18735+
18736+ /*
18737+ * hash table is _not_ protected by any lock during lookups. All we
18738+ * have to do is to disable preemption to keep RCU happy.
18739+ */
18740+
18741+ rcu_read_lock();
18742+ node = j_hash_find(&tree->jhash_table, &jkey);
18743+ if (node != NULL) {
18744+ /* protect @node from recycling */
18745+ jref(node);
18746+ assert("nikita-2955", jnode_invariant(node, 0, 0));
18747+ node = jnode_rip_check(tree, node);
18748+ }
18749+ rcu_read_unlock();
18750+ return node;
18751+}
18752+
18753+/* per inode radix tree of jnodes is protected by tree's read write spin lock */
18754+static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
18755+{
18756+ assert("vs-1694", mapping->host != NULL);
18757+
18758+ return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
18759+}
18760+
18761+jnode *jfind(struct address_space * mapping, unsigned long index)
18762+{
18763+ reiser4_tree *tree;
18764+ jnode *node;
18765+
18766+ assert("vs-1694", mapping->host != NULL);
18767+ tree = reiser4_tree_by_inode(mapping->host);
18768+
18769+ read_lock_tree(tree);
18770+ node = jfind_nolock(mapping, index);
18771+ if (node != NULL)
18772+ jref(node);
18773+ read_unlock_tree(tree);
18774+ return node;
18775+}
18776+
18777+static void inode_attach_jnode(jnode * node)
18778+{
18779+ struct inode *inode;
18780+ reiser4_inode *info;
18781+ struct radix_tree_root *rtree;
18782+
18783+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18784+ assert("zam-1043", node->key.j.mapping != NULL);
18785+ inode = node->key.j.mapping->host;
18786+ info = reiser4_inode_data(inode);
18787+ rtree = jnode_tree_by_reiser4_inode(info);
18788+ if (rtree->rnode == NULL) {
18789+ /* prevent inode from being pruned when it has jnodes attached
18790+ to it */
18791+ write_lock_irq(&inode->i_data.tree_lock);
18792+ inode->i_data.nrpages++;
18793+ write_unlock_irq(&inode->i_data.tree_lock);
18794+ }
18795+ assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
18796+ check_me("zam-1045",
18797+ !radix_tree_insert(rtree, node->key.j.index, node));
18798+ ON_DEBUG(info->nr_jnodes++);
18799+}
18800+
18801+static void inode_detach_jnode(jnode * node)
18802+{
18803+ struct inode *inode;
18804+ reiser4_inode *info;
18805+ struct radix_tree_root *rtree;
18806+
18807+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18808+ assert("zam-1044", node->key.j.mapping != NULL);
18809+ inode = node->key.j.mapping->host;
18810+ info = reiser4_inode_data(inode);
18811+ rtree = jnode_tree_by_reiser4_inode(info);
18812+
18813+ assert("zam-1051", info->nr_jnodes != 0);
18814+ assert("zam-1052", rtree->rnode != NULL);
18815+ ON_DEBUG(info->nr_jnodes--);
18816+
18817+ /* delete jnode from inode's radix tree of jnodes */
18818+ check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
18819+ if (rtree->rnode == NULL) {
18820+ /* inode can be pruned now */
18821+ write_lock_irq(&inode->i_data.tree_lock);
18822+ inode->i_data.nrpages--;
18823+ write_unlock_irq(&inode->i_data.tree_lock);
18824+ }
18825+}
18826+
18827+/* put jnode into hash table (where it can be found by flush, which does not
18828+   know the mapping) and into inode's tree of jnodes (where it can be found,
18829+   hopefully faster, in places where the mapping is known). Currently it is used by
18830+ fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is
18831+ created */
18832+static void
18833+hash_unformatted_jnode(jnode * node, struct address_space *mapping,
18834+ unsigned long index)
18835+{
18836+ j_hash_table *jtable;
18837+
18838+ assert("vs-1446", jnode_is_unformatted(node));
18839+ assert("vs-1442", node->key.j.mapping == 0);
18840+ assert("vs-1443", node->key.j.objectid == 0);
18841+ assert("vs-1444", node->key.j.index == (unsigned long)-1);
18842+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18843+
18844+ node->key.j.mapping = mapping;
18845+ node->key.j.objectid = get_inode_oid(mapping->host);
18846+ node->key.j.index = index;
18847+
18848+ jtable = &jnode_get_tree(node)->jhash_table;
18849+
18850+ /* race with some other thread inserting jnode into the hash table is
18851+ * impossible, because we keep the page lock. */
18852+ /*
18853+ * following assertion no longer holds because of RCU: it is possible
18854+ * jnode is in the hash table, but with JNODE_RIP bit set.
18855+ */
18856+ /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
18857+ j_hash_insert_rcu(jtable, node);
18858+ inode_attach_jnode(node);
18859+}
18860+
18861+static void unhash_unformatted_node_nolock(jnode * node)
18862+{
18863+ assert("vs-1683", node->key.j.mapping != NULL);
18864+ assert("vs-1684",
18865+ node->key.j.objectid ==
18866+ get_inode_oid(node->key.j.mapping->host));
18867+
18868+ /* remove jnode from hash-table */
18869+ j_hash_remove_rcu(&node->tree->jhash_table, node);
18870+ inode_detach_jnode(node);
18871+ node->key.j.mapping = NULL;
18872+ node->key.j.index = (unsigned long)-1;
18873+ node->key.j.objectid = 0;
18874+
18875+}
18876+
18877+/* remove jnode from hash table and from inode's tree of jnodes. This is used in
18878+ reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
18879+ reiser4_uncapture_jnode */
18880+void unhash_unformatted_jnode(jnode * node)
18881+{
18882+ assert("vs-1445", jnode_is_unformatted(node));
18883+
18884+ write_lock_tree(node->tree);
18885+ unhash_unformatted_node_nolock(node);
18886+ write_unlock_tree(node->tree);
18887+}
18888+
18889+/*
18890+ * search hash table for a jnode with given oid and index. If not found,
18891+ * allocate new jnode, insert it, and also insert into radix tree for the
18892+ * given inode/mapping.
18893+ */
18894+static jnode *find_get_jnode(reiser4_tree * tree,
18895+ struct address_space *mapping,
18896+ oid_t oid, unsigned long index)
18897+{
18898+ jnode *result;
18899+ jnode *shadow;
18900+ int preload;
18901+
18902+ result = jnew_unformatted();
18903+
18904+ if (unlikely(result == NULL))
18905+ return ERR_PTR(RETERR(-ENOMEM));
18906+
18907+ preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
18908+ if (preload != 0)
18909+ return ERR_PTR(preload);
18910+
18911+ write_lock_tree(tree);
18912+ shadow = jfind_nolock(mapping, index);
18913+ if (likely(shadow == NULL)) {
18914+ /* add new jnode to hash table and inode's radix tree of jnodes */
18915+ jref(result);
18916+ hash_unformatted_jnode(result, mapping, index);
18917+ } else {
18918+ /* jnode is found in inode's radix tree of jnodes */
18919+ jref(shadow);
18920+ jnode_free(result, JNODE_UNFORMATTED_BLOCK);
18921+ assert("vs-1498", shadow->key.j.mapping == mapping);
18922+ result = shadow;
18923+ }
18924+ write_unlock_tree(tree);
18925+
18926+ assert("nikita-2955",
18927+ ergo(result != NULL, jnode_invariant(result, 0, 0)));
18928+ radix_tree_preload_end();
18929+ return result;
18930+}
18931+
18932+/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
18933+ creates) jnode corresponding to page @pg. jnode is attached to page and
18934+ inserted into jnode hash-table. */
18935+static jnode *do_jget(reiser4_tree * tree, struct page *pg)
18936+{
18937+ /*
18938+ * There are two ways to create jnode: starting with pre-existing page
18939+ * and without page.
18940+ *
18941+ * When page already exists, jnode is created
18942+ * (jnode_of_page()->do_jget()) under page lock. This is done in
18943+ * ->writepage(), or when capturing anonymous page dirtied through
18944+ * mmap.
18945+ *
18946+ * Jnode without page is created by index_extent_jnode().
18947+ *
18948+ */
18949+
18950+ jnode *result;
18951+ oid_t oid = get_inode_oid(pg->mapping->host);
18952+
18953+ assert("umka-176", pg != NULL);
18954+ assert("nikita-2394", PageLocked(pg));
18955+
18956+ result = jprivate(pg);
18957+ if (likely(result != NULL))
18958+ return jref(result);
18959+
18960+ tree = reiser4_tree_by_page(pg);
18961+
18962+ /* check hash-table first */
18963+ result = jfind(pg->mapping, pg->index);
18964+ if (unlikely(result != NULL)) {
18965+ spin_lock_jnode(result);
18966+ jnode_attach_page(result, pg);
18967+ spin_unlock_jnode(result);
18968+ result->key.j.mapping = pg->mapping;
18969+ return result;
18970+ }
18971+
18972+ /* since page is locked, jnode should be allocated with GFP_NOFS flag */
18973+ reiser4_ctx_gfp_mask_force(GFP_NOFS);
18974+ result = find_get_jnode(tree, pg->mapping, oid, pg->index);
18975+ if (unlikely(IS_ERR(result)))
18976+ return result;
18977+ /* attach jnode to page */
18978+ spin_lock_jnode(result);
18979+ jnode_attach_page(result, pg);
18980+ spin_unlock_jnode(result);
18981+ return result;
18982+}
18983+
18984+/*
18985+ * return jnode for @pg, creating it if necessary.
18986+ */
18987+jnode *jnode_of_page(struct page * pg)
18988+{
18989+ jnode *result;
18990+
18991+ assert("umka-176", pg != NULL);
18992+ assert("nikita-2394", PageLocked(pg));
18993+
18994+ result = do_jget(reiser4_tree_by_page(pg), pg);
18995+
18996+ if (REISER4_DEBUG && !IS_ERR(result)) {
18997+ assert("nikita-3210", result == jprivate(pg));
18998+ assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
18999+ if (jnode_is_unformatted(jprivate(pg))) {
19000+ assert("nikita-2364",
19001+ jprivate(pg)->key.j.index == pg->index);
19002+ assert("nikita-2367",
19003+ jprivate(pg)->key.j.mapping == pg->mapping);
19004+ assert("nikita-2365",
19005+ jprivate(pg)->key.j.objectid ==
19006+ get_inode_oid(pg->mapping->host));
19007+ assert("vs-1200",
19008+ jprivate(pg)->key.j.objectid ==
19009+ pg->mapping->host->i_ino);
19010+ assert("nikita-2356",
19011+ jnode_is_unformatted(jnode_by_page(pg)));
19012+ }
19013+ assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
19014+ }
19015+ return result;
19016+}
19017+
19018+/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
19019+ * page.*/
19020+void jnode_attach_page(jnode * node, struct page *pg)
19021+{
19022+ assert("nikita-2060", node != NULL);
19023+ assert("nikita-2061", pg != NULL);
19024+
19025+ assert("nikita-2050", jprivate(pg) == 0ul);
19026+ assert("nikita-2393", !PagePrivate(pg));
19027+ assert("vs-1741", node->pg == NULL);
19028+
19029+ assert("nikita-2396", PageLocked(pg));
19030+ assert_spin_locked(&(node->guard));
19031+
19032+ page_cache_get(pg);
19033+ set_page_private(pg, (unsigned long)node);
19034+ node->pg = pg;
19035+ SetPagePrivate(pg);
19036+}
19037+
19038+/* Dual to jnode_attach_page: break a binding between page and jnode */
19039+void page_clear_jnode(struct page *page, jnode * node)
19040+{
19041+ assert("nikita-2424", page != NULL);
19042+ assert("nikita-2425", PageLocked(page));
19043+ assert("nikita-2426", node != NULL);
19044+ assert_spin_locked(&(node->guard));
19045+ assert("nikita-2428", PagePrivate(page));
19046+
19047+ assert("nikita-3551", !PageWriteback(page));
19048+
19049+ JF_CLR(node, JNODE_PARSED);
19050+ set_page_private(page, 0ul);
19051+ ClearPagePrivate(page);
19052+ node->pg = NULL;
19053+ page_cache_release(page);
19054+}
19055+
19056+#if 0
19057+/* it is only used in one place to handle error */
19058+void
19059+page_detach_jnode(struct page *page, struct address_space *mapping,
19060+ unsigned long index)
19061+{
19062+ assert("nikita-2395", page != NULL);
19063+
19064+ lock_page(page);
19065+ if ((page->mapping == mapping) && (page->index == index)
19066+ && PagePrivate(page)) {
19067+ jnode *node;
19068+
19069+ node = jprivate(page);
19070+ spin_lock_jnode(node);
19071+ page_clear_jnode(page, node);
19072+ spin_unlock_jnode(node);
19073+ }
19074+ unlock_page(page);
19075+}
19076+#endif /* 0 */
19077+
19078+/* return @node page locked.
19079+
19080+   Lock ordering requires that one first takes the page lock and afterwards
19081+   the spin lock on the node attached to this page. Sometimes it is necessary to go in
19082+ the opposite direction. This is done through standard trylock-and-release
19083+ loop.
19084+*/
19085+static struct page *jnode_lock_page(jnode * node)
19086+{
19087+ struct page *page;
19088+
19089+ assert("nikita-2052", node != NULL);
19090+ assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19091+
19092+ while (1) {
19093+
19094+ spin_lock_jnode(node);
19095+ page = jnode_page(node);
19096+ if (page == NULL) {
19097+ break;
19098+ }
19099+
19100+ /* no need to page_cache_get( page ) here, because page cannot
19101+ be evicted from memory without detaching it from jnode and
19102+ this requires spin lock on jnode that we already hold.
19103+ */
19104+ if (!TestSetPageLocked(page)) {
19105+ /* We won a lock on jnode page, proceed. */
19106+ break;
19107+ }
19108+
19109+ /* Page is locked by someone else. */
19110+ page_cache_get(page);
19111+ spin_unlock_jnode(node);
19112+ wait_on_page_locked(page);
19113+ /* it is possible that page was detached from jnode and
19114+ returned to the free pool, or re-assigned while we were
19115+ waiting on locked bit. This will be rechecked on the next
19116+ loop iteration.
19117+ */
19118+ page_cache_release(page);
19119+
19120+ /* try again */
19121+ }
19122+ return page;
19123+}
19124+
19125+/*
19126+ * if JNODE_PARSED bit is not set, call ->parse() method of jnode to verify
19127+ * validity of jnode content.
19128+ */
19129+static inline int jparse(jnode * node)
19130+{
19131+ int result;
19132+
19133+ assert("nikita-2466", node != NULL);
19134+
19135+ spin_lock_jnode(node);
19136+ if (likely(!jnode_is_parsed(node))) {
19137+ result = jnode_ops(node)->parse(node);
19138+ if (likely(result == 0))
19139+ JF_SET(node, JNODE_PARSED);
19140+ } else
19141+ result = 0;
19142+ spin_unlock_jnode(node);
19143+ return result;
19144+}
19145+
19146+/* Lock a page attached to jnode; create and attach a page to the jnode if it
19147+ * had none. */
19148+static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19149+{
19150+ struct page *page;
19151+
19152+ spin_lock_jnode(node);
19153+ page = jnode_page(node);
19154+
19155+ if (page == NULL) {
19156+ spin_unlock_jnode(node);
19157+ page = find_or_create_page(jnode_get_mapping(node),
19158+ jnode_get_index(node), gfp_flags);
19159+ if (page == NULL)
19160+ return ERR_PTR(RETERR(-ENOMEM));
19161+ } else {
19162+ if (!TestSetPageLocked(page)) {
19163+ spin_unlock_jnode(node);
19164+ return page;
19165+ }
19166+ page_cache_get(page);
19167+ spin_unlock_jnode(node);
19168+ lock_page(page);
19169+ assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19170+ }
19171+
19172+ spin_lock_jnode(node);
19173+ if (!jnode_page(node))
19174+ jnode_attach_page(node, page);
19175+ spin_unlock_jnode(node);
19176+
19177+ page_cache_release(page);
19178+ assert("zam-894", jnode_page(node) == page);
19179+ return page;
19180+}
19181+
19182+/* Start read operation for jnode's page if page is not up-to-date. */
19183+static int jnode_start_read(jnode * node, struct page *page)
19184+{
19185+ assert("zam-893", PageLocked(page));
19186+
19187+ if (PageUptodate(page)) {
19188+ unlock_page(page);
19189+ return 0;
19190+ }
19191+ return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get());
19192+}
19193+
19194+#if REISER4_DEBUG
19195+static void check_jload(jnode * node, struct page *page)
19196+{
19197+ if (jnode_is_znode(node)) {
19198+ node40_header *nh;
19199+ znode *z;
19200+
19201+ z = JZNODE(node);
19202+ if (znode_is_any_locked(z)) {
19203+ nh = (node40_header *) kmap(page);
19204+ /* this only works for node40-only file systems. For
19205+ * debugging. */
19206+ assert("nikita-3253",
19207+ z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19208+ kunmap(page);
19209+ }
19210+ assert("nikita-3565", znode_invariant(z));
19211+ }
19212+}
19213+#else
19214+#define check_jload(node, page) noop
19215+#endif
19216+
19217+/* prefetch jnode to speed up next call to jload. Call this when you are going
19218+ * to call jload() shortly. This will bring appropriate portion of jnode into
19219+ * CPU cache. */
19220+void jload_prefetch(jnode * node)
19221+{
19222+ prefetchw(&node->x_count);
19223+}
19224+
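+/*
+ * Usage sketch (illustration, not from the original patch; nodes[] and nr
+ * are hypothetical): when a batch of jnodes is about to be loaded, prefetch
+ * each one first so the cache misses overlap with useful work:
+ *
+ *	for (i = 0; i < nr; i++)
+ *		jload_prefetch(nodes[i]);
+ *	for (i = 0; i < nr && result == 0; i++)
+ *		result = jload(nodes[i]);
+ */
+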
19225+/* load jnode's data into memory */
19226+int jload_gfp(jnode * node /* node to load */ ,
19227+ gfp_t gfp_flags /* allocation flags */ ,
19228+ int do_kmap /* true if page should be kmapped */ )
19229+{
19230+ struct page *page;
19231+ int result = 0;
19232+ int parsed;
19233+
19234+ assert("nikita-3010", reiser4_schedulable());
19235+
19236+ prefetchw(&node->pg);
19237+
19238+ /* taking d-reference implies taking x-reference. */
19239+ jref(node);
19240+
19241+ /*
19242+	 * acquiring d-reference to @jnode and checking for JNODE_PARSED bit
19243+ * should be atomic, otherwise there is a race against
19244+ * reiser4_releasepage().
19245+ */
19246+ spin_lock(&(node->load));
19247+ add_d_ref(node);
19248+ parsed = jnode_is_parsed(node);
19249+ spin_unlock(&(node->load));
19250+
19251+ if (unlikely(!parsed)) {
19252+ page = jnode_get_page_locked(node, gfp_flags);
19253+ if (unlikely(IS_ERR(page))) {
19254+ result = PTR_ERR(page);
19255+ goto failed;
19256+ }
19257+
19258+ result = jnode_start_read(node, page);
19259+ if (unlikely(result != 0))
19260+ goto failed;
19261+
19262+ wait_on_page_locked(page);
19263+ if (unlikely(!PageUptodate(page))) {
19264+ result = RETERR(-EIO);
19265+ goto failed;
19266+ }
19267+
19268+ if (do_kmap)
19269+ node->data = kmap(page);
19270+
19271+ result = jparse(node);
19272+ if (unlikely(result != 0)) {
19273+ if (do_kmap)
19274+ kunmap(page);
19275+ goto failed;
19276+ }
19277+ check_jload(node, page);
19278+ } else {
19279+ page = jnode_page(node);
19280+ check_jload(node, page);
19281+ if (do_kmap)
19282+ node->data = kmap(page);
19283+ }
19284+
19285+ if (!is_writeout_mode())
19286+ /* We do not mark pages active if jload is called as a part of
19287+ * jnode_flush() or reiser4_write_logs(). Both jnode_flush()
19288+ * and write_logs() add no value to cached data, there is no
19289+		 * sense in marking pages as active when they go to disk; it just
19290+ * confuses vm scanning routines because clean page could be
19291+ * moved out from inactive list as a result of this
19292+ * mark_page_accessed() call. */
19293+ mark_page_accessed(page);
19294+
19295+ return 0;
19296+
19297+ failed:
19298+ jrelse_tail(node);
19299+ return result;
19300+
19301+}
19302+
19303+/* start asynchronous reading for given jnode's page. */
19304+int jstartio(jnode * node)
19305+{
19306+ struct page *page;
19307+
19308+ page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
19309+ if (IS_ERR(page))
19310+ return PTR_ERR(page);
19311+
19312+ return jnode_start_read(node, page);
19313+}
19314+
19315+/* Initialize a node by calling appropriate plugin instead of reading
19316+ * node from disk as in jload(). */
19317+int jinit_new(jnode * node, gfp_t gfp_flags)
19318+{
19319+ struct page *page;
19320+ int result;
19321+
19322+ jref(node);
19323+ add_d_ref(node);
19324+
19325+ page = jnode_get_page_locked(node, gfp_flags);
19326+ if (IS_ERR(page)) {
19327+ result = PTR_ERR(page);
19328+ goto failed;
19329+ }
19330+
19331+ SetPageUptodate(page);
19332+ unlock_page(page);
19333+
19334+ node->data = kmap(page);
19335+
19336+ if (!jnode_is_parsed(node)) {
19337+ jnode_plugin *jplug = jnode_ops(node);
19338+ spin_lock_jnode(node);
19339+ result = jplug->init(node);
19340+ spin_unlock_jnode(node);
19341+ if (result) {
19342+ kunmap(page);
19343+ goto failed;
19344+ }
19345+ JF_SET(node, JNODE_PARSED);
19346+ }
19347+
19348+ return 0;
19349+
19350+ failed:
19351+ jrelse(node);
19352+ return result;
19353+}
19354+
19355+/* release a reference to jnode acquired by jload(), decrement ->d_count */
19356+void jrelse_tail(jnode * node /* jnode to release references to */ )
19357+{
19358+ assert("nikita-489", atomic_read(&node->d_count) > 0);
19359+ atomic_dec(&node->d_count);
19360+ /* release reference acquired in jload_gfp() or jinit_new() */
19361+ jput(node);
19362+ if (jnode_is_unformatted(node) || jnode_is_znode(node))
19363+ LOCK_CNT_DEC(d_refs);
19364+}
19365+
19366+/* drop reference to node data. When last reference is dropped, data are
19367+ unloaded. */
19368+void jrelse(jnode * node /* jnode to release references to */ )
19369+{
19370+ struct page *page;
19371+
19372+ assert("nikita-487", node != NULL);
19373+ assert_spin_not_locked(&(node->guard));
19374+
19375+ page = jnode_page(node);
19376+ if (likely(page != NULL)) {
19377+ /*
19378+ * it is safe not to lock jnode here, because at this point
19379+ * @node->d_count is greater than zero (if jrelse() is used
19380+		 * correctly, that is). JNODE_PARSED may not be set yet, if,
19381+ * for example, we got here as a result of error handling path
19382+ * in jload(). Anyway, page cannot be detached by
19383+ * reiser4_releasepage(). truncate will invalidate page
19384+ * regardless, but this should not be a problem.
19385+ */
19386+ kunmap(page);
19387+ }
19388+ jrelse_tail(node);
19389+}
19390+
19391+/* called from jput() to wait for io completion */
19392+static void jnode_finish_io(jnode * node)
19393+{
19394+ struct page *page;
19395+
19396+ assert("nikita-2922", node != NULL);
19397+
19398+ spin_lock_jnode(node);
19399+ page = jnode_page(node);
19400+ if (page != NULL) {
19401+ page_cache_get(page);
19402+ spin_unlock_jnode(node);
19403+ wait_on_page_writeback(page);
19404+ page_cache_release(page);
19405+ } else
19406+ spin_unlock_jnode(node);
19407+}
19408+
19409+/*
19410+ * This is called by jput() when last reference to jnode is released. This is
19411+ * a separate function, because we want the fast path of jput() to be inline and,
19412+ * therefore, small.
19413+ */
19414+void jput_final(jnode * node)
19415+{
19416+ int r_i_p;
19417+
19418+ /* A fast check for keeping node in cache. We always keep node in cache
19419+ * if its page is present and node was not marked for deletion */
19420+ if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19421+ rcu_read_unlock();
19422+ return;
19423+ }
19424+ r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19425+ /*
19426+ * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19427+ * this case it is safe to access node after unlock.
19428+ */
19429+ rcu_read_unlock();
19430+ if (r_i_p) {
19431+ jnode_finish_io(node);
19432+ if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19433+ /* node is removed from the tree. */
19434+ jdelete(node);
19435+ else
19436+ jnode_try_drop(node);
19437+ }
19438+ /* if !r_i_p some other thread is already killing it */
19439+}
19440+
19441+int jwait_io(jnode * node, int rw)
19442+{
19443+ struct page *page;
19444+ int result;
19445+
19446+ assert("zam-447", node != NULL);
19447+ assert("zam-448", jnode_page(node) != NULL);
19448+
19449+ page = jnode_page(node);
19450+
19451+ result = 0;
19452+ if (rw == READ) {
19453+ wait_on_page_locked(page);
19454+ } else {
19455+ assert("nikita-2227", rw == WRITE);
19456+ wait_on_page_writeback(page);
19457+ }
19458+ if (PageError(page))
19459+ result = RETERR(-EIO);
19460+
19461+ return result;
19462+}
19463+
19464+/*
19465+ * jnode types and plugins.
19466+ *
19467+ * jnode by itself is a "base type". There are several different jnode
19468+ * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19469+ * has to do different things based on jnode type. In the standard reiser4 way
19470+ * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19471+ *
19472+ * Functions below deal with jnode types and define methods of jnode plugin.
19473+ *
19474+ */
19475+
19476+/* set jnode type. This is done during jnode initialization. */
19477+static void jnode_set_type(jnode * node, jnode_type type)
19478+{
19479+ static unsigned long type_to_mask[] = {
19480+ [JNODE_UNFORMATTED_BLOCK] = 1,
19481+ [JNODE_FORMATTED_BLOCK] = 0,
19482+ [JNODE_BITMAP] = 2,
19483+ [JNODE_IO_HEAD] = 6,
19484+ [JNODE_INODE] = 4
19485+ };
19486+
19487+ assert("zam-647", type < LAST_JNODE_TYPE);
19488+ assert("nikita-2815", !jnode_is_loaded(node));
19489+ assert("nikita-3386", node->state == 0);
19490+
19491+ node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19492+}
19493+
19494+/* ->init() method of jnode plugin for jnodes that don't require plugin
19495+ * specific initialization. */
19496+static int init_noinit(jnode * node UNUSED_ARG)
19497+{
19498+ return 0;
19499+}
19500+
19501+/* ->parse() method of jnode plugin for jnodes that don't require plugin
19502+ * specific parsing. */
19503+static int parse_noparse(jnode * node UNUSED_ARG)
19504+{
19505+ return 0;
19506+}
19507+
19508+/* ->mapping() method for unformatted jnode */
19509+struct address_space *mapping_jnode(const jnode * node)
19510+{
19511+ struct address_space *map;
19512+
19513+ assert("nikita-2713", node != NULL);
19514+
19515+ /* mapping is stored in jnode */
19516+
19517+ map = node->key.j.mapping;
19518+ assert("nikita-2714", map != NULL);
19519+ assert("nikita-2897", is_reiser4_inode(map->host));
19520+ assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
19521+ return map;
19522+}
19523+
19524+/* ->index() method for unformatted jnodes */
19525+unsigned long index_jnode(const jnode * node)
19526+{
19527+ /* index is stored in jnode */
19528+ return node->key.j.index;
19529+}
19530+
19531+/* ->remove() method for unformatted jnodes */
19532+static inline void remove_jnode(jnode * node, reiser4_tree * tree)
19533+{
19534+ /* remove jnode from hash table and radix tree */
19535+ if (node->key.j.mapping)
19536+ unhash_unformatted_node_nolock(node);
19537+}
19538+
19539+/* ->mapping() method for znodes */
19540+static struct address_space *mapping_znode(const jnode * node)
19541+{
19542+ /* all znodes belong to fake inode */
19543+ return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping;
19544+}
19545+
19546+/* ->index() method for znodes */
19547+static unsigned long index_znode(const jnode * node)
19548+{
19549+ unsigned long addr;
19550+ assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
19551+
19552+ /* index of znode is just its address (shifted) */
19553+ addr = (unsigned long)node;
19554+ return (addr - PAGE_OFFSET) >> znode_shift_order;
19555+}
19556+
19557+/* ->mapping() method for bitmap jnode */
19558+static struct address_space *mapping_bitmap(const jnode * node)
19559+{
19560+ /* all bitmap blocks belong to special bitmap inode */
19561+ return get_super_private(jnode_get_tree(node)->super)->bitmap->
19562+ i_mapping;
19563+}
19564+
19565+/* ->index() method for jnodes that are indexed by address */
19566+static unsigned long index_is_address(const jnode * node)
19567+{
19568+ unsigned long ind;
19569+
19570+ ind = (unsigned long)node;
19571+ return ind - PAGE_OFFSET;
19572+}
19573+
19574+/* resolve race with jput */
19575+jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
19576+{
19577+ /*
19578+ * This is used as part of RCU-based jnode handling.
19579+ *
19580+ * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
19581+ * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
19582+ * not protected during this, so concurrent thread may execute
19583+	 * zget-set-HEARD_BANSHEE-zput, or otherwise cause jnode to be
19584+ * freed in jput_final(). To avoid such races, jput_final() sets
19585+ * JNODE_RIP on jnode (under tree lock). All places that work with
19586+ * unreferenced jnodes call this function. It checks for JNODE_RIP bit
19587+	 * (first without taking tree lock), and if this bit is set, releases the
19588+ * reference acquired by the current thread and returns NULL.
19589+ *
19590+ * As a result, if jnode is being concurrently freed, NULL is returned
19591+ * and caller should pretend that jnode wasn't found in the first
19592+ * place.
19593+ *
19594+ * Otherwise it's safe to release "rcu-read-lock" and continue with
19595+ * jnode.
19596+ */
19597+ if (unlikely(JF_ISSET(node, JNODE_RIP))) {
19598+ read_lock_tree(tree);
19599+ if (JF_ISSET(node, JNODE_RIP)) {
19600+ dec_x_ref(node);
19601+ node = NULL;
19602+ }
19603+ read_unlock_tree(tree);
19604+ }
19605+ return node;
19606+}
19607+
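+/*
+ * Caller-side sketch (illustration, not from the original patch); this is
+ * the pattern jlookup() above follows via jnode_rip_check():
+ *
+ *	rcu_read_lock();
+ *	node = j_hash_find(&tree->jhash_table, &jkey);
+ *	if (node != NULL) {
+ *		jref(node);
+ *		node = jnode_rip_sync(tree, node);
+ *	}
+ *	rcu_read_unlock();
+ */
+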
19608+reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
19609+{
19610+ struct inode *inode;
19611+ item_plugin *iplug;
19612+ loff_t off;
19613+
19614+ assert("nikita-3092", node != NULL);
19615+ assert("nikita-3093", key != NULL);
19616+ assert("nikita-3094", jnode_is_unformatted(node));
19617+
19618+ off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
19619+ inode = mapping_jnode(node)->host;
19620+
19621+ if (node->parent_item_id != 0)
19622+ iplug = item_plugin_by_id(node->parent_item_id);
19623+ else
19624+ iplug = NULL;
19625+
19626+ if (iplug != NULL && iplug->f.key_by_offset)
19627+ iplug->f.key_by_offset(inode, off, key);
19628+ else {
19629+ file_plugin *fplug;
19630+
19631+ fplug = inode_file_plugin(inode);
19632+ assert("zam-1007", fplug != NULL);
19633+ assert("zam-1008", fplug->key_by_inode != NULL);
19634+
19635+ fplug->key_by_inode(inode, off, key);
19636+ }
19637+
19638+ return key;
19639+}
19640+
19641+/* ->parse() method for formatted nodes */
19642+static int parse_znode(jnode * node)
19643+{
19644+ return zparse(JZNODE(node));
19645+}
19646+
19647+/* ->delete() method for formatted nodes */
19648+static void delete_znode(jnode * node, reiser4_tree * tree)
19649+{
19650+ znode *z;
19651+
19652+ assert_rw_write_locked(&(tree->tree_lock));
19653+ assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19654+
19655+ z = JZNODE(node);
19656+ assert("vs-899", z->c_count == 0);
19657+
19658+ /* delete znode from sibling list. */
19659+ sibling_list_remove(z);
19660+
19661+ znode_remove(z, tree);
19662+}
19663+
19664+/* ->remove() method for formatted nodes */
19665+static int remove_znode(jnode * node, reiser4_tree * tree)
19666+{
19667+ znode *z;
19668+
19669+ assert_rw_write_locked(&(tree->tree_lock));
19670+ z = JZNODE(node);
19671+
19672+ if (z->c_count == 0) {
19673+ /* detach znode from sibling list. */
19674+ sibling_list_drop(z);
19675+ /* this is called with tree spin-lock held, so call
19676+ znode_remove() directly (rather than znode_lock_remove()). */
19677+ znode_remove(z, tree);
19678+ return 0;
19679+ }
19680+ return RETERR(-EBUSY);
19681+}
19682+
19683+/* ->init() method for formatted nodes */
19684+static int init_znode(jnode * node)
19685+{
19686+ znode *z;
19687+
19688+ z = JZNODE(node);
19689+ /* call node plugin to do actual initialization */
19690+ return z->nplug->init(z);
19691+}
19692+
19693+/* ->clone() method for formatted nodes */
19694+static jnode *clone_formatted(jnode * node)
19695+{
19696+ znode *clone;
19697+
19698+ assert("vs-1430", jnode_is_znode(node));
19699+ clone = zalloc(reiser4_ctx_gfp_mask_get());
19700+ if (clone == NULL)
19701+ return ERR_PTR(RETERR(-ENOMEM));
19702+ zinit(clone, NULL, current_tree);
19703+ jnode_set_block(ZJNODE(clone), jnode_get_block(node));
19704+ /* ZJNODE(clone)->key.z is not initialized */
19705+ clone->level = JZNODE(node)->level;
19706+
19707+ return ZJNODE(clone);
19708+}
19709+
19710+/* jplug->clone for unformatted nodes */
19711+static jnode *clone_unformatted(jnode * node)
19712+{
19713+ jnode *clone;
19714+
19715+ assert("vs-1431", jnode_is_unformatted(node));
19716+ clone = jalloc();
19717+ if (clone == NULL)
19718+ return ERR_PTR(RETERR(-ENOMEM));
19719+
19720+ jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
19721+ jnode_set_block(clone, jnode_get_block(node));
19722+
19723+ return clone;
19724+
19725+}
19726+
19727+/*
19728+ * Setup jnode plugin methods for various jnode types.
19729+ */
19730+jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
19731+ [JNODE_UNFORMATTED_BLOCK] = {
19732+ .h = {
19733+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19734+ .id = JNODE_UNFORMATTED_BLOCK,
19735+ .pops = NULL,
19736+ .label = "unformatted",
19737+ .desc = "unformatted node",
19738+ .linkage = {NULL, NULL}
19739+ },
19740+ .init = init_noinit,
19741+ .parse = parse_noparse,
19742+ .mapping = mapping_jnode,
19743+ .index = index_jnode,
19744+ .clone = clone_unformatted
19745+ },
19746+ [JNODE_FORMATTED_BLOCK] = {
19747+ .h = {
19748+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19749+ .id = JNODE_FORMATTED_BLOCK,
19750+ .pops = NULL,
19751+ .label = "formatted",
19752+ .desc = "formatted tree node",
19753+ .linkage = {NULL, NULL}
19754+ },
19755+ .init = init_znode,
19756+ .parse = parse_znode,
19757+ .mapping = mapping_znode,
19758+ .index = index_znode,
19759+ .clone = clone_formatted
19760+ },
19761+ [JNODE_BITMAP] = {
19762+ .h = {
19763+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19764+ .id = JNODE_BITMAP,
19765+ .pops = NULL,
19766+ .label = "bitmap",
19767+ .desc = "bitmap node",
19768+ .linkage = {NULL, NULL}
19769+ },
19770+ .init = init_noinit,
19771+ .parse = parse_noparse,
19772+ .mapping = mapping_bitmap,
19773+ .index = index_is_address,
19774+ .clone = NULL
19775+ },
19776+ [JNODE_IO_HEAD] = {
19777+ .h = {
19778+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19779+ .id = JNODE_IO_HEAD,
19780+ .pops = NULL,
19781+ .label = "io head",
19782+ .desc = "io head",
19783+ .linkage = {NULL, NULL}
19784+ },
19785+ .init = init_noinit,
19786+ .parse = parse_noparse,
19787+ .mapping = mapping_bitmap,
19788+ .index = index_is_address,
19789+ .clone = NULL
19790+ },
19791+ [JNODE_INODE] = {
19792+ .h = {
19793+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19794+ .id = JNODE_INODE,
19795+ .pops = NULL,
19796+ .label = "inode",
19797+ .desc = "inode's builtin jnode",
19798+ .linkage = {NULL, NULL}
19799+ },
19800+ .init = NULL,
19801+ .parse = NULL,
19802+ .mapping = NULL,
19803+ .index = NULL,
19804+ .clone = NULL
19805+ }
19806+};
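
To see how this table is consumed: callers never invoke a method on a typed
node directly, they go through jnode_ops(), which maps the type bits stored
in ->state to an entry of jnode_plugins[]. A minimal dispatch sketch
(hypothetical helper, not code from this patch; the real jclone() may
differ):

/* hypothetical sketch: clone a jnode through its per-type plugin.
 * Types whose ->clone slot is NULL above (bitmap, io head, inode)
 * cannot be cloned and are rejected. */
static jnode *jclone_sketch(jnode * node)
{
	jnode_plugin *plug = jnode_ops(node);

	if (plug->clone == NULL)
		return ERR_PTR(RETERR(-EINVAL));
	return plug->clone(node); /* clone_formatted()/clone_unformatted() */
}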
19807+
19808+/*
19809+ * jnode destruction.
19810+ *
19811+ * Thread may use a jnode after it acquired a reference to it. References are
19812+ * counted in ->x_count field. Reference protects jnode from being
19813+ * recycled. This is different from protecting jnode data (that are stored in
19814+ * jnode page) from being evicted from memory. Data are protected by jload()
19815+ * and released by jrelse().
19816+ *
19817+ * If thread already possesses a reference to the jnode it can acquire another
19818+ * one through jref(). Initial reference is obtained (usually) by locating
19819+ * jnode in some indexing structure that depends on jnode type: formatted
19820+ * nodes are kept in global hash table, where they are indexed by block
19821+ * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
19822+ * table, which is indexed by oid and offset within file, and in per-inode
19823+ * radix tree.
19824+ *
19825+ * Reference to jnode is released by jput(). If last reference is released,
19826+ * jput_final() is called. This function determines whether jnode has to be
19827+ * deleted (this happens when corresponding node is removed from the file
19828+ * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
19829+ * should be just "removed" (deleted from memory).
19830+ *
19831+ * Jnode destruction is a signally delicate dance because of locking and RCU.
19832+ */
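
The protocol above distinguishes two protection levels: jref()/jput() keep
the jnode structure itself alive, while jload()/jrelse() keep its data page
in memory. A minimal usage sketch of that protocol (illustrative only,
error handling abbreviated; all functions are declared in jnode.h below):

/* illustrative sketch of the reference/data protocol described above */
static void use_jnode_sketch(jnode * node)
{
	jref(node);		/* pin the jnode itself (->x_count) */
	if (jload(node) == 0) {	/* pin and map its data (->d_count) */
		char *data = jdata(node);

		(void)data;	/* ... read or modify node contents ... */
		jrelse(node);	/* release the data reference */
	}
	jput(node);		/* drop the jnode reference; the last put
				 * ends in jput_final() under RCU */
}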
19833+
19834+/*
19835+ * Returns true if jnode cannot be removed right now. This check is called
19836+ * under tree lock. If it returns true, jnode is irrevocably committed to be
19837+ * deleted/removed.
19838+ */
19839+static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
19840+{
19841+ /* if other thread managed to acquire a reference to this jnode, don't
19842+ * free it. */
19843+ if (atomic_read(&node->x_count) > 0)
19844+ return 1;
19845+ /* also, don't free znode that has children in memory */
19846+ if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
19847+ return 1;
19848+ return 0;
19849+}
19850+
19851+/*
19852+ * this is called as part of removing jnode. Based on jnode type, call
19853+ * corresponding function that removes jnode from indices and returns it back
19854+ * to the appropriate slab (through RCU).
19855+ */
19856+static inline void
19857+jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
19858+{
19859+ switch (jtype) {
19860+ case JNODE_UNFORMATTED_BLOCK:
19861+ remove_jnode(node, tree);
19862+ break;
19863+ case JNODE_IO_HEAD:
19864+ case JNODE_BITMAP:
19865+ break;
19866+ case JNODE_INODE:
19867+ break;
19868+ case JNODE_FORMATTED_BLOCK:
19869+ remove_znode(node, tree);
19870+ break;
19871+ default:
19872+ wrong_return_value("nikita-3196", "Wrong jnode type");
19873+ }
19874+}
19875+
19876+/*
19877+ * this is called as part of deleting jnode. Based on jnode type, call
19878+ * corresponding function that removes jnode from indices and returns it back
19879+ * to the appropriate slab (through RCU).
19880+ *
19881+ * This differs from jnode_remove() only for formatted nodes---for them
19882+ * sibling list handling is different for removal and deletion.
19883+ */
19884+static inline void
19885+jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
19886+{
19887+ switch (jtype) {
19888+ case JNODE_UNFORMATTED_BLOCK:
19889+ remove_jnode(node, tree);
19890+ break;
19891+ case JNODE_IO_HEAD:
19892+ case JNODE_BITMAP:
19893+ break;
19894+ case JNODE_FORMATTED_BLOCK:
19895+ delete_znode(node, tree);
19896+ break;
19897+ case JNODE_INODE:
19898+ default:
19899+ wrong_return_value("nikita-3195", "Wrong jnode type");
19900+ }
19901+}
19902+
19903+#if REISER4_DEBUG
19904+/*
19905+ * remove jnode from the debugging list of all jnodes hanging off super-block.
19906+ */
19907+void jnode_list_remove(jnode * node)
19908+{
19909+ reiser4_super_info_data *sbinfo;
19910+
19911+ sbinfo = get_super_private(jnode_get_tree(node)->super);
19912+
19913+ spin_lock_irq(&sbinfo->all_guard);
19914+ assert("nikita-2422", !list_empty(&node->jnodes));
19915+ list_del_init(&node->jnodes);
19916+ spin_unlock_irq(&sbinfo->all_guard);
19917+}
19918+#endif
19919+
19920+/*
19921+ * this is called by jput_final() to remove jnode when last reference to it is
19922+ * released.
19923+ */
19924+static int jnode_try_drop(jnode * node)
19925+{
19926+ int result;
19927+ reiser4_tree *tree;
19928+ jnode_type jtype;
19929+
19930+ assert("nikita-2491", node != NULL);
19931+ assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
19932+
19933+ tree = jnode_get_tree(node);
19934+ jtype = jnode_get_type(node);
19935+
19936+ spin_lock_jnode(node);
19937+ write_lock_tree(tree);
19938+ /*
19939+ * if jnode has a page---leave it alone. Memory pressure will
19940+ * eventually kill page and jnode.
19941+ */
19942+ if (jnode_page(node) != NULL) {
19943+ write_unlock_tree(tree);
19944+ spin_unlock_jnode(node);
19945+ JF_CLR(node, JNODE_RIP);
19946+ return RETERR(-EBUSY);
19947+ }
19948+
19949+ /* re-check ->x_count under tree lock. */
19950+ result = jnode_is_busy(node, jtype);
19951+ if (result == 0) {
19952+ assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
19953+ assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
19954+
19955+ spin_unlock_jnode(node);
19956+ /* no page and no references---despatch him. */
19957+ jnode_remove(node, jtype, tree);
19958+ write_unlock_tree(tree);
19959+ jnode_free(node, jtype);
19960+ } else {
19961+ /* busy check failed: reference was acquired by concurrent
19962+ * thread. */
19963+ write_unlock_tree(tree);
19964+ spin_unlock_jnode(node);
19965+ JF_CLR(node, JNODE_RIP);
19966+ }
19967+ return result;
19968+}
19969+
19970+/* jdelete() -- Delete jnode from the tree and file system */
19971+static int jdelete(jnode * node /* jnode to finish with */ )
19972+{
19973+ struct page *page;
19974+ int result;
19975+ reiser4_tree *tree;
19976+ jnode_type jtype;
19977+
19978+ assert("nikita-467", node != NULL);
19979+ assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
19980+
19981+ jtype = jnode_get_type(node);
19982+
19983+ page = jnode_lock_page(node);
19984+ assert_spin_locked(&(node->guard));
19985+
19986+ tree = jnode_get_tree(node);
19987+
19988+ write_lock_tree(tree);
19989+ /* re-check ->x_count under tree lock. */
19990+ result = jnode_is_busy(node, jtype);
19991+ if (likely(!result)) {
19992+ assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19993+ assert("jmacd-511", atomic_read(&node->d_count) == 0);
19994+
19995+ /* detach page */
19996+ if (page != NULL) {
19997+ /*
19998+ * FIXME this is racy against jnode_extent_write().
19999+ */
20000+ page_clear_jnode(page, node);
20001+ }
20002+ spin_unlock_jnode(node);
20003+ /* goodbye */
20004+ jnode_delete(node, jtype, tree);
20005+ write_unlock_tree(tree);
20006+ jnode_free(node, jtype);
20007+ /* @node is no longer valid pointer */
20008+ if (page != NULL)
20009+ reiser4_drop_page(page);
20010+ } else {
20011+ /* busy check failed: reference was acquired by concurrent
20012+ * thread. */
20013+ JF_CLR(node, JNODE_RIP);
20014+ write_unlock_tree(tree);
20015+ spin_unlock_jnode(node);
20016+ if (page != NULL)
20017+ unlock_page(page);
20018+ }
20019+ return result;
20020+}
20021+
20022+/* drop jnode on the floor.
20023+
20024+ Return value:
20025+
20026+ -EBUSY: failed to drop jnode, because there are still references to it
20027+
20028+ 0: successfully dropped jnode
20029+
20030+*/
20031+static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
20032+{
20033+ struct page *page;
20034+ jnode_type jtype;
20035+ int result;
20036+
20037+ assert("zam-602", node != NULL);
20038+ assert_rw_not_read_locked(&(tree->tree_lock));
20039+ assert_rw_not_write_locked(&(tree->tree_lock));
20040+ assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20041+
20042+ jtype = jnode_get_type(node);
20043+
20044+ page = jnode_lock_page(node);
20045+ assert_spin_locked(&(node->guard));
20046+
20047+ write_lock_tree(tree);
20048+
20049+ /* re-check ->x_count under tree lock. */
20050+ result = jnode_is_busy(node, jtype);
20051+ if (!result) {
20052+ assert("nikita-2488", page == jnode_page(node));
20053+ assert("nikita-2533", atomic_read(&node->d_count) == 0);
20054+ if (page != NULL) {
20055+ assert("nikita-2126", !PageDirty(page));
20056+ assert("nikita-2127", PageUptodate(page));
20057+ assert("nikita-2181", PageLocked(page));
20058+ page_clear_jnode(page, node);
20059+ }
20060+ spin_unlock_jnode(node);
20061+ jnode_remove(node, jtype, tree);
20062+ write_unlock_tree(tree);
20063+ jnode_free(node, jtype);
20064+ if (page != NULL) {
20065+ reiser4_drop_page(page);
20066+ }
20067+ } else {
20068+ /* busy check failed: reference was acquired by concurrent
20069+ * thread. */
20070+ JF_CLR(node, JNODE_RIP);
20071+ write_unlock_tree(tree);
20072+ spin_unlock_jnode(node);
20073+ if (page != NULL)
20074+ unlock_page(page);
20075+ }
20076+ return result;
20077+}
20078+
20079+/* This function frees jnode "if possible". In particular, [dcx]_count has to
20080+ be 0 (where applicable). */
20081+void jdrop(jnode * node)
20082+{
20083+ jdrop_in_tree(node, jnode_get_tree(node));
20084+}
20085+
20086+/* IO head jnode implementation. IO heads are simple jnodes with limited
20087+ functionality (they are not kept in any hash table), used just for reading
20088+ from and writing to disk. */
20089+
20090+jnode *reiser4_alloc_io_head(const reiser4_block_nr * block)
20091+{
20092+ jnode *jal = jalloc();
20093+
20094+ if (jal != NULL) {
20095+ jnode_init(jal, current_tree, JNODE_IO_HEAD);
20096+ jnode_set_block(jal, block);
20097+ jref(jal);
20098+ }
20100+
20101+ return jal;
20102+}
20103+
20104+void reiser4_drop_io_head(jnode * node)
20105+{
20106+ assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20107+
20108+ jput(node);
20109+ jdrop(node);
20110+}
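
A usage sketch for this pair (illustrative; jload() gives bread-like
semantics per the comment in jnode.h below, and allocation failure is now
reported as a NULL return):

/* illustrative sketch: read one block through an io head */
static int read_block_sketch(const reiser4_block_nr * block)
{
	int ret;
	jnode *io = reiser4_alloc_io_head(block);

	if (io == NULL)
		return RETERR(-ENOMEM);
	ret = jload(io);	/* bring the block uptodate */
	if (ret == 0) {
		/* ... inspect jdata(io) ... */
		jrelse(io);
	}
	reiser4_drop_io_head(io);	/* jput() + jdrop() */
	return ret;
}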
20111+
20112+/* pin jnode data, protecting it from reiser4_releasepage() */
20113+void pin_jnode_data(jnode * node)
20114+{
20115+ assert("zam-671", jnode_page(node) != NULL);
20116+ page_cache_get(jnode_page(node));
20117+}
20118+
20119+/* make jnode data free-able again */
20120+void unpin_jnode_data(jnode * node)
20121+{
20122+ assert("zam-672", jnode_page(node) != NULL);
20123+ page_cache_release(jnode_page(node));
20124+}
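
The two helpers above are a plain page-reference pair; a short sketch of
the intended pairing (illustrative):

	pin_jnode_data(node);	/* page_cache_get(): page survives
				 * reiser4_releasepage() */
	/* ... sleepable work that must still find the page attached ... */
	unpin_jnode_data(node);	/* page_cache_release() */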
20125+
20126+struct address_space *jnode_get_mapping(const jnode * node)
20127+{
20128+ assert("nikita-3162", node != NULL);
20129+ return jnode_ops(node)->mapping(node);
20130+}
20131+
20132+#if REISER4_DEBUG
20133+/* debugging aid: jnode invariant */
20134+int jnode_invariant_f(const jnode * node, char const **msg)
20135+{
20136+#define _ergo(ant, con) \
20137+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20138+#define _check(exp) ((*msg) = #exp, (exp))
20139+
20140+ return _check(node != NULL) &&
20141+ /* [jnode-queued] */
20142+ /* only relocated node can be queued, except that when znode
20143+ * is being deleted, its JNODE_RELOC bit is cleared */
20144+ _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20145+ JF_ISSET(node, JNODE_RELOC) ||
20146+ JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20147+ _check(node->jnodes.prev != NULL) &&
20148+ _check(node->jnodes.next != NULL) &&
20149+ /* [jnode-dirty] invariant */
20150+ /* dirty jnode is part of an atom */
20151+ _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20152+ /* [jnode-oid] invariant */
20153+ /* for unformatted node ->objectid and ->mapping fields are
20154+ * consistent */
20155+ _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20156+ node->key.j.objectid ==
20157+ get_inode_oid(node->key.j.mapping->host)) &&
20158+ /* [jnode-atom-valid] invariant */
20159+ /* node atom has valid state */
20160+ _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20161+ /* [jnode-page-binding] invariant */
20162+ /* if node points to page, it points back to node */
20163+ _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20164+ /* [jnode-refs] invariant */
20165+ /* only referenced jnode can be loaded */
20166+ _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20167+
20168+}
20169+
20170+static const char *jnode_type_name(jnode_type type)
20171+{
20172+ switch (type) {
20173+ case JNODE_UNFORMATTED_BLOCK:
20174+ return "unformatted";
20175+ case JNODE_FORMATTED_BLOCK:
20176+ return "formatted";
20177+ case JNODE_BITMAP:
20178+ return "bitmap";
20179+ case JNODE_IO_HEAD:
20180+ return "io head";
20181+ case JNODE_INODE:
20182+ return "inode";
20183+ case LAST_JNODE_TYPE:
20184+ return "last";
20185+ default:{
20186+ static char unknown[30];
20187+
20188+ sprintf(unknown, "unknown %i", type);
20189+ return unknown;
20190+ }
20191+ }
20192+}
20193+
20194+#define jnode_state_name( node, flag ) \
20195+ ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" )
20196+
20197+/* debugging aid: output human readable information about @node */
20198+static void info_jnode(const char *prefix /* prefix to print */ ,
20199+ const jnode * node /* node to print */ )
20200+{
20201+ assert("umka-068", prefix != NULL);
20202+
20203+ if (node == NULL) {
20204+ printk("%s: null\n", prefix);
20205+ return;
20206+ }
20207+
20208+ printk
20209+ ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20210+ " block: %s, d_count: %d, x_count: %d, "
20211+ "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20212+ node->state,
20213+ jnode_state_name(node, JNODE_PARSED),
20214+ jnode_state_name(node, JNODE_HEARD_BANSHEE),
20215+ jnode_state_name(node, JNODE_LEFT_CONNECTED),
20216+ jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20217+ jnode_state_name(node, JNODE_ORPHAN),
20218+ jnode_state_name(node, JNODE_CREATED),
20219+ jnode_state_name(node, JNODE_RELOC),
20220+ jnode_state_name(node, JNODE_OVRWR),
20221+ jnode_state_name(node, JNODE_DIRTY),
20222+ jnode_state_name(node, JNODE_IS_DYING),
20223+ jnode_state_name(node, JNODE_RIP),
20224+ jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20225+ jnode_state_name(node, JNODE_WRITEBACK),
20226+ jnode_state_name(node, JNODE_NEW),
20227+ jnode_state_name(node, JNODE_DKSET),
20228+ jnode_state_name(node, JNODE_REPACK),
20229+ jnode_state_name(node, JNODE_CLUSTER_PAGE),
20230+ jnode_get_level(node), sprint_address(jnode_get_block(node)),
20231+ atomic_read(&node->d_count), atomic_read(&node->x_count),
20232+ jnode_page(node), node->atom, 0, 0,
20233+ jnode_type_name(jnode_get_type(node)));
20234+ if (jnode_is_unformatted(node)) {
20235+ printk("inode: %llu, index: %lu, ",
20236+ node->key.j.objectid, node->key.j.index);
20237+ }
20238+}
20239+
20240+/* debugging aid: check znode invariant and panic if it doesn't hold */
20241+static int jnode_invariant(const jnode * node, int tlocked, int jlocked)
20242+{
20243+ char const *failed_msg;
20244+ int result;
20245+ reiser4_tree *tree;
20246+
20247+ tree = jnode_get_tree(node);
20248+
20249+ assert("umka-063312", node != NULL);
20250+ assert("umka-064321", tree != NULL);
20251+
20252+ if (!jlocked && !tlocked)
20253+ spin_lock_jnode((jnode *) node);
20254+ if (!tlocked)
20255+ read_lock_tree(jnode_get_tree(node));
20256+ result = jnode_invariant_f(node, &failed_msg);
20257+ if (!result) {
20258+ info_jnode("corrupted node", node);
20259+ warning("jmacd-555", "Condition %s failed", failed_msg);
20260+ }
20261+ if (!tlocked)
20262+ read_unlock_tree(jnode_get_tree(node));
20263+ if (!jlocked && !tlocked)
20264+ spin_unlock_jnode((jnode *) node);
20265+ return result;
20266+}
20267+
20268+#endif /* REISER4_DEBUG */
20269+
20270+/* Make Linus happy.
20271+ Local variables:
20272+ c-indentation-style: "K&R"
20273+ mode-name: "LC"
20274+ c-basic-offset: 8
20275+ tab-width: 8
20276+ fill-column: 80
20277+ End:
20278+*/
20279diff -urN linux-2.6.23.orig/fs/reiser4/jnode.h linux-2.6.23/fs/reiser4/jnode.h
20280--- linux-2.6.23.orig/fs/reiser4/jnode.h 1970-01-01 03:00:00.000000000 +0300
20281+++ linux-2.6.23/fs/reiser4/jnode.h 2007-12-04 16:49:30.000000000 +0300
20282@@ -0,0 +1,702 @@
20283+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20284+ * reiser4/README */
20285+
20286+/* Declaration of jnode. See jnode.c for details. */
20287+
20288+#ifndef __JNODE_H__
20289+#define __JNODE_H__
20290+
20291+#include "forward.h"
20292+#include "type_safe_hash.h"
20293+#include "txnmgr.h"
20294+#include "key.h"
20295+#include "debug.h"
20296+#include "dformat.h"
20297+#include "page_cache.h"
20298+#include "context.h"
20299+
20300+#include "plugin/plugin.h"
20301+
20302+#include <linux/fs.h>
20303+#include <linux/mm.h>
20304+#include <linux/spinlock.h>
20305+#include <asm/atomic.h>
20306+#include <linux/bitops.h>
20307+#include <linux/list.h>
20308+#include <linux/rcupdate.h>
20309+
20310+/* declare hash table of jnodes (jnodes proper, that is, unformatted
20311+ nodes) */
20312+TYPE_SAFE_HASH_DECLARE(j, jnode);
20313+
20314+/* declare hash table of znodes */
20315+TYPE_SAFE_HASH_DECLARE(z, znode);
20316+
20317+struct jnode_key {
20318+ __u64 objectid;
20319+ unsigned long index;
20320+ struct address_space *mapping;
20321+};
20322+
20323+/*
20324+ Jnode is the "base class" of other nodes in reiser4. It also happens to
20325+ be exactly the node we use for unformatted tree nodes.
20326+
20327+ Jnode provides the following basic functionality:
20328+
20329+ . reference counting and indexing.
20330+
20331+ . integration with the page cache. Jnode has a ->pg field to which a page
20332+ can be attached.
20333+
20334+ . interface to transaction manager. It is jnode that is kept in transaction
20335+ manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20336+ means, there should be special type of jnode for inode.)
20337+
20338+ Locking:
20339+
20340+ Spin lock: the following fields are protected by the per-jnode spin lock:
20341+
20342+ ->state
20343+ ->atom
20344+ ->capture_link
20345+
20346+ Following fields are protected by the global tree lock:
20347+
20348+ ->link
20349+ ->key.z (content of ->key.z is only changed in znode_rehash())
20350+ ->key.j
20351+
20352+ Atomic counters
20353+
20354+ ->x_count
20355+ ->d_count
20356+
20357+ ->pg, and ->data are protected by spin lock for unused jnode and are
20358+ immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20359+ is false).
20360+
20361+ ->tree is immutable after creation
20362+
20363+ Unclear
20364+
20365+ ->blocknr: should be under jnode spin-lock, but current interface is based
20366+ on passing of block address.
20367+
20368+ If you ever need to spin lock two nodes at once, do this in "natural"
20369+ memory order: lock znode with lower address first. (See lock_two_nodes().)
20370+
20371+ Invariants involving this data-type:
20372+
20373+ [jnode-dirty]
20374+ [jnode-refs]
20375+ [jnode-oid]
20376+ [jnode-queued]
20377+ [jnode-atom-valid]
20378+ [jnode-page-binding]
20379+*/
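
A sketch of the two-node locking rule mentioned above (hypothetical helper;
the actual lock_two_nodes() lives elsewhere in this patch and may differ in
detail):

/* hypothetical sketch of the "natural memory order" rule */
static void lock_two_nodes_sketch(jnode * a, jnode * b)
{
	if (a < b) {
		spin_lock_jnode(a);
		spin_lock_jnode(b);
	} else {
		spin_lock_jnode(b);
		spin_lock_jnode(a);
	}
}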
20380+
20381+struct jnode {
20382+#if REISER4_DEBUG
20383+#define JMAGIC 0x52654973 /* "ReIs" */
20384+ int magic;
20385+#endif
20386+ /* FIRST CACHE LINE (16 bytes): data used by jload */
20387+
20388+ /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20389+ /* 0 */ unsigned long state;
20390+
20391+ /* lock, protecting jnode's fields. */
20392+ /* 4 */ spinlock_t load;
20393+
20394+ /* counter of references to jnode itself. Increased on jref().
20395+ Decreased on jput().
20396+ */
20397+ /* 8 */ atomic_t x_count;
20398+
20399+ /* counter of references to jnode's data. Pin data page(s) in
20400+ memory while this is greater than 0. Increased on jload().
20401+ Decreased on jrelse().
20402+ */
20403+ /* 12 */ atomic_t d_count;
20404+
20405+ /* SECOND CACHE LINE: data used by hash table lookups */
20406+
20407+ /* 16 */ union {
20408+ /* znodes are hashed by block number */
20409+ reiser4_block_nr z;
20410+ /* unformatted nodes are hashed by mapping plus offset */
20411+ struct jnode_key j;
20412+ } key;
20413+
20414+ /* THIRD CACHE LINE */
20415+
20416+ /* 32 */ union {
20417+ /* pointers to maintain hash-table */
20418+ z_hash_link z;
20419+ j_hash_link j;
20420+ } link;
20421+
20422+ /* pointer to jnode page. */
20423+ /* 36 */ struct page *pg;
20424+ /* pointer to node itself. This is page_address(node->pg) when page is
20425+ attached to the jnode
20426+ */
20427+ /* 40 */ void *data;
20428+
20429+ /* 44 */ reiser4_tree *tree;
20430+
20431+ /* FOURTH CACHE LINE: atom related fields */
20432+
20433+ /* 48 */ spinlock_t guard;
20434+
20435+ /* atom the block is in, if any */
20436+ /* 52 */ txn_atom *atom;
20437+
20438+ /* capture list */
20439+ /* 56 */ struct list_head capture_link;
20440+
20441+ /* FIFTH CACHE LINE */
20442+
20443+ /* 64 */ struct rcu_head rcu;
20444+ /* crosses cache line */
20445+
20446+ /* SIXTH CACHE LINE */
20447+
20448+ /* the real blocknr (where io is going to/from) */
20449+ /* 80 */ reiser4_block_nr blocknr;
20450+ /* Parent item type; unformatted and CRC nodes need it for offset => key conversion. */
20451+ /* NOTE: this parent_item_id looks like jnode type. */
20452+ /* 88 */ reiser4_plugin_id parent_item_id;
20453+ /* 92 */
20454+#if REISER4_DEBUG
20455+ /* list of all jnodes for debugging purposes. */
20456+ struct list_head jnodes;
20457+ /* how many times this jnode was written in one transaction */
20458+ int written;
20459+ /* this indicates which atom's list the jnode is on */
20460+ atom_list list;
20461+#endif
20462+} __attribute__ ((aligned(16)));
20463+
20464+/*
20465+ * jnode types. Enumeration of existing jnode types.
20466+ */
20467+typedef enum {
20468+ JNODE_UNFORMATTED_BLOCK, /* unformatted block */
20469+ JNODE_FORMATTED_BLOCK, /* formatted block, znode */
20470+ JNODE_BITMAP, /* bitmap */
20471+ JNODE_IO_HEAD, /* jnode representing a block in the
20472+ * wandering log */
20473+ JNODE_INODE, /* jnode embedded into inode */
20474+ LAST_JNODE_TYPE
20475+} jnode_type;
20476+
20477+/* jnode states */
20478+typedef enum {
20479+ /* jnode's page is loaded and data checked */
20480+ JNODE_PARSED = 0,
20481+ /* node was deleted, not all locks on it were released. This
20482+ node is empty and is going to be removed from the tree
20483+ shortly. */
20484+ JNODE_HEARD_BANSHEE = 1,
20485+ /* left sibling pointer is valid */
20486+ JNODE_LEFT_CONNECTED = 2,
20487+ /* right sibling pointer is valid */
20488+ JNODE_RIGHT_CONNECTED = 3,
20489+
20490+ /* znode was just created and doesn't yet have a pointer from
20491+ its parent */
20492+ JNODE_ORPHAN = 4,
20493+
20494+ /* this node was created by its transaction and has not been assigned
20495+ a block address. */
20496+ JNODE_CREATED = 5,
20497+
20498+ /* this node is currently relocated */
20499+ JNODE_RELOC = 6,
20500+ /* this node is currently wandered */
20501+ JNODE_OVRWR = 7,
20502+
20503+ /* this znode has been modified */
20504+ JNODE_DIRTY = 8,
20505+
20506+ /* znode lock is being invalidated */
20507+ JNODE_IS_DYING = 9,
20508+
20509+ /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
20510+
20511+ /* jnode is queued for flushing. */
20512+ JNODE_FLUSH_QUEUED = 12,
20513+
20514+ /* In the following bits jnode type is encoded. */
20515+ JNODE_TYPE_1 = 13,
20516+ JNODE_TYPE_2 = 14,
20517+ JNODE_TYPE_3 = 15,
20518+
20519+ /* jnode is being destroyed */
20520+ JNODE_RIP = 16,
20521+
20522+ /* znode was not captured during locking (this might be so because
20523+ ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
20524+ JNODE_MISSED_IN_CAPTURE = 17,
20525+
20526+ /* write is in progress */
20527+ JNODE_WRITEBACK = 18,
20528+
20529+ /* FIXME: now it is used by crypto-compress plugin only */
20530+ JNODE_NEW = 19,
20531+
20532+ /* delimiting keys are already set for this znode. */
20533+ JNODE_DKSET = 20,
20534+
20535+ /* when this bit is set, page and jnode cannot be disconnected */
20536+ JNODE_WRITE_PREPARED = 21,
20537+
20538+ JNODE_CLUSTER_PAGE = 22,
20539+ /* Jnode is marked for repacking; this means the reiser4 flush and the
20540+ * block allocator should process this node in a special way */
20541+ JNODE_REPACK = 23,
20542+ /* node should be converted by flush in squalloc phase */
20543+ JNODE_CONVERTIBLE = 24,
20544+ /*
20545+ * When jnode is dirtied for the first time in given transaction,
20546+ * do_jnode_make_dirty() checks whether this jnode can possibly become a
20547+ * member of the overwrite set. If so, this bit is set, and one block is
20548+ * reserved in the ->flush_reserved space of atom.
20549+ *
20550+ * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
20551+ *
20552+ * (1) flush decides that we want this block to go into relocate
20553+ * set after all.
20554+ *
20555+ * (2) wandering log is allocated (by log writer)
20556+ *
20557+ * (3) extent is allocated
20558+ *
20559+ */
20560+ JNODE_FLUSH_RESERVED = 29
20561+} reiser4_jnode_state;
20562+
20563+/* Macros for accessing the jnode state. */
20564+
20565+static inline void JF_CLR(jnode * j, int f)
20566+{
20567+ assert("unknown-1", j->magic == JMAGIC);
20568+ clear_bit(f, &j->state);
20569+}
20570+static inline int JF_ISSET(const jnode * j, int f)
20571+{
20572+ assert("unknown-2", j->magic == JMAGIC);
20573+ return test_bit(f, &((jnode *) j)->state);
20574+}
20575+static inline void JF_SET(jnode * j, int f)
20576+{
20577+ assert("unknown-3", j->magic == JMAGIC);
20578+ set_bit(f, &j->state);
20579+}
20580+
20581+static inline int JF_TEST_AND_SET(jnode * j, int f)
20582+{
20583+ assert("unknown-4", j->magic == JMAGIC);
20584+ return test_and_set_bit(f, &j->state);
20585+}
20586+
20587+static inline void spin_lock_jnode(jnode *node)
20588+{
20589+ /* check that spinlocks of lower priorities are not held */
20590+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
20591+ LOCK_CNT_NIL(spin_locked_txnh) &&
20592+ LOCK_CNT_NIL(spin_locked_zlock) &&
20593+ LOCK_CNT_NIL(rw_locked_dk) &&
20594+ LOCK_CNT_LT(spin_locked_jnode, 2)));
20595+
20596+ spin_lock(&(node->guard));
20597+
20598+ LOCK_CNT_INC(spin_locked_jnode);
20599+ LOCK_CNT_INC(spin_locked);
20600+}
20601+
20602+static inline void spin_unlock_jnode(jnode *node)
20603+{
20604+ assert_spin_locked(&(node->guard));
20605+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
20606+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
20607+
20608+ LOCK_CNT_DEC(spin_locked_jnode);
20609+ LOCK_CNT_DEC(spin_locked);
20610+
20611+ spin_unlock(&(node->guard));
20612+}
20613+
20614+static inline int jnode_is_in_deleteset(const jnode * node)
20615+{
20616+ return JF_ISSET(node, JNODE_RELOC);
20617+}
20618+
20619+extern int init_jnodes(void);
20620+extern void done_jnodes(void);
20621+
20622+/* Jnode routines */
20623+extern jnode *jalloc(void);
20624+extern void jfree(jnode * node) NONNULL;
20625+extern jnode *jclone(jnode *);
20626+extern jnode *jlookup(reiser4_tree * tree,
20627+ oid_t objectid, unsigned long ind) NONNULL;
20628+extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
20629+extern jnode *jnode_by_page(struct page *pg) NONNULL;
20630+extern jnode *jnode_of_page(struct page *pg) NONNULL;
20631+void jnode_attach_page(jnode * node, struct page *pg);
20632+
20633+void unhash_unformatted_jnode(jnode *);
20634+extern jnode *page_next_jnode(jnode * node) NONNULL;
20635+extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
20636+extern void jnode_make_dirty(jnode * node) NONNULL;
20637+extern void jnode_make_clean(jnode * node) NONNULL;
20638+extern void jnode_make_wander_nolock(jnode * node) NONNULL;
20639+extern void jnode_make_wander(jnode *) NONNULL;
20640+extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL;
20641+extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
20642+extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
20643+
20644+/**
20645+ * jnode_get_block
20646+ * @node: jnode to query
20647+ *
20648+ */
20649+static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
20650+{
20651+ assert("nikita-528", node != NULL);
20652+
20653+ return &node->blocknr;
20654+}
20655+
20656+/**
20657+ * jnode_set_block
20658+ * @node: jnode to update
20659+ * @blocknr: new block nr
20660+ */
20661+static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
20662+{
20663+ assert("nikita-2020", node != NULL);
20664+ assert("umka-055", blocknr != NULL);
20665+ node->blocknr = *blocknr;
20666+}
20667+
20668+
20669+/* block number for IO. Usually this is the same as jnode_get_block(), unless
20670+ * jnode was emergency flushed---then block number chosen by eflush is
20671+ * used. */
20672+static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
20673+{
20674+ assert("nikita-2768", node != NULL);
20675+ assert_spin_locked(&(node->guard));
20676+
20677+ return jnode_get_block(node);
20678+}
20679+
20680+/* Jnode flush interface. */
20681+extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos);
20682+extern flush_queue_t *reiser4_pos_fq(flush_pos_t * pos);
20683+
20684+/* FIXME-VS: these are used in plugin/item/extent.c */
20685+
20686+/* does extent_get_block have to be called */
20687+#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED)
20688+#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
20689+
20690+/* the node should be converted during flush squalloc phase */
20691+#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE)
20692+#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE)
20693+
20694+/* Macros to convert from jnode to znode, znode to jnode. These are macros
20695+ because C doesn't allow overloading of const prototypes. */
20696+#define ZJNODE(x) (& (x) -> zjnode)
20697+#define JZNODE(x) \
20698+({ \
20699+ typeof (x) __tmp_x; \
20700+ \
20701+ __tmp_x = (x); \
20702+ assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \
20703+ (znode*) __tmp_x; \
20704+})
20705+
20706+extern int jnodes_tree_init(reiser4_tree * tree);
20707+extern int jnodes_tree_done(reiser4_tree * tree);
20708+
20709+#if REISER4_DEBUG
20710+
20711+extern int znode_is_any_locked(const znode * node);
20712+extern void jnode_list_remove(jnode * node);
20713+
20714+#else
20715+
20716+#define jnode_list_remove(node) noop
20717+
20718+#endif
20719+
20720+int znode_is_root(const znode * node) NONNULL;
20721+
20722+/* bump reference counter on @node */
20723+static inline void add_x_ref(jnode * node /* node to increase x_count of */ )
20724+{
20725+ assert("nikita-1911", node != NULL);
20726+
20727+ atomic_inc(&node->x_count);
20728+ LOCK_CNT_INC(x_refs);
20729+}
20730+
20731+static inline void dec_x_ref(jnode * node)
20732+{
20733+ assert("nikita-3215", node != NULL);
20734+ assert("nikita-3216", atomic_read(&node->x_count) > 0);
20735+
20736+ atomic_dec(&node->x_count);
20737+ assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
20738+ LOCK_CNT_DEC(x_refs);
20739+}
20740+
20741+/* jref() - increase counter of references to jnode/znode (x_count) */
20742+static inline jnode *jref(jnode * node)
20743+{
20744+ assert("jmacd-508", (node != NULL) && !IS_ERR(node));
20745+ add_x_ref(node);
20746+ return node;
20747+}
20748+
20749+/* get the page of jnode */
20750+static inline struct page *jnode_page(const jnode * node)
20751+{
20752+ return node->pg;
20753+}
20754+
20755+/* return pointer to jnode data */
20756+static inline char *jdata(const jnode * node)
20757+{
20758+ assert("nikita-1415", node != NULL);
20759+ assert("nikita-3198", jnode_page(node) != NULL);
20760+ return node->data;
20761+}
20762+
20763+static inline int jnode_is_loaded(const jnode * node)
20764+{
20765+ assert("zam-506", node != NULL);
20766+ return atomic_read(&node->d_count) > 0;
20767+}
20768+
20769+extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
20770+
20771+static inline void jnode_set_reloc(jnode * node)
20772+{
20773+ assert("nikita-2431", node != NULL);
20774+ assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
20775+ JF_SET(node, JNODE_RELOC);
20776+}
20777+
20778+/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
20779+
20780+extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
20781+
20782+static inline int jload(jnode *node)
20783+{
20784+ return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1);
20785+}
20786+
20787+extern int jinit_new(jnode *, gfp_t) NONNULL;
20788+extern int jstartio(jnode *) NONNULL;
20789+
20790+extern void jdrop(jnode *) NONNULL;
20791+extern int jwait_io(jnode *, int rw) NONNULL;
20792+
20793+void jload_prefetch(jnode *);
20794+
20795+extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL;
20796+extern void reiser4_drop_io_head(jnode * node) NONNULL;
20797+
20798+static inline reiser4_tree *jnode_get_tree(const jnode * node)
20799+{
20800+ assert("nikita-2691", node != NULL);
20801+ return node->tree;
20802+}
20803+
20804+extern void pin_jnode_data(jnode *);
20805+extern void unpin_jnode_data(jnode *);
20806+
20807+static inline jnode_type jnode_get_type(const jnode * node)
20808+{
20809+ static const unsigned long state_mask =
20810+ (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
20811+
20812+ static jnode_type mask_to_type[] = {
20813+ /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
20814+
20815+ /* 000 */
20816+ [0] = JNODE_FORMATTED_BLOCK,
20817+ /* 001 */
20818+ [1] = JNODE_UNFORMATTED_BLOCK,
20819+ /* 010 */
20820+ [2] = JNODE_BITMAP,
20821+ /* 011 */
20822+ [3] = LAST_JNODE_TYPE, /*invalid */
20823+ /* 100 */
20824+ [4] = JNODE_INODE,
20825+ /* 101 */
20826+ [5] = LAST_JNODE_TYPE,
20827+ /* 110 */
20828+ [6] = JNODE_IO_HEAD,
20829+ /* 111 */
20830+ [7] = LAST_JNODE_TYPE, /* invalid */
20831+ };
20832+
20833+ return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
20834+}
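
Reading the table in the opposite direction gives the bit pattern that must
have been stored at JNODE_TYPE_1..JNODE_TYPE_3 when the node was set up (a
reconstruction from mask_to_type[] above, not code from this patch):

/* reconstructed inverse of mask_to_type[] */
static const int type_to_mask[LAST_JNODE_TYPE] = {
	[JNODE_FORMATTED_BLOCK] = 0,	/* 000 */
	[JNODE_UNFORMATTED_BLOCK] = 1,	/* 001 */
	[JNODE_BITMAP] = 2,		/* 010 */
	[JNODE_INODE] = 4,		/* 100 */
	[JNODE_IO_HEAD] = 6		/* 110 */
};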
20835+
20836+/* returns true if node is a znode */
20837+static inline int jnode_is_znode(const jnode * node)
20838+{
20839+ return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
20840+}
20841+
20842+static inline int jnode_is_flushprepped(jnode * node)
20843+{
20844+ assert("jmacd-78212", node != NULL);
20845+ assert_spin_locked(&(node->guard));
20846+ return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
20847+ JF_ISSET(node, JNODE_OVRWR);
20848+}
20849+
20850+/* Return true if @node has already been processed by the squeeze and allocate
20851+ process. This implies the block address has been finalized for the
20852+ duration of this atom (or it is clean and will remain in place). If this
20853+ returns true you may use the block number as a hint. */
20854+static inline int jnode_check_flushprepped(jnode * node)
20855+{
20856+ int result;
20857+
20858+ /* It must be clean or relocated or wandered. New allocations are set to relocate. */
20859+ spin_lock_jnode(node);
20860+ result = jnode_is_flushprepped(node);
20861+ spin_unlock_jnode(node);
20862+ return result;
20863+}
20864+
20865+/* returns true if node is unformatted */
20866+static inline int jnode_is_unformatted(const jnode * node)
20867+{
20868+ assert("jmacd-0123", node != NULL);
20869+ return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
20870+}
20871+
20872+/* returns true if node represents a cluster cache page */
20873+static inline int jnode_is_cluster_page(const jnode * node)
20874+{
20875+ assert("edward-50", node != NULL);
20876+ return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
20877+}
20878+
20879+/* returns true if node is an inode's builtin jnode */
20880+static inline int jnode_is_inode(const jnode * node)
20881+{
20882+ assert("vs-1240", node != NULL);
20883+ return jnode_get_type(node) == JNODE_INODE;
20884+}
20885+
20886+static inline jnode_plugin *jnode_ops_of(const jnode_type type)
20887+{
20888+ assert("nikita-2367", type < LAST_JNODE_TYPE);
20889+ return jnode_plugin_by_id((reiser4_plugin_id) type);
20890+}
20891+
20892+static inline jnode_plugin *jnode_ops(const jnode * node)
20893+{
20894+ assert("nikita-2366", node != NULL);
20895+
20896+ return jnode_ops_of(jnode_get_type(node));
20897+}
20898+
20899+/* Get the index of a block. */
20900+static inline unsigned long jnode_get_index(jnode * node)
20901+{
20902+ return jnode_ops(node)->index(node);
20903+}
20904+
20905+/* return true if "node" is the root */
20906+static inline int jnode_is_root(const jnode * node)
20907+{
20908+ return jnode_is_znode(node) && znode_is_root(JZNODE(node));
20909+}
20910+
20911+extern struct address_space *mapping_jnode(const jnode * node);
20912+extern unsigned long index_jnode(const jnode * node);
20913+
20914+static inline void jput(jnode * node);
20915+extern void jput_final(jnode * node);
20916+
20917+/* bump data counter on @node */
20918+static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
20919+{
20920+ assert("nikita-1962", node != NULL);
20921+
20922+ atomic_inc(&node->d_count);
20923+ if (jnode_is_unformatted(node) || jnode_is_znode(node))
20924+ LOCK_CNT_INC(d_refs);
20925+}
20926+
20927+/* jput() - decrement x_count reference counter on jnode/znode.
20928+
20929+ Count may drop to 0, jnode stays in cache until memory pressure causes the
20930+ eviction of its page. The c_count variable also ensures that children are
20931+ pressured out of memory before the parent. The jnode remains hashed as
20932+ long as the VM allows its page to stay in memory.
20933+*/
20934+static inline void jput(jnode * node)
20935+{
20936+ assert("jmacd-509", node != NULL);
20937+ assert("jmacd-510", atomic_read(&node->x_count) > 0);
20938+ assert("zam-926", reiser4_schedulable());
20939+ LOCK_CNT_DEC(x_refs);
20940+
20941+ rcu_read_lock();
20942+ /*
20943+ * we don't need any kind of lock here--jput_final() uses RCU.
20944+ */
20945+ if (unlikely(atomic_dec_and_test(&node->x_count))) {
20946+ jput_final(node);
20947+ } else
20948+ rcu_read_unlock();
20949+ assert("nikita-3473", reiser4_schedulable());
20950+}
20951+
20952+extern void jrelse(jnode * node);
20953+extern void jrelse_tail(jnode * node);
20954+
20955+extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
20956+
20957+/* resolve race with jput */
20958+static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
20959+{
20960+ if (unlikely(JF_ISSET(node, JNODE_RIP)))
20961+ node = jnode_rip_sync(tree, node);
20962+ return node;
20963+}
20964+
20965+extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
20966+
20967+#if REISER4_DEBUG
20968+extern int jnode_invariant_f(const jnode *node, char const **msg);
20969+#endif
20970+
20971+extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
20972+
20973+/* __JNODE_H__ */
20974+#endif
20975+
20976+/* Make Linus happy.
20977+ Local variables:
20978+ c-indentation-style: "K&R"
20979+ mode-name: "LC"
20980+ c-basic-offset: 8
20981+ tab-width: 8
20982+ fill-column: 120
20983+ End:
20984+*/
20985diff -urN linux-2.6.23.orig/fs/reiser4/kassign.c linux-2.6.23/fs/reiser4/kassign.c
20986--- linux-2.6.23.orig/fs/reiser4/kassign.c 1970-01-01 03:00:00.000000000 +0300
20987+++ linux-2.6.23/fs/reiser4/kassign.c 2007-12-04 22:59:05.790367863 +0300
20988@@ -0,0 +1,661 @@
20989+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20990+ * reiser4/README */
20991+
20992+/* Key assignment policy implementation */
20993+
20994+/*
20995+ * In reiser4 every piece of file system data and meta-data has a key. Keys
20996+ * are used to store information in and retrieve it from reiser4 internal
20997+ * tree. In addition to this, keys define _ordering_ of all file system
20998+ * information: things having close keys are placed into the same or
20999+ * neighboring (in the tree order) nodes of the tree. As our block allocator
21000+ * tries to respect tree order (see flush.c), keys also define order in which
21001+ * things are laid out on the disk, and hence, affect performance directly.
21002+ *
21003+ * Obviously, assignment of keys to data and meta-data should be consistent
21004+ * across whole file system. Algorithm that calculates a key for a given piece
21005+ * of data or meta-data is referred to as "key assignment".
21006+ *
21007+ * Key assignment is too expensive to be implemented as a plugin (that is,
21008+ * with an ability to support different key assignment schemas in the same
21009+ * compiled kernel image). As a compromise, all key-assignment functions and
21010+ * data-structures are collected in this single file, so that modifications to
21011+ * key assignment algorithm can be localized. Additional changes may be
21012+ * required in key.[ch].
21013+ *
21014+ * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
21015+ * may guess, there is "Plan B" too.
21016+ *
21017+ */
21018+
21019+/*
21020+ * Additional complication with key assignment implementation is a requirement
21021+ * to support different key length.
21022+ */
21023+
21024+/*
21025+ * KEY ASSIGNMENT: PLAN A, LONG KEYS.
21026+ *
21027+ * DIRECTORY ITEMS
21028+ *
21029+ * | 60 | 4 | 7 |1| 56 | 64 | 64 |
21030+ * +--------------+---+---+-+-------------+------------------+-----------------+
21031+ * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
21032+ * +--------------+---+---+-+-------------+------------------+-----------------+
21033+ * | | | | |
21034+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21035+ *
21036+ * dirid objectid of directory this item is for
21037+ *
21038+ * F fibration, see fs/reiser4/plugin/fibration.[ch]
21039+ *
21040+ * H 1 if last 8 bytes of the key contain hash,
21041+ * 0 if last 8 bytes of the key contain prefix-3
21042+ *
21043+ * prefix-1 first 7 characters of file name.
21044+ * Padded by zeroes if name is not long enough.
21045+ *
21046+ * prefix-2 next 8 characters of the file name.
21047+ *
21048+ * prefix-3 next 8 characters of the file name.
21049+ *
21050+ * hash hash of the rest of file name (i.e., portion of file
21051+ * name not included into prefix-1 and prefix-2).
21052+ *
21053+ * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded
21054+ * in the key. Such file names are called "short". They are distinguished by H
21055+ * bit set to 0 in the key.
21056+ *
21057+ * Other file names are "long". For a long name, H bit is 1, and the first 15 (== 7
21058+ * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
21059+ * key. Last 8 bytes of the key are occupied by hash of the remaining
21060+ * characters of the name.
21061+ *
21062+ * This key assignment reaches following important goals:
21063+ *
21064+ * (1) directory entries are sorted in approximately lexicographical
21065+ * order.
21066+ *
21067+ * (2) collisions (when multiple directory items have the same key), while
21068+ * principally unavoidable in a tree with fixed length keys, are rare.
21069+ *
21070+ * STAT DATA
21071+ *
21072+ * | 60 | 4 | 64 | 4 | 60 | 64 |
21073+ * +--------------+---+-----------------+---+--------------+-----------------+
21074+ * | locality id | 1 | ordering | 0 | objectid | 0 |
21075+ * +--------------+---+-----------------+---+--------------+-----------------+
21076+ * | | | | |
21077+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21078+ *
21079+ * locality id object id of a directory where first name was created for
21080+ * the object
21081+ *
21082+ * ordering copy of second 8-byte portion of the key of directory
21083+ * entry for the first name of this object. Ordering has a form
21084+ * {
21085+ * fibration :7;
21086+ * h :1;
21087+ * prefix1 :56;
21088+ * }
21089+ * see description of key for directory entry above.
21090+ *
21091+ * objectid object id for this object
21092+ *
21093+ * This key assignment policy is designed to keep stat-data in the same order
21094+ * as corresponding directory items, thus speeding up readdir/stat types of
21095+ * workload.
21096+ *
21097+ * FILE BODY
21098+ *
21099+ * | 60 | 4 | 64 | 4 | 60 | 64 |
21100+ * +--------------+---+-----------------+---+--------------+-----------------+
21101+ * | locality id | 4 | ordering | 0 | objectid | offset |
21102+ * +--------------+---+-----------------+---+--------------+-----------------+
21103+ * | | | | |
21104+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21105+ *
21106+ * locality id object id of a directory where first name was created for
21107+ * the object
21108+ *
21109+ * ordering the same as in the key of stat-data for this object
21110+ *
21111+ * objectid object id for this object
21112+ *
21113+ * offset logical offset from the beginning of this file.
21114+ * Measured in bytes.
21115+ *
21116+ *
21117+ * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21118+ *
21119+ * DIRECTORY ITEMS
21120+ *
21121+ * | 60 | 4 | 7 |1| 56 | 64 |
21122+ * +--------------+---+---+-+-------------+-----------------+
21123+ * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
21124+ * +--------------+---+---+-+-------------+-----------------+
21125+ * | | | |
21126+ * | 8 bytes | 8 bytes | 8 bytes |
21127+ *
21128+ * dirid objectid of directory this item is for
21129+ *
21130+ * F fibration, see fs/reiser4/plugin/fibration.[ch]
21131+ *
21132+ * H 1 if last 8 bytes of the key contain hash,
21133+ * 0 if last 8 bytes of the key contain prefix-2
21134+ *
21135+ * prefix-1 first 7 characters of file name.
21136+ * Padded by zeroes if name is not long enough.
21137+ *
21138+ * prefix-2 next 8 characters of the file name.
21139+ *
21140+ * hash hash of the rest of file name (i.e., portion of file
21141+ * name not included into prefix-1).
21142+ *
21143+ * File names shorter than 15 (== 7 + 8) characters are completely encoded in
21144+ * the key. Such file names are called "short". They are distinguished by H
21145+ * bit set to 0 in the key.
21146+ *
21147+ * Other file names are "long". For a long name, H bit is 1, and the first 7
21148+ * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
21149+ * key are occupied by hash of the remaining characters of the name.
21150+ *
21151+ * STAT DATA
21152+ *
21153+ * | 60 | 4 | 4 | 60 | 64 |
21154+ * +--------------+---+---+--------------+-----------------+
21155+ * | locality id | 1 | 0 | objectid | 0 |
21156+ * +--------------+---+---+--------------+-----------------+
21157+ * | | | |
21158+ * | 8 bytes | 8 bytes | 8 bytes |
21159+ *
21160+ * locality id object id of a directory where first name was created for
21161+ * the object
21162+ *
21163+ * objectid object id for this object
21164+ *
21165+ * FILE BODY
21166+ *
21167+ * | 60 | 4 | 4 | 60 | 64 |
21168+ * +--------------+---+---+--------------+-----------------+
21169+ * | locality id | 4 | 0 | objectid | offset |
21170+ * +--------------+---+---+--------------+-----------------+
21171+ * | | | |
21172+ * | 8 bytes | 8 bytes | 8 bytes |
21173+ *
21174+ * locality id object id of a directory where first name was created for
21175+ * the object
21176+ *
21177+ * objectid object id for this object
21178+ *
21179+ * offset logical offset from the beginning of this file.
21180+ * Measured in bytes.
21181+ *
21182+ *
21183+ */
21184+
21185+#include "debug.h"
21186+#include "key.h"
21187+#include "kassign.h"
21188+#include "vfs_ops.h"
21189+#include "inode.h"
21190+#include "super.h"
21191+#include "dscale.h"
21192+
21193+#include <linux/types.h> /* for __u?? */
21194+#include <linux/fs.h> /* for struct super_block, etc */
21195+
21196+/* bitmask for H bit (see comment at the beginning of this file) */
21197+static const __u64 longname_mark = 0x0100000000000000ull;
21198+/* bitmask for F and H portions of the key. */
21199+static const __u64 fibration_mask = 0xff00000000000000ull;
21200+
21201+/* return true if name is not completely encoded in @key */
21202+int is_longname_key(const reiser4_key * key)
21203+{
21204+ __u64 highpart;
21205+
21206+ assert("nikita-2863", key != NULL);
21207+ if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21208+ reiser4_print_key("oops", key);
21209+ assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21210+
21211+ if (REISER4_LARGE_KEY)
21212+ highpart = get_key_ordering(key);
21213+ else
21214+ highpart = get_key_objectid(key);
21215+
21216+ return (highpart & longname_mark) ? 1 : 0;
21217+}
21218+
21219+/* return true if @name is too long to be completely encoded in the key */
21220+int is_longname(const char *name UNUSED_ARG, int len)
21221+{
21222+ if (REISER4_LARGE_KEY)
21223+ return len > 23;
21224+ else
21225+ return len > 15;
21226+}
21227+
21228+/* encode ascii string into __u64.
21229+
21230+ Put characters of @name into result (@str) one after another starting
21231+ from @start_idx-th highest (arithmetically) byte. This produces
21232+ endian-safe encoding. memcpy(2) will not do.
21233+
21234+*/
21235+static __u64 pack_string(const char *name /* string to encode */ ,
21236+ int start_idx /* highest byte in result from
21237+ * which to start encoding */ )
21238+{
21239+ unsigned i;
21240+ __u64 str;
21241+
21242+ str = 0;
21243+ for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21244+ str <<= 8;
21245+ str |= (unsigned char)name[i];
21246+ }
21247+ str <<= (sizeof str - i - start_idx) << 3;
21248+ return str;
21249+}
21250+
21251+/* opposite to pack_string(). Takes value produced by pack_string(), restores
21252+ * string encoded in it and stores result in @buf */
21253+char * reiser4_unpack_string(__u64 value, char *buf)
21254+{
21255+ do {
21256+ *buf = value >> (64 - 8);
21257+ if (*buf)
21258+ ++buf;
21259+ value <<= 8;
21260+ } while (value != 0);
21261+ *buf = 0;
21262+ return buf;
21263+}
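
A worked example of the helper above, restated in user space so the values
can be checked directly (same logic as pack_string(), illustration only):

#include <assert.h>
#include <stdint.h>

static uint64_t pack(const char *name, int start_idx)
{
	unsigned i;
	uint64_t str = 0;

	for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
		str <<= 8;
		str |= (unsigned char)name[i];
	}
	str <<= (sizeof str - i - start_idx) << 3;
	return str;
}

int main(void)
{
	/* start_idx == 1 keeps the highest byte free for the F/H bits */
	assert(pack("abc", 1) == 0x0061626300000000ull);
	/* start_idx == 0 uses all eight bytes */
	assert(pack("abcdefgh", 0) == 0x6162636465666768ull);
	return 0;
}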
21264+
21265+/* obtain name encoded in @key and store it in @buf */
21266+char *extract_name_from_key(const reiser4_key * key, char *buf)
21267+{
21268+ char *c;
21269+
21270+ assert("nikita-2868", !is_longname_key(key));
21271+
21272+ c = buf;
21273+ if (REISER4_LARGE_KEY) {
21274+ c = reiser4_unpack_string(get_key_ordering(key) &
21275+ ~fibration_mask, c);
21276+ c = reiser4_unpack_string(get_key_fulloid(key), c);
21277+ } else
21278+ c = reiser4_unpack_string(get_key_fulloid(key) &
21279+ ~fibration_mask, c);
21280+ reiser4_unpack_string(get_key_offset(key), c);
21281+ return buf;
21282+}
21283+
21284+/**
21285+ * complete_entry_key - calculate entry key by name
21286+ * @dir: directory where entry is (or will be) in
21287+ * @name: name to calculate key of
21288+ * @len: length of name
21289+ * @result: place to store result in
21290+ *
21291+ * Sets fields of entry key @result which depend on file name.
21292+ * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
21293+ * objectid and offset. Otherwise, objectid and offset are set.
21294+ */
21295+void complete_entry_key(const struct inode *dir, const char *name,
21296+ int len, reiser4_key *result)
21297+{
21298+#if REISER4_LARGE_KEY
21299+ __u64 ordering;
21300+ __u64 objectid;
21301+ __u64 offset;
21302+
21303+ assert("nikita-1139", dir != NULL);
21304+ assert("nikita-1142", result != NULL);
21305+ assert("nikita-2867", strlen(name) == len);
21306+
21307+ /*
21308+ * key allocation algorithm for directory entries in case of large
21309+ * keys:
21310+ *
21311+ * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
21312+ * characters into ordering field of key, next 8 characters (if any)
21313+ * into objectid field of key and next 8 ones (if any) into offset
21314+ * field of key.
21315+ *
21316+ * If file name is longer than 23 characters, put first 7 characters
21317+ * into key's ordering, next 8 to objectid and hash of remaining
21318+ * characters into offset field.
21319+ *
21320+ * To distinguish the above cases, the latter sets the unused high bit in
21321+ * the ordering field.
21322+ */
21323+
21324+ /* [0-6] characters to ordering */
21325+ ordering = pack_string(name, 1);
21326+ if (len > 7) {
21327+ /* [7-14] characters to objectid */
21328+ objectid = pack_string(name + 7, 0);
21329+ if (len > 15) {
21330+ if (len <= 23) {
21331+ /* [15-23] characters to offset */
21332+ offset = pack_string(name + 15, 0);
21333+ } else {
21334+ /* note in a key the fact that offset contains hash. */
21335+ ordering |= longname_mark;
21336+
21337+ /* offset is the hash of the file name's tail. */
21338+ offset = inode_hash_plugin(dir)->hash(name + 15,
21339+ len - 15);
21340+ }
21341+ } else {
21342+ offset = 0ull;
21343+ }
21344+ } else {
21345+ objectid = 0ull;
21346+ offset = 0ull;
21347+ }
21348+
21349+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21350+ ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21351+
21352+ set_key_ordering(result, ordering);
21353+ set_key_fulloid(result, objectid);
21354+ set_key_offset(result, offset);
21355+ return;
21356+
21357+#else
21358+ __u64 objectid;
21359+ __u64 offset;
21360+
21361+ assert("nikita-1139", dir != NULL);
21362+ assert("nikita-1142", result != NULL);
21363+ assert("nikita-2867", strlen(name) == len);
21364+
21365+ /*
21366+ * key allocation algorithm for directory entries in case of not large
21367+ * keys:
21368+ *
21369+ * If name is not longer than 7 + 8 = 15 characters, put first 7
21370+ * characters into objectid field of key, next 8 characters (if any)
21371+ * into offset field of key
21372+ *
21373+ * If file name is longer than 15 characters, put first 7 characters
21374+ * into key's objectid, and hash of remaining characters into offset
21375+ * field.
21376+ *
21377+ * To distinguish the above cases, the latter sets the unused high bit in
21378+ * the objectid field.
21379+ */
21380+
21381+ /* [0-6] characters to objectid */
21382+ objectid = pack_string(name, 1);
21383+ if (len > 7) {
21384+ if (len <= 15) {
21385+ /* [7-14] characters to offset */
21386+ offset = pack_string(name + 7, 0);
21387+ } else {
21388+ /* note in a key the fact that offset contains hash. */
21389+ objectid |= longname_mark;
21390+
21391+ /* offset is the hash of the file name. */
21392+ offset = inode_hash_plugin(dir)->hash(name + 7,
21393+ len - 7);
21394+ }
21395+ } else
21396+ offset = 0ull;
21397+
21398+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21399+ objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21400+
21401+ set_key_fulloid(result, objectid);
21402+ set_key_offset(result, offset);
21403+ return;
21404+#endif /* ! REISER4_LARGE_KEY */
21405+}
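
As a concrete example under REISER4_LARGE_KEY: for the 15-character name
"performance.txt", len > 7 but not > 15, so ordering receives "perform"
(packed with start_idx 1, leaving the top byte free for the F and H bits),
objectid receives "ance.txt", offset is 0 and H remains clear. For a
30-character name the same two prefixes are packed, longname_mark sets H,
and offset holds inode_hash_plugin(dir)->hash(name + 15, 15).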
21406+
21407+/* true, if @key is the key of "." */
21408+int is_dot_key(const reiser4_key * key /* key to check */ )
21409+{
21410+ assert("nikita-1717", key != NULL);
21411+ assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21412+ return
21413+ (get_key_ordering(key) == 0ull) &&
21414+ (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
21415+}
21416+
21417+/* build key for stat-data.
21418+
21419+ return key of stat-data of this object. This should become an sd plugin
21420+ method in the future. For now, let it be here.
21421+
21422+*/
21423+reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
21424+ reiser4_key * result /* resulting key of @target
21425+ stat-data */ )
21426+{
21427+ assert("nikita-261", result != NULL);
21428+
21429+ reiser4_key_init(result);
21430+ set_key_locality(result, reiser4_inode_data(target)->locality_id);
21431+ set_key_ordering(result, get_inode_ordering(target));
21432+ set_key_objectid(result, get_inode_oid(target));
21433+ set_key_type(result, KEY_SD_MINOR);
21434+ set_key_offset(result, (__u64) 0);
21435+ return result;
21436+}
21437+
21438+/* encode part of key into &obj_key_id
21439+
21440+ This encodes into @id part of @key sufficient to restore @key later,
21441+ given that latter is key of object (key of stat-data).
21442+
21443+ See &obj_key_id
21444+*/
21445+int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21446+ obj_key_id * id /* id where key is encoded in */ )
21447+{
21448+ assert("nikita-1151", key != NULL);
21449+ assert("nikita-1152", id != NULL);
21450+
21451+ memcpy(id, key, sizeof *id);
21452+ return 0;
21453+}
21454+
21455+/* encode reference to @obj in @id.
21456+
21457+ This is like build_obj_key_id() above, but takes inode as parameter. */
21458+int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21459+ obj_key_id * id /* result */ )
21460+{
21461+ reiser4_key sdkey;
21462+
21463+ assert("nikita-1166", obj != NULL);
21464+ assert("nikita-1167", id != NULL);
21465+
21466+ build_sd_key(obj, &sdkey);
21467+ build_obj_key_id(&sdkey, id);
21468+ return 0;
21469+}
21470+
21471+/* decode @id back into @key
21472+
21473+ Restore key of object stat-data from @id. This is dual to
21474+ build_obj_key_id() above.
21475+*/
21476+int extract_key_from_id(const obj_key_id * id /* object key id to extract key
21477+ * from */ ,
21478+ reiser4_key * key /* result */ )
21479+{
21480+ assert("nikita-1153", id != NULL);
21481+ assert("nikita-1154", key != NULL);
21482+
21483+ reiser4_key_init(key);
21484+ memcpy(key, id, sizeof *id);
21485+ return 0;
21486+}
21487+
21488+/* extract objectid of directory from key of directory entry within said
21489+ directory.
21490+ */
21491+oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
21492+ * directory
21493+ * entry */ )
21494+{
21495+ assert("nikita-1314", de_key != NULL);
21496+ return get_key_locality(de_key);
21497+}
21498+
21499+/* encode into @id key of directory entry.
21500+
21501+ Encode into @id information sufficient to later distinguish directory
21502+   entries within the same directory. This is not the whole key, because all
21503+ directory entries within directory item share locality which is equal
21504+ to objectid of their directory.
21505+
21506+*/
21507+int build_de_id(const struct inode *dir /* inode of directory */ ,
21508+ const struct qstr *name /* name to be given to @obj by
21509+ * directory entry being
21510+ * constructed */ ,
21511+ de_id * id /* short key of directory entry */ )
21512+{
21513+ reiser4_key key;
21514+
21515+ assert("nikita-1290", dir != NULL);
21516+ assert("nikita-1292", id != NULL);
21517+
21518+ /* NOTE-NIKITA this is suboptimal. */
21519+ inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
21520+ return build_de_id_by_key(&key, id);
21521+}
21522+
21523+/* encode into @id key of directory entry.
21524+
21525+ Encode into @id information sufficient to later distinguish directory
21526+   entries within the same directory. This is not the whole key, because all
21527+ directory entries within directory item share locality which is equal
21528+ to objectid of their directory.
21529+
21530+*/
21531+int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
21532+ * entry */ ,
21533+ de_id * id /* short key of directory entry */ )
21534+{
21535+ memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
21536+ return 0;
21537+}
21538+
21539+/* restore from @id key of directory entry.
21540+
21541+ Function dual to build_de_id(): given @id and locality, build full
21542+ key of directory entry within directory item.
21543+
21544+*/
21545+int extract_key_from_de_id(const oid_t locality /* locality of directory
21546+ * entry */ ,
21547+ const de_id * id /* directory entry id */ ,
21548+ reiser4_key * key /* result */ )
21549+{
21550+ /* no need to initialise key here: all fields are overwritten */
21551+ memcpy(((__u64 *) key) + 1, id, sizeof *id);
21552+ set_key_locality(key, locality);
21553+ set_key_type(key, KEY_FILE_NAME_MINOR);
21554+ return 0;
21555+}
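+
+/* Sketch, not part of the original code: build_de_id() and
+   extract_key_from_de_id() are inverses up to the locality, which the
+   caller must supply; for a directory entry it is the objectid of the
+   directory. @dir and @name are hypothetical valid arguments. */
+#if 0
+static void de_id_roundtrip_sketch(struct inode *dir, const struct qstr *name)
+{
+	de_id id;
+	reiser4_key key;
+
+	build_de_id(dir, name, &id);
+	extract_key_from_de_id(get_inode_oid(dir), &id, &key);
+	/* @key is now the full key of the directory entry for @name */
+}
+#endif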
21556+
21557+/* compare two &de_id's */
21558+cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
21559+ const de_id * id2 /* second &de_id to compare */ )
21560+{
21561+ /* NOTE-NIKITA ugly implementation */
21562+ reiser4_key k1;
21563+ reiser4_key k2;
21564+
21565+ extract_key_from_de_id((oid_t) 0, id1, &k1);
21566+ extract_key_from_de_id((oid_t) 0, id2, &k2);
21567+ return keycmp(&k1, &k2);
21568+}
21569+
21570+/* compare &de_id with key */
21571+cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
21572+ const reiser4_key * key /* key to compare */ )
21573+{
21574+ cmp_t result;
21575+ reiser4_key *k1;
21576+
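+	/* a de_id is a reiser4_key with the leading locality element
+	 * stripped: stepping one element back lets KEY_DIFF_EL() index
+	 * the de_id's fields as if they were the key's trailing elements */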
21577+ k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
21578+ result = KEY_DIFF_EL(k1, key, 1);
21579+ if (result == EQUAL_TO) {
21580+ result = KEY_DIFF_EL(k1, key, 2);
21581+ if (REISER4_LARGE_KEY && result == EQUAL_TO) {
21582+ result = KEY_DIFF_EL(k1, key, 3);
21583+ }
21584+ }
21585+ return result;
21586+}
21587+
21588+/*
21589+ * return number of bytes necessary to encode @inode identity.
21590+ */
21591+int inode_onwire_size(const struct inode *inode)
21592+{
21593+ int result;
21594+
21595+ result = dscale_bytes(get_inode_oid(inode));
21596+ result += dscale_bytes(get_inode_locality(inode));
21597+
21598+ /*
21599+ * ordering is large (it usually has highest bits set), so it makes
21600+ * little sense to dscale it.
21601+ */
21602+ if (REISER4_LARGE_KEY)
21603+ result += sizeof(get_inode_ordering(inode));
21604+ return result;
21605+}
21606+
21607+/*
21608+ * encode @inode identity at @start
21609+ */
21610+char *build_inode_onwire(const struct inode *inode, char *start)
21611+{
21612+ start += dscale_write(start, get_inode_locality(inode));
21613+ start += dscale_write(start, get_inode_oid(inode));
21614+
21615+ if (REISER4_LARGE_KEY) {
21616+ put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
21617+ start += sizeof(get_inode_ordering(inode));
21618+ }
21619+ return start;
21620+}
21621+
21622+/*
21623+ * extract key that was previously encoded by build_inode_onwire() at @addr
21624+ */
21625+char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
21626+{
21627+ __u64 val;
21628+
21629+ addr += dscale_read(addr, &val);
21630+ val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
21631+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
21632+ addr += dscale_read(addr, &val);
21633+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
21634+#if REISER4_LARGE_KEY
21635+ memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
21636+ addr += sizeof key_id->ordering;
21637+#endif
21638+ return addr;
21639+}
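+
+/* Sketch, not part of the original code: the wire codec above is used by
+   pairing build_inode_onwire() with extract_obj_key_id_from_onwire();
+   the buffer must hold at least inode_onwire_size() bytes. @inode is a
+   hypothetical valid reiser4 inode and the buffer size here is an
+   assumption for illustration only. */
+#if 0
+static void onwire_codec_sketch(const struct inode *inode)
+{
+	char buf[64];	/* assumed >= inode_onwire_size(inode) */
+	obj_key_id id;
+
+	assert("sketch-2", inode_onwire_size(inode) <= (int)sizeof buf);
+	build_inode_onwire(inode, buf);
+	extract_obj_key_id_from_onwire(buf, &id);
+	/* @id now identifies the same stat-data as build_inode_key_id() */
+}
+#endif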
21640+
21641+/* Make Linus happy.
21642+ Local variables:
21643+ c-indentation-style: "K&R"
21644+ mode-name: "LC"
21645+ c-basic-offset: 8
21646+ tab-width: 8
21647+ fill-column: 120
21648+ End:
21649+*/
21650diff -urN linux-2.6.23.orig/fs/reiser4/kassign.h linux-2.6.23/fs/reiser4/kassign.h
21651--- linux-2.6.23.orig/fs/reiser4/kassign.h 1970-01-01 03:00:00.000000000 +0300
21652+++ linux-2.6.23/fs/reiser4/kassign.h 2007-12-04 22:59:05.794368893 +0300
21653@@ -0,0 +1,110 @@
21654+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21655+ * reiser4/README */
21656+
21657+/* Key assignment policy interface. See kassign.c for details. */
21658+
21659+#if !defined( __KASSIGN_H__ )
21660+#define __KASSIGN_H__
21661+
21662+#include "forward.h"
21663+#include "key.h"
21664+#include "dformat.h"
21665+
21666+#include <linux/types.h> /* for __u?? */
21667+#include <linux/fs.h> /* for struct super_block, etc */
21668+#include <linux/dcache.h> /* for struct qstr */
21669+
21670+/* key assignment functions */
21671+
21672+/* Information from which key of file stat-data can be uniquely
21673+ restored. This depends on key assignment policy for
21674+ stat-data. Currently it's enough to store object id and locality id
21675+   (60+60==120 bits), because the minor packing locality and offset of
21676+ stat-data key are always known constants: KEY_SD_MINOR and 0
21677+ respectively. For simplicity 4 bits are wasted in each id, and just
21678+ two 64 bit integers are stored.
21679+
21680+ This field has to be byte-aligned, because we don't want to waste
21681+   space in directory entries. There is another side to this coin, of
21682+   course: we waste CPU and bus bandwidth instead, by copying data back
21683+   and forth.
21684+
21685+ Next optimization: &obj_key_id is mainly used to address stat data from
21686+   directory entries. Under the assumption that the majority of files have
21687+   only one name (one hard link) from *the* parent directory, it seems
21688+   reasonable to store only the objectid of the stat data and take its
21689+   locality from the key of the directory item.
21690+
21691+   This requires some flag to be added to the &obj_key_id to distinguish
21692+   between these two cases. The remaining bits in the flag byte could then
21693+   be used to store the file type.
21694+
21695+ This optimization requires changes in directory item handling code.
21696+
21697+*/
21698+typedef struct obj_key_id {
21699+ d8 locality[sizeof(__u64)];
21700+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
21701+ )
21702+ d8 objectid[sizeof(__u64)];
21703+}
21704+obj_key_id;
21705+
21706+/* Information sufficient to uniquely identify directory entry within
21707+ compressed directory item.
21708+
21709+ For alignment issues see &obj_key_id above.
21710+*/
21711+typedef struct de_id {
21712+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
21713+ d8 objectid[sizeof(__u64)];
21714+ d8 offset[sizeof(__u64)];
21715+}
21716+de_id;
21717+
21718+extern int inode_onwire_size(const struct inode *obj);
21719+extern char *build_inode_onwire(const struct inode *obj, char *area);
21720+extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
21721+
21722+extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
21723+extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
21724+extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
21725+extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
21726+extern int build_de_id(const struct inode *dir, const struct qstr *name,
21727+ de_id * id);
21728+extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
21729+extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
21730+ reiser4_key * key);
21731+extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
21732+extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
21733+
21734+extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
21735+extern void build_entry_key_common(const struct inode *dir,
21736+ const struct qstr *name,
21737+ reiser4_key * result);
21738+extern void build_entry_key_stable_entry(const struct inode *dir,
21739+ const struct qstr *name,
21740+ reiser4_key * result);
21741+extern int is_dot_key(const reiser4_key * key);
21742+extern reiser4_key *build_sd_key(const struct inode *target,
21743+ reiser4_key * result);
21744+
21745+extern int is_longname_key(const reiser4_key * key);
21746+extern int is_longname(const char *name, int len);
21747+extern char *extract_name_from_key(const reiser4_key * key, char *buf);
21748+extern char *reiser4_unpack_string(__u64 value, char *buf);
21749+extern void complete_entry_key(const struct inode *dir, const char *name,
21750+ int len, reiser4_key *result);
21751+
21752+/* __KASSIGN_H__ */
21753+#endif
21754+
21755+/* Make Linus happy.
21756+ Local variables:
21757+ c-indentation-style: "K&R"
21758+ mode-name: "LC"
21759+ c-basic-offset: 8
21760+ tab-width: 8
21761+ fill-column: 120
21762+ End:
21763+*/
21764diff -urN linux-2.6.23.orig/fs/reiser4/Kconfig linux-2.6.23/fs/reiser4/Kconfig
21765--- linux-2.6.23.orig/fs/reiser4/Kconfig 1970-01-01 03:00:00.000000000 +0300
21766+++ linux-2.6.23/fs/reiser4/Kconfig 2007-12-04 16:49:30.000000000 +0300
21767@@ -0,0 +1,34 @@
21768+config REISER4_FS
21769+ tristate "Reiser4 (EXPERIMENTAL)"
21770+ depends on EXPERIMENTAL
21771+ select ZLIB_INFLATE
21772+ select ZLIB_DEFLATE
21773+ select LZO_COMPRESS
21774+ select LZO_DECOMPRESS
21775+ select CRYPTO
21776+ help
21777+ Reiser4 is a filesystem that performs all filesystem operations
21778+	  as atomic transactions: either a write is performed in its
21779+	  entirety or not at all, and a crash cannot leave it partially
21780+	  performed or corrupt the file system.
21781+
21782+ It stores files in dancing trees, which are like balanced trees but
21783+ faster. It packs small files together so that they share blocks
21784+ without wasting space. This means you can use it to store really
21785+ small files. It also means that it saves you disk space. It avoids
21786+ hassling you with anachronisms like having a maximum number of
21787+ inodes, and wasting space if you use less than that number.
21788+
21789+ Reiser4 is a distinct filesystem type from reiserfs (V3).
21790+	  It is therefore not possible to mount reiserfs (V3) file
21791+	  systems as reiser4.
21792+
21793+ To learn more about reiser4, go to http://www.namesys.com
21794+
21795+config REISER4_DEBUG
21796+ bool "Enable reiser4 debug mode"
21797+ depends on REISER4_FS
21798+ help
21799+ Don't use this unless you are debugging reiser4.
21800+
21801+ If unsure, say N.
21802diff -urN linux-2.6.23.orig/fs/reiser4/key.c linux-2.6.23/fs/reiser4/key.c
21803--- linux-2.6.23.orig/fs/reiser4/key.c 1970-01-01 03:00:00.000000000 +0300
21804+++ linux-2.6.23/fs/reiser4/key.c 2007-12-04 16:49:30.000000000 +0300
21805@@ -0,0 +1,137 @@
21806+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21807+
21808+/* Key manipulations. */
21809+
21810+#include "debug.h"
21811+#include "key.h"
21812+#include "super.h"
21813+#include "reiser4.h"
21814+
21815+#include <linux/types.h> /* for __u?? */
21816+
21817+/* Minimal possible key: all components are zero. It is presumed that this is
21818+ independent of key scheme. */
21819+static const reiser4_key MINIMAL_KEY = {
21820+ .el = {
21821+ 0ull,
21822+ ON_LARGE_KEY(0ull,)
21823+ 0ull,
21824+ 0ull
21825+ }
21826+};
21827+
21828+/* Maximal possible key: all components are ~0. It is presumed that this is
21829+ independent of key scheme. */
21830+static const reiser4_key MAXIMAL_KEY = {
21831+ .el = {
21832+ __constant_cpu_to_le64(~0ull),
21833+ ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
21834+ __constant_cpu_to_le64(~0ull),
21835+ __constant_cpu_to_le64(~0ull)
21836+ }
21837+};
21838+
21839+/* Initialize key. */
21840+void reiser4_key_init(reiser4_key * key /* key to init */ )
21841+{
21842+ assert("nikita-1169", key != NULL);
21843+ memset(key, 0, sizeof *key);
21844+}
21845+
21846+/* minimal possible key in the tree. Return pointer to the static storage. */
21847+const reiser4_key *reiser4_min_key(void)
21848+{
21849+ return &MINIMAL_KEY;
21850+}
21851+
21852+/* maximum possible key in the tree. Return pointer to the static storage. */
21853+const reiser4_key *reiser4_max_key(void)
21854+{
21855+ return &MAXIMAL_KEY;
21856+}
21857+
21858+#if REISER4_DEBUG
21859+/* debugging aid: print symbolic name of key type */
21860+static const char *type_name(unsigned int key_type /* key type */ )
21861+{
21862+ switch (key_type) {
21863+ case KEY_FILE_NAME_MINOR:
21864+ return "file name";
21865+ case KEY_SD_MINOR:
21866+ return "stat data";
21867+ case KEY_ATTR_NAME_MINOR:
21868+ return "attr name";
21869+ case KEY_ATTR_BODY_MINOR:
21870+ return "attr body";
21871+ case KEY_BODY_MINOR:
21872+ return "file body";
21873+ default:
21874+ return "unknown";
21875+ }
21876+}
21877+
21878+/* debugging aid: print human readable information about key */
21879+void reiser4_print_key(const char *prefix /* prefix to print */ ,
21880+ const reiser4_key * key /* key to print */ )
21881+{
21882+ /* turn bold on */
21883+ /* printf ("\033[1m"); */
21884+ if (key == NULL)
21885+ printk("%s: null key\n", prefix);
21886+ else {
21887+ if (REISER4_LARGE_KEY)
21888+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
21889+ get_key_locality(key),
21890+ get_key_type(key),
21891+ get_key_ordering(key),
21892+ get_key_band(key),
21893+ get_key_objectid(key), get_key_offset(key));
21894+ else
21895+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
21896+ get_key_locality(key),
21897+ get_key_type(key),
21898+ get_key_band(key),
21899+ get_key_objectid(key), get_key_offset(key));
21900+ /*
21901+ * if this is a key of directory entry, try to decode part of
21902+ * a name stored in the key, and output it.
21903+ */
21904+ if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
21905+ char buf[DE_NAME_BUF_LEN];
21906+ char *c;
21907+
21908+ c = buf;
21909+ c = reiser4_unpack_string(get_key_ordering(key), c);
21910+ reiser4_unpack_string(get_key_fulloid(key), c);
21911+ printk("[%s", buf);
21912+ if (is_longname_key(key))
21913+ /*
21914+ * only part of the name is stored in the key.
21915+ */
21916+ printk("...]\n");
21917+ else {
21918+ /*
21919+ * whole name is stored in the key.
21920+ */
21921+ reiser4_unpack_string(get_key_offset(key), buf);
21922+ printk("%s]\n", buf);
21923+ }
21924+ } else {
21925+ printk("[%s]\n", type_name(get_key_type(key)));
21926+ }
21927+ }
21928+ /* turn bold off */
21929+ /* printf ("\033[m\017"); */
21930+}
21931+
21932+#endif
21933+
21934+/* Make Linus happy.
21935+ Local variables:
21936+ c-indentation-style: "K&R"
21937+ mode-name: "LC"
21938+ c-basic-offset: 8
21939+ tab-width: 8
21940+ fill-column: 120
21941+ End:
21942+*/
21943diff -urN linux-2.6.23.orig/fs/reiser4/key.h linux-2.6.23/fs/reiser4/key.h
21944--- linux-2.6.23.orig/fs/reiser4/key.h 1970-01-01 03:00:00.000000000 +0300
21945+++ linux-2.6.23/fs/reiser4/key.h 2007-12-04 16:49:30.000000000 +0300
21946@@ -0,0 +1,384 @@
21947+/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21948+
21949+/* Declarations of key-related data-structures and operations on keys. */
21950+
21951+#if !defined( __REISER4_KEY_H__ )
21952+#define __REISER4_KEY_H__
21953+
21954+#include "dformat.h"
21955+#include "forward.h"
21956+#include "debug.h"
21957+
21958+#include <linux/types.h> /* for __u?? */
21959+
21960+/* Operations on keys in reiser4 tree */
21961+
21962+/* No access to any of these fields shall be done except via a
21963+ wrapping macro/function, and that wrapping macro/function shall
21964+   convert to little endian order. Key comparison is done in cpu byte order. */
21965+
21966+/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below
21967+ which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files
21968+ within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong
21969+ approach, and whether there should be no difference at all. For current usage patterns this choice is probably the
21970+ right one. */
21971+
21972+/* possible values for minor packing locality (4 bits required) */
21973+typedef enum {
21974+ /* file name */
21975+ KEY_FILE_NAME_MINOR = 0,
21976+ /* stat-data */
21977+ KEY_SD_MINOR = 1,
21978+ /* file attribute name */
21979+ KEY_ATTR_NAME_MINOR = 2,
21980+ /* file attribute value */
21981+ KEY_ATTR_BODY_MINOR = 3,
21982+ /* file body (tail or extent) */
21983+ KEY_BODY_MINOR = 4,
21984+} key_minor_locality;
21985+
21986+/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key.
21987+ Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space,
21988+ and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to
21989+ segregate extents from tails, it is better to give them distinct minor packing localities rather than changing
21990+ block_alloc.c to check the node type when deciding where to allocate the node.
21991+
21992+ The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it
21993+ should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our
21994+ current implementation tails have a different minor packing locality from extents, and no files have both extents and
21995+ tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now....
21996+*/
21997+
21998+/* Arbitrary major packing localities can be assigned to objects using
21999+ the reiser4(filenameA/..packing<=some_number) system call.
22000+
22001+ In reiser4, the creat() syscall creates a directory
22002+
22003+ whose default flow (that which is referred to if the directory is
22004+ read as a file) is the traditional unix file body.
22005+
22006+ whose directory plugin is the 'filedir'
22007+
22008+ whose major packing locality is that of the parent of the object created.
22009+
22010+ The static_stat item is a particular commonly used directory
22011+ compression (the one for normal unix files).
22012+
22013+ The filedir plugin checks to see if the static_stat item exists.
22014+ There is a unique key for static_stat. If yes, then it uses the
22015+ static_stat item for all of the values that it contains. The
22016+ static_stat item contains a flag for each stat it contains which
22017+ indicates whether one should look outside the static_stat item for its
22018+ contents.
22019+*/
22020+
22021+/* offset of fields in reiser4_key. Value of each element of this enum
22022+ is index within key (thought as array of __u64's) where this field
22023+ is. */
22024+typedef enum {
22025+ /* major "locale", aka dirid. Sits in 1st element */
22026+ KEY_LOCALITY_INDEX = 0,
22027+ /* minor "locale", aka item type. Sits in 1st element */
22028+ KEY_TYPE_INDEX = 0,
22029+ ON_LARGE_KEY(KEY_ORDERING_INDEX,)
22030+ /* "object band". Sits in 2nd element */
22031+ KEY_BAND_INDEX,
22032+ /* objectid. Sits in 2nd element */
22033+ KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
22034+ /* full objectid. Sits in 2nd element */
22035+ KEY_FULLOID_INDEX = KEY_BAND_INDEX,
22036+ /* Offset. Sits in 3rd element */
22037+ KEY_OFFSET_INDEX,
22038+ /* Name hash. Sits in 3rd element */
22039+ KEY_HASH_INDEX = KEY_OFFSET_INDEX,
22040+ KEY_CACHELINE_END = KEY_OFFSET_INDEX,
22041+ KEY_LAST_INDEX
22042+} reiser4_key_field_index;
22043+
22044+/* key in reiser4 internal "balanced" tree. It is just an array of three
22045+   (four, with REISER4_LARGE_KEY) 64bit integers in disk byte order
22046+   (little-endian by default). This array is actually indexed by
22047+   reiser4_key_field. Each __u64 within this array is called an "element".
22048+   Logical key components encoded within elements are called "fields".
22049+
22050+   We declare this as a union with a dummy second component to suppress
22051+ inconvenient array<->pointer casts implied in C. */
22052+union reiser4_key {
22053+ __le64 el[KEY_LAST_INDEX];
22054+ int pad;
22055+};
22056+
22057+/* bitmasks showing where within reiser4_key a particular field is stored. */
22058+/* major locality occupies higher 60 bits of the first element */
22059+#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
22060+
22061+/* minor locality occupies lower 4 bits of the first element */
22062+#define KEY_TYPE_MASK 0xfull
22063+
22064+/* controversial band occupies higher 4 bits of the 2nd element */
22065+#define KEY_BAND_MASK 0xf000000000000000ull
22066+
22067+/* objectid occupies lower 60 bits of the 2nd element */
22068+#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
22069+
22070+/* full 64bit objectid*/
22071+#define KEY_FULLOID_MASK 0xffffffffffffffffull
22072+
22073+/* offset occupies the whole 3rd element */
22074+#define KEY_OFFSET_MASK 0xffffffffffffffffull
22075+
22076+/* ordering is whole second element */
22077+#define KEY_ORDERING_MASK 0xffffffffffffffffull
22078+
22079+/* how many bits a key element has to be shifted to extract or store a particular field */
22080+typedef enum {
22081+ KEY_LOCALITY_SHIFT = 4,
22082+ KEY_TYPE_SHIFT = 0,
22083+ KEY_BAND_SHIFT = 60,
22084+ KEY_OBJECTID_SHIFT = 0,
22085+ KEY_FULLOID_SHIFT = 0,
22086+ KEY_OFFSET_SHIFT = 0,
22087+ KEY_ORDERING_SHIFT = 0,
22088+} reiser4_key_field_shift;
22089+
22090+static inline __u64
22091+get_key_el(const reiser4_key * key, reiser4_key_field_index off)
22092+{
22093+ assert("nikita-753", key != NULL);
22094+ assert("nikita-754", off < KEY_LAST_INDEX);
22095+ return le64_to_cpu(get_unaligned(&key->el[off]));
22096+}
22097+
22098+static inline void
22099+set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
22100+{
22101+ assert("nikita-755", key != NULL);
22102+ assert("nikita-756", off < KEY_LAST_INDEX);
22103+ put_unaligned(cpu_to_le64(value), &key->el[off]);
22104+}
22105+
22106+/* macro to define getter and setter functions for field F with type T */
22107+#define DEFINE_KEY_FIELD( L, U, T ) \
22108+static inline T get_key_ ## L ( const reiser4_key *key ) \
22109+{ \
22110+ assert( "nikita-750", key != NULL ); \
22111+ return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) & \
22112+ KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT; \
22113+} \
22114+ \
22115+static inline void set_key_ ## L ( reiser4_key *key, T loc ) \
22116+{ \
22117+ __u64 el; \
22118+ \
22119+ assert( "nikita-752", key != NULL ); \
22120+ \
22121+ el = get_key_el( key, KEY_ ## U ## _INDEX ); \
22122+ /* clear field bits in the key */ \
22123+ el &= ~KEY_ ## U ## _MASK; \
22124+ /* actually it should be \
22125+ \
22126+ el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \
22127+ \
22128+ but we trust user to never pass values that wouldn't fit \
22129+ into field. Clearing extra bits is one operation, but this \
22130+ function is time-critical. \
22131+ But check this in assertion. */ \
22132+ assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) & \
22133+ ~KEY_ ## U ## _MASK ) == 0 ); \
22134+ el |= ( loc << KEY_ ## U ## _SHIFT ); \
22135+ set_key_el( key, KEY_ ## U ## _INDEX, el ); \
22136+}
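+
+/* For example, DEFINE_KEY_FIELD(band, BAND, __u64) above expands into
+
+	static inline __u64 get_key_band(const reiser4_key *key);
+	static inline void set_key_band(reiser4_key *key, __u64 loc);
+
+   where the getter masks element KEY_BAND_INDEX with KEY_BAND_MASK and
+   shifts the result right by KEY_BAND_SHIFT, and the setter shifts left
+   and merges. */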
22137+
22138+typedef __u64 oid_t;
22139+
22140+/* define get_key_locality(), set_key_locality() */
22141+DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22142+/* define get_key_type(), set_key_type() */
22143+DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22144+/* define get_key_band(), set_key_band() */
22145+DEFINE_KEY_FIELD(band, BAND, __u64);
22146+/* define get_key_objectid(), set_key_objectid() */
22147+DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22148+/* define get_key_fulloid(), set_key_fulloid() */
22149+DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22150+/* define get_key_offset(), set_key_offset() */
22151+DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22152+#if (REISER4_LARGE_KEY)
22153+/* define get_key_ordering(), set_key_ordering() */
22154+DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22155+#else
22156+static inline __u64 get_key_ordering(const reiser4_key * key)
22157+{
22158+ return 0;
22159+}
22160+
22161+static inline void set_key_ordering(reiser4_key * key, __u64 val)
22162+{
22163+}
22164+#endif
22165+
22166+/* key comparison result */
22167+typedef enum { LESS_THAN = -1, /* if first key is less than second */
22168+ EQUAL_TO = 0, /* if keys are equal */
22169+ GREATER_THAN = +1 /* if first key is greater than second */
22170+} cmp_t;
22171+
22172+void reiser4_key_init(reiser4_key * key);
22173+
22174+/* minimal possible key in the tree. Return pointer to the static storage. */
22175+extern const reiser4_key *reiser4_min_key(void);
22176+extern const reiser4_key *reiser4_max_key(void);
22177+
22178+/* helper macro for keycmp() */
22179+#define KEY_DIFF(k1, k2, field) \
22180+({ \
22181+ typeof (get_key_ ## field (k1)) f1; \
22182+ typeof (get_key_ ## field (k2)) f2; \
22183+ \
22184+ f1 = get_key_ ## field (k1); \
22185+ f2 = get_key_ ## field (k2); \
22186+ \
22187+ (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
22188+})
22189+
22190+/* helper macro for keycmp() */
22191+#define KEY_DIFF_EL(k1, k2, off) \
22192+({ \
22193+ __u64 e1; \
22194+ __u64 e2; \
22195+ \
22196+ e1 = get_key_el(k1, off); \
22197+ e2 = get_key_el(k2, off); \
22198+ \
22199+ (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
22200+})
22201+
22202+/* compare `k1' and `k2'. This function is a heart of "key allocation
22203+ policy". All you need to implement new policy is to add yet another
22204+ clause here. */
22205+static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22206+ const reiser4_key * k2 /* second key to compare */ )
22207+{
22208+ cmp_t result;
22209+
22210+ /*
22211+ * This function is the heart of reiser4 tree-routines. Key comparison
22212+ * is among most heavily used operations in the file system.
22213+ */
22214+
22215+ assert("nikita-439", k1 != NULL);
22216+ assert("nikita-440", k2 != NULL);
22217+
22218+ /* there is no actual branch here: condition is compile time constant
22219+ * and constant folding and propagation ensures that only one branch
22220+ * is actually compiled in. */
22221+
22222+ if (REISER4_PLANA_KEY_ALLOCATION) {
22223+ /* if physical order of fields in a key is identical
22224+ with logical order, we can implement key comparison
22225+ as three 64bit comparisons. */
22226+ /* logical order of fields in plan-a:
22227+ locality->type->objectid->offset. */
22228+ /* compare locality and type at once */
22229+ result = KEY_DIFF_EL(k1, k2, 0);
22230+ if (result == EQUAL_TO) {
22231+ /* compare objectid (and band if it's there) */
22232+ result = KEY_DIFF_EL(k1, k2, 1);
22233+ /* compare offset */
22234+ if (result == EQUAL_TO) {
22235+ result = KEY_DIFF_EL(k1, k2, 2);
22236+ if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22237+ result = KEY_DIFF_EL(k1, k2, 3);
22238+ }
22239+ }
22240+ }
22241+ } else if (REISER4_3_5_KEY_ALLOCATION) {
22242+ result = KEY_DIFF(k1, k2, locality);
22243+ if (result == EQUAL_TO) {
22244+ result = KEY_DIFF(k1, k2, objectid);
22245+ if (result == EQUAL_TO) {
22246+ result = KEY_DIFF(k1, k2, type);
22247+ if (result == EQUAL_TO)
22248+ result = KEY_DIFF(k1, k2, offset);
22249+ }
22250+ }
22251+ } else
22252+ impossible("nikita-441", "Unknown key allocation scheme!");
22253+ return result;
22254+}
22255+
22256+/* true if @k1 equals @k2 */
22257+static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
22258+ const reiser4_key * k2 /* second key to compare */ )
22259+{
22260+ assert("nikita-1879", k1 != NULL);
22261+ assert("nikita-1880", k2 != NULL);
22262+ return !memcmp(k1, k2, sizeof *k1);
22263+}
22264+
22265+/* true if @k1 is less than @k2 */
22266+static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
22267+ const reiser4_key * k2 /* second key to compare */ )
22268+{
22269+ assert("nikita-1952", k1 != NULL);
22270+ assert("nikita-1953", k2 != NULL);
22271+ return keycmp(k1, k2) == LESS_THAN;
22272+}
22273+
22274+/* true if @k1 is less than or equal to @k2 */
22275+static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
22276+ const reiser4_key * k2 /* second key to compare */ )
22277+{
22278+ assert("nikita-1954", k1 != NULL);
22279+ assert("nikita-1955", k2 != NULL);
22280+ return keycmp(k1, k2) != GREATER_THAN;
22281+}
22282+
22283+/* true if @k1 is greater than @k2 */
22284+static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
22285+ const reiser4_key * k2 /* second key to compare */ )
22286+{
22287+ assert("nikita-1959", k1 != NULL);
22288+ assert("nikita-1960", k2 != NULL);
22289+ return keycmp(k1, k2) == GREATER_THAN;
22290+}
22291+
22292+/* true if @k1 is greater than or equal to @k2 */
22293+static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
22294+ const reiser4_key * k2 /* second key to compare */ )
22295+{
22296+ assert("nikita-1956", k1 != NULL);
22297+ assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched
22298+ * November 3: Laika */
22299+ return keycmp(k1, k2) != LESS_THAN;
22300+}
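+
+/* A small sanity sketch, not part of the original code: the comparison
+   helpers above are mutually consistent, and the static minimal/maximal
+   keys bound every valid key element-wise. */
+#if 0
+static void key_order_sketch(const reiser4_key * k1, const reiser4_key * k2)
+{
+	assert("sketch-3", keylt(k1, k2) == (keycmp(k1, k2) == LESS_THAN));
+	assert("sketch-4", keyge(k1, reiser4_min_key()));
+	assert("sketch-5", keyle(k1, reiser4_max_key()));
+}
+#endif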
22301+
22302+static inline void prefetchkey(reiser4_key * key)
22303+{
22304+ prefetch(key);
22305+ prefetch(&key->el[KEY_CACHELINE_END]);
22306+}
22307+
22308+/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22309+ 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22310+/* size of a buffer suitable to hold human readable key representation */
22311+#define KEY_BUF_LEN (80)
22312+
22313+#if REISER4_DEBUG
22314+extern void reiser4_print_key(const char *prefix, const reiser4_key * key);
22315+#else
22316+#define reiser4_print_key(p,k) noop
22317+#endif
22318+
22319+/* __REISER4_KEY_H__ */
22320+#endif
22321+
22322+/* Make Linus happy.
22323+ Local variables:
22324+ c-indentation-style: "K&R"
22325+ mode-name: "LC"
22326+ c-basic-offset: 8
22327+ tab-width: 8
22328+ fill-column: 120
22329+ End:
22330+*/
22331diff -urN linux-2.6.23.orig/fs/reiser4/ktxnmgrd.c linux-2.6.23/fs/reiser4/ktxnmgrd.c
22332--- linux-2.6.23.orig/fs/reiser4/ktxnmgrd.c 1970-01-01 03:00:00.000000000 +0300
22333+++ linux-2.6.23/fs/reiser4/ktxnmgrd.c 2007-12-04 16:49:30.000000000 +0300
22334@@ -0,0 +1,214 @@
22335+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22336+/* Transaction manager daemon. */
22337+
22338+/*
22339+ * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22340+ * needed/important for the following reasons:
22341+ *
22342+ * 1. in reiser4 atom is not committed immediately when last transaction
22343+ * 1. in reiser4 an atom is not committed immediately when the last transaction
22344+ * handle closes, unless the atom is either too old or too large (see
22345+ * atom_should_commit()). This is done to avoid committing too frequently;
22346+ * and because:
22347+ * 2. sometimes we don't want to commit atom when closing last transaction
22348+ * handle even if it is old and fat enough. For example, because we are at
22349+ * this point under directory semaphore, and committing would stall all
22350+ * accesses to this directory.
22351+ *
22352+ * ktxnmgrd bides its time sleeping on a condition variable. When it awakes,
22353+ * either due to a (tunable) timeout or because it was explicitly woken up by
22354+ * a call to ktxnmgrd_kick(), it scans the list of all atoms and commits the
22355+ * eligible ones.
22356+ *
22357+ */
22358+
22359+#include "debug.h"
22360+#include "txnmgr.h"
22361+#include "tree.h"
22362+#include "ktxnmgrd.h"
22363+#include "super.h"
22364+#include "reiser4.h"
22365+
22366+#include <linux/sched.h> /* for struct task_struct */
22367+#include <linux/wait.h>
22368+#include <linux/suspend.h>
22369+#include <linux/kernel.h>
22370+#include <linux/writeback.h>
22371+#include <linux/kthread.h>
22372+#include <linux/freezer.h>
22373+
22374+static int scan_mgr(struct super_block *);
22375+
22376+/*
22377+ * change current->comm so that ps, top, and friends will see changed
22378+ * state. This serves no useful purpose whatsoever, but also costs nothing. Maybe
22379+ * it will make a lonely system administrator feel less alone at 3 A.M.
22380+ */
22381+#define set_comm( state ) \
22382+ snprintf( current -> comm, sizeof( current -> comm ), \
22383+ "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
22384+
22385+/**
22386+ * ktxnmgrd - kernel txnmgr daemon
22387+ * @arg: pointer to super block
22388+ *
22389+ * The background transaction manager daemon, started as a kernel thread during
22390+ * reiser4 initialization.
22391+ */
22392+static int ktxnmgrd(void *arg)
22393+{
22394+ struct super_block *super;
22395+ ktxnmgrd_context *ctx;
22396+ txn_mgr *mgr;
22397+ int done = 0;
22398+
22399+ super = arg;
22400+ mgr = &get_super_private(super)->tmgr;
22401+
22402+ /*
22403+ * do_fork() just copies task_struct into the new thread. ->fs_context
22404+ * shouldn't be copied of course. This shouldn't be a problem for the
22405+ * rest of the code though.
22406+ */
22407+ current->journal_info = NULL;
22408+ ctx = mgr->daemon;
22409+ while (1) {
22410+ try_to_freeze();
22411+ set_comm("wait");
22412+ {
22413+ DEFINE_WAIT(__wait);
22414+
22415+ prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE);
22416+ if (kthread_should_stop()) {
22417+ done = 1;
22418+ } else
22419+ schedule_timeout(ctx->timeout);
22420+ finish_wait(&ctx->wait, &__wait);
22421+ }
22422+ if (done)
22423+ break;
22424+ set_comm("run");
22425+ spin_lock(&ctx->guard);
22426+ /*
22427+ * wait timed out or ktxnmgrd was woken up by explicit request
22428+ * to commit something. Scan list of atoms in txnmgr and look
22429+ * for too old atoms.
22430+ */
22431+ do {
22432+			ctx->rescan = 0;
+			/* the guard must not be held across scan_mgr():
+			 * committing atoms can sleep. It is re-taken below. */
+			spin_unlock(&ctx->guard);
22433+			scan_mgr(super);
22434+			spin_lock(&ctx->guard);
22435+ if (ctx->rescan) {
22436+ /*
22437+ * the list could be modified while ctx
22438+ * spinlock was released, we have to repeat
22439+ * scanning from the beginning
22440+ */
22441+ break;
22442+ }
22443+ } while (ctx->rescan);
22444+ spin_unlock(&ctx->guard);
22445+ }
22446+ return 0;
22447+}
22448+
22449+#undef set_comm
22450+
22451+/**
22452+ * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22453+ * @super: pointer to super block
22454+ *
22455+ * Allocates and initializes ktxnmgrd_context, attaches it to transaction
22456+ * manager. Starts kernel txnmgr daemon. This is called on mount.
22457+ */
22458+int reiser4_init_ktxnmgrd(struct super_block *super)
22459+{
22460+ txn_mgr *mgr;
22461+ ktxnmgrd_context *ctx;
22462+
22463+ mgr = &get_super_private(super)->tmgr;
22464+
22465+ assert("zam-1014", mgr->daemon == NULL);
22466+
22467+ ctx = kzalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get());
22468+ if (!ctx)
22469+ return RETERR(-ENOMEM);
22470+
22471+ assert("nikita-2442", ctx != NULL);
22472+
22473+ init_waitqueue_head(&ctx->wait);
22474+
22475+ /*kcond_init(&ctx->startup);*/
22476+ spin_lock_init(&ctx->guard);
22477+ ctx->timeout = REISER4_TXNMGR_TIMEOUT;
22478+ ctx->rescan = 1;
22479+ mgr->daemon = ctx;
22480+
22481+ ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22482+ if (IS_ERR(ctx->tsk)) {
22483+ int ret = PTR_ERR(ctx->tsk);
22484+ mgr->daemon = NULL;
22485+ kfree(ctx);
22486+ return RETERR(ret);
22487+ }
22488+ return 0;
22489+}
22490+
22491+void ktxnmgrd_kick(txn_mgr *mgr)
22492+{
22493+ assert("nikita-3234", mgr != NULL);
22494+ assert("nikita-3235", mgr->daemon != NULL);
22495+ wake_up(&mgr->daemon->wait);
22496+}
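+
+/* Typical use (a sketch, not from the original code): a thread that has
+ * just made an atom eligible for commit and wants it processed promptly
+ * wakes the daemon instead of committing synchronously:
+ *
+ *	ktxnmgrd_kick(&get_super_private(super)->tmgr);
+ */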
22497+
22498+int is_current_ktxnmgrd(void)
22499+{
22500+ return (get_current_super_private()->tmgr.daemon->tsk == current);
22501+}
22502+
22503+/**
22504+ * scan_mgr - commit atoms which are to be committed
22505+ * @super: super block to commit atoms of
22506+ *
22507+ * Commits old atoms.
22508+ */
22509+static int scan_mgr(struct super_block *super)
22510+{
22511+ int ret;
22512+ reiser4_context ctx;
22513+
22514+ init_stack_context(&ctx, super);
22515+
22516+ ret = commit_some_atoms(&get_super_private(super)->tmgr);
22517+
22518+ reiser4_exit_context(&ctx);
22519+ return ret;
22520+}
22521+
22522+/**
22523+ * reiser4_done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
22524+ * @super: super block whose transaction manager daemon is to be stopped
22525+ *
22526+ * This is called on umount. Stops ktxnmgrd and frees its context.
22527+ */
22528+void reiser4_done_ktxnmgrd(struct super_block *super)
22529+{
22530+ txn_mgr *mgr;
22531+
22532+ mgr = &get_super_private(super)->tmgr;
22533+ assert("zam-1012", mgr->daemon != NULL);
22534+
22535+ kthread_stop(mgr->daemon->tsk);
22536+ kfree(mgr->daemon);
22537+ mgr->daemon = NULL;
22538+}
22539+
22540+/*
22541+ * Local variables:
22542+ * c-indentation-style: "K&R"
22543+ * mode-name: "LC"
22544+ * c-basic-offset: 8
22545+ * tab-width: 8
22546+ * fill-column: 120
22547+ * End:
22548+ */
22549diff -urN linux-2.6.23.orig/fs/reiser4/ktxnmgrd.h linux-2.6.23/fs/reiser4/ktxnmgrd.h
22550--- linux-2.6.23.orig/fs/reiser4/ktxnmgrd.h 1970-01-01 03:00:00.000000000 +0300
22551+++ linux-2.6.23/fs/reiser4/ktxnmgrd.h 2007-12-04 16:49:30.000000000 +0300
22552@@ -0,0 +1,52 @@
22553+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22554+ * reiser4/README */
22555+
22556+/* Transaction manager daemon. See ktxnmgrd.c for comments. */
22557+
22558+#ifndef __KTXNMGRD_H__
22559+#define __KTXNMGRD_H__
22560+
22561+#include "txnmgr.h"
22562+
22563+#include <linux/fs.h>
22564+#include <linux/wait.h>
22565+#include <linux/completion.h>
22566+#include <linux/spinlock.h>
22567+#include <asm/atomic.h>
22568+#include <linux/sched.h> /* for struct task_struct */
22569+
22570+/* in this structure all data necessary to start up, shut down and communicate
22571+ * with ktxnmgrd are kept. */
22572+struct ktxnmgrd_context {
22573+ /* wait queue head on which ktxnmgrd sleeps */
22574+ wait_queue_head_t wait;
22575+ /* spin lock protecting all fields of this structure */
22576+ spinlock_t guard;
22577+ /* timeout of sleeping on ->wait */
22578+ signed long timeout;
22579+ /* kernel thread running ktxnmgrd */
22580+ struct task_struct *tsk;
22581+ /* list of all file systems served by this ktxnmgrd */
22582+ struct list_head queue;
22583+ /* should ktxnmgrd repeat scanning of atoms? */
22584+ unsigned int rescan:1;
22585+};
22586+
22587+extern int reiser4_init_ktxnmgrd(struct super_block *);
22588+extern void reiser4_done_ktxnmgrd(struct super_block *);
22589+
22590+extern void ktxnmgrd_kick(txn_mgr * mgr);
22591+extern int is_current_ktxnmgrd(void);
22592+
22593+/* __KTXNMGRD_H__ */
22594+#endif
22595+
22596+/* Make Linus happy.
22597+ Local variables:
22598+ c-indentation-style: "K&R"
22599+ mode-name: "LC"
22600+ c-basic-offset: 8
22601+ tab-width: 8
22602+ fill-column: 120
22603+ End:
22604+*/
22605diff -urN linux-2.6.23.orig/fs/reiser4/lock.c linux-2.6.23/fs/reiser4/lock.c
22606--- linux-2.6.23.orig/fs/reiser4/lock.c 1970-01-01 03:00:00.000000000 +0300
22607+++ linux-2.6.23/fs/reiser4/lock.c 2007-12-04 16:49:30.000000000 +0300
22608@@ -0,0 +1,1232 @@
22609+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22610+ * reiser4/README */
22611+
22612+/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
22613+ order. V4 balances the tree from the bottom up, and searches the tree from
22614+ the top down, and that is really the way we want it, so tradition won't work
22615+ for us.
22616+
22617+ Instead we have two lock orderings, a high priority lock ordering, and a low
22618+ priority lock ordering. Each node in the tree has a lock in its znode.
22619+
22620+ Suppose we have a set of processes which lock (R/W) tree nodes. Each process
22621+ has a set (maybe empty) of already locked nodes ("process locked set"). Each
22622+ process may have a pending lock request to a node locked by another process.
22623+ Note: we lock and unlock, but do not transfer locks: it is possible
22624+ transferring locks instead would save some bus locking....
22625+
22626+ Deadlock occurs when we have a loop constructed from process locked sets and
22627+ lock request vectors.
22628+
22629+ NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
22630+ memory is extended with "znodes" with which we connect nodes with their left
22631+ and right neighbors using sibling pointers stored in the znodes. When we
22632+ perform balancing operations we often go from left to right and from right to
22633+ left.
22634+
22635+ +-P1-+ +-P3-+
22636+ |+--+| V1 |+--+|
22637+ ||N1|| -------> ||N3||
22638+ |+--+| |+--+|
22639+ +----+ +----+
22640+ ^ |
22641+ |V2 |V3
22642+ | v
22643+ +---------P2---------+
22644+ |+--+ +--+|
22645+ ||N2| -------- |N4||
22646+ |+--+ +--+|
22647+ +--------------------+
22648+
22649+ We solve this by ensuring that only low priority processes lock in top to
22650+ bottom order and from right to left, and high priority processes lock from
22651+ bottom to top and left to right.
22652+
22653+ ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
22654+ kill those damn busy loops.
22655+ ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
22656+ stage) cannot be ordered that way. There are no rules what nodes can belong
22657+ to the atom and what nodes cannot. We cannot define what is right or left
22658+ direction, what is top or bottom. We can take immediate parent or side
22659+ neighbor of one node, but nobody guarantees that, say, left neighbor node is
22660+ not a far right neighbor for other nodes from the same atom. It breaks
22661+ deadlock avoidance rules and hi-low priority locking cannot be applied for
22662+ atom locks.
22663+
22664+   How does it help to avoid deadlocks?
22665+
22666+ Suppose we have a deadlock with n processes. Processes from one priority
22667+ class never deadlock because they take locks in one consistent
22668+ order.
22669+
22670+ So, any possible deadlock loop must have low priority as well as high
22671+ priority processes. There are no other lock priority levels except low and
22672+ high. We know that any deadlock loop contains at least one node locked by a
22673+ low priority process and requested by a high priority process. If this
22674+ situation is caught and resolved it is sufficient to avoid deadlocks.
22675+
22676+ V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
22677+
22678+ The deadlock prevention algorithm is based on comparing
22679+ priorities of node owners (processes which keep znode locked) and
22680+ requesters (processes which want to acquire a lock on znode). We
22681+ implement a scheme where low-priority owners yield locks to
22682+ high-priority requesters. We created a signal passing system that
22683+ is used to ask low-priority processes to yield one or more locked
22684+ znodes.
22685+
22686+ The condition when a znode needs to change its owners is described by the
22687+ following formula:
22688+
22689+ #############################################
22690+ # #
22691+ # (number of high-priority requesters) > 0 #
22692+ # AND #
22693+ # (numbers of high-priority owners) == 0 #
22694+ # #
22695+ #############################################
22696+
22697+ Note that a low-priority process delays node releasing if another
22698+ high-priority process owns this node. So, slightly more strictly speaking,
22699+ to have a deadlock capable cycle you must have a loop in which a high
22700+ priority process is waiting on a low priority process to yield a node, which
22701+ is slightly different from saying a high priority process is waiting on a
22702+ node owned by a low priority process.
22703+
22704+ It is enough to avoid deadlocks if we prevent any low-priority process from
22705+ falling asleep if its locked set contains a node which satisfies the
22706+ deadlock condition.
22707+
22708+ That condition is implicitly or explicitly checked in all places where new
22709+ high-priority requests may be added or removed from node request queue or
22710+ high-priority process takes or releases a lock on node. The main
22711+   goal of these checks is to never miss the moment when a node acquires "wrong
22712+   owners" and to send "must-yield-this-lock" signals to its low-pri owners
22713+   at that time.
22714+
22715+ The information about received signals is stored in the per-process
22716+ structure (lock stack) and analyzed before a low-priority process goes to
22717+ sleep but after a "fast" attempt to lock a node fails. Any signal wakes
22718+   the sleeping process up and forces it to re-check lock status and received
22719+ signal info. If "must-yield-this-lock" signals were received the locking
22720+ primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
22721+
22722+ V4 LOCKING DRAWBACKS
22723+
22724+ If we have already balanced on one level, and we are propagating our changes
22725+ upward to a higher level, it could be very messy to surrender all locks on
22726+ the lower level because we put so much computational work into it, and
22727+ reverting them to their state before they were locked might be very complex.
22728+ We also don't want to acquire all locks before performing balancing because
22729+ that would either be almost as much work as the balancing, or it would be
22730+ too conservative and lock too much. We want balancing to be done only at
22731+ high priority. Yet, we might want to go to the left one node and use some
22732+ of its empty space... So we make one attempt at getting the node to the left
22733+ using try_lock, and if it fails we do without it, because we didn't really
22734+ need it, it was only a nice to have.
22735+   need it; it was only a nice-to-have.
22736+ LOCK STRUCTURES DESCRIPTION
22737+
22738+ The following data structures are used in the reiser4 locking
22739+ implementation:
22740+
22741+ All fields related to long-term locking are stored in znode->lock.
22742+
22743+ The lock stack is a per thread object. It owns all znodes locked by the
22744+ thread. One znode may be locked by several threads in case of read lock or
22745+ one znode may be write locked by one thread several times. The special link
22746+ objects (lock handles) support n<->m relation between znodes and lock
22747+ owners.
22748+
22749+ <Thread 1> <Thread 2>
22750+
22751+ +---------+ +---------+
22752+ | LS1 | | LS2 |
22753+ +---------+ +---------+
22754+ ^ ^
22755+ |---------------+ +----------+
22756+ v v v v
22757+ +---------+ +---------+ +---------+ +---------+
22758+ | LH1 | | LH2 | | LH3 | | LH4 |
22759+ +---------+ +---------+ +---------+ +---------+
22760+ ^ ^ ^ ^
22761+ | +------------+ |
22762+ v v v
22763+ +---------+ +---------+ +---------+
22764+ | Z1 | | Z2 | | Z3 |
22765+ +---------+ +---------+ +---------+
22766+
22767+ Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
22768+ picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
22769+ LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode
22770+ Z1 is locked by only one thread, znode has only one lock handle LH1 on its
22771+ list, similar situation is for Z3 which is locked by the thread 2 only. Z2
22772+ is locked (for read) twice by different threads and two lock handles are on
22773+ its list. Each lock handle represents a single relation of a locking of a
22774+ znode by a thread. Locking of a znode is an establishing of a locking
22775+ relation between the lock stack and the znode by adding of a new lock handle
22776+ to a list of lock handles, the lock stack. The lock stack links all lock
22777+ handles for all znodes locked by the lock stack. The znode list groups all
22778+ lock handles for all locks stacks which locked the znode.
22779+
22780+   Yet another relation may exist between a znode and lock owners. If the lock
22781+   procedure cannot immediately take a lock on an object, it adds the lock owner
22782+   to a special `requestors' list belonging to the znode. That list represents a
22783+   queue of pending lock requests. Because one lock owner may request only
22784+   one lock object at a time, this is a 1->n relation between lock objects
22785+   and lock owners, implemented as described above. Full information
22786+ (priority, pointers to lock and link objects) about each lock request is
22787+ stored in lock owner structure in `request' field.
22788+
22789+ SHORT_TERM LOCKING
22790+
22791+ This is a list of primitive operations over lock stacks / lock handles /
22792+ znodes and locking descriptions for them.
22793+
22794+ 1. locking / unlocking which is done by two list insertion/deletion, one
22795+ to/from znode's list of lock handles, another one is to/from lock stack's
22796+ list of lock handles. The first insertion is protected by
22797+ znode->lock.guard spinlock. The list owned by the lock stack can be
22798+ modified only by thread who owns the lock stack and nobody else can
22799+ modify/read it. There is nothing to be protected by a spinlock or
22800+ something else.
22801+
22802+ 2. adding/removing a lock request to/from znode requesters list. The rule is
22803+ that znode->lock.guard spinlock should be taken for this.
22804+
22805+ 3. we can traverse list of lock handles and use references to lock stacks who
22806+ locked given znode if znode->lock.guard spinlock is taken.
22807+
22808+ 4. If a lock stack is associated with a znode as a lock requestor or lock
22809+      owner, its existence is guaranteed by the znode->lock.guard spinlock. Some of
22810+      its (the lock stack's) fields should be protected from being accessed in
22811+      parallel by two or more threads. Please look at the lock_stack structure
22812+      definition for info on how those fields are protected. */
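+
+/* The "wrong owners" formula from the comment above, rendered as code;
+   this is exactly what check_deadlock_condition() below evaluates under
+   node->lock.guard:
+
+	node->lock.nr_hipri_requests > 0 && node->lock.nr_hipri_owners == 0
+*/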
22813+
22814+/* Znode lock and capturing intertwining. */
22815+/* In current implementation we capture formatted nodes before locking
22816+   them. Take a look at longterm_lock_znode(): the reiser4_try_capture() request
22817+ precedes locking requests. The longterm_lock_znode function unconditionally
22818+ captures znode before even checking of locking conditions.
22819+
22820+ Another variant is to capture znode after locking it. It was not tested, but
22821+ at least one deadlock condition is supposed to be there. One thread has
22822+ locked a znode (Node-1) and calls reiser4_try_capture() for it.
22823+ reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state.
22824+ Second thread is a flushing thread, its current atom is the atom Node-1
22825+ belongs to. Second thread wants to lock Node-1 and sleeps because Node-1
22826+ is locked by the first thread. The described situation is a deadlock. */
22827+
22828+#include "debug.h"
22829+#include "txnmgr.h"
22830+#include "znode.h"
22831+#include "jnode.h"
22832+#include "tree.h"
22833+#include "plugin/node/node.h"
22834+#include "super.h"
22835+
22836+#include <linux/spinlock.h>
22837+
22838+#if REISER4_DEBUG
22839+static int request_is_deadlock_safe(znode *, znode_lock_mode,
22840+ znode_lock_request);
22841+#endif
22842+
22843+/* Returns a lock owner associated with current thread */
22844+lock_stack *get_current_lock_stack(void)
22845+{
22846+ return &get_current_context()->stack;
22847+}
22848+
22849+/* Wakes up all low priority owners informing them about possible deadlock */
22850+static void wake_up_all_lopri_owners(znode * node)
22851+{
22852+ lock_handle *handle;
22853+
22854+ assert_spin_locked(&(node->lock.guard));
22855+ list_for_each_entry(handle, &node->lock.owners, owners_link) {
22856+ assert("nikita-1832", handle->node == node);
22857+ /* count this signal in owner->nr_signaled */
22858+ if (!handle->signaled) {
22859+ handle->signaled = 1;
22860+ atomic_inc(&handle->owner->nr_signaled);
22861+ /* Wake up a single process */
22862+ reiser4_wake_up(handle->owner);
22863+ }
22864+ }
22865+}
22866+
22867+/* Adds a lock to a lock owner, which means creating a link to the lock and
22868+ putting the link into the two lists all links are on (the doubly linked list
22869+ that forms the lock_stack, and the doubly linked list of links attached
22870+   to a lock).
22871+*/
22872+static inline void
22873+link_object(lock_handle * handle, lock_stack * owner, znode * node)
22874+{
22875+ assert("jmacd-810", handle->owner == NULL);
22876+ assert_spin_locked(&(node->lock.guard));
22877+
22878+ handle->owner = owner;
22879+ handle->node = node;
22880+
22881+ assert("reiser4-4",
22882+ ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
22883+
22884+ /* add lock handle to the end of lock_stack's list of locks */
22885+ list_add_tail(&handle->locks_link, &owner->locks);
22886+ ON_DEBUG(owner->nr_locks++);
22887+ reiser4_ctx_gfp_mask_set();
22888+
22889+ /* add lock handle to the head of znode's list of owners */
22890+ list_add(&handle->owners_link, &node->lock.owners);
22891+ handle->signaled = 0;
22892+}
22893+
22894+/* Breaks a relation between a lock and its owner */
22895+static inline void unlink_object(lock_handle * handle)
22896+{
22897+ assert("zam-354", handle->owner != NULL);
22898+ assert("nikita-1608", handle->node != NULL);
22899+ assert_spin_locked(&(handle->node->lock.guard));
22900+ assert("nikita-1829", handle->owner == get_current_lock_stack());
22901+ assert("reiser4-5", handle->owner->nr_locks > 0);
22902+
22903+ /* remove lock handle from lock_stack's list of locks */
22904+ list_del(&handle->locks_link);
22905+ ON_DEBUG(handle->owner->nr_locks--);
22906+ reiser4_ctx_gfp_mask_set();
22907+ assert("reiser4-6",
22908+ ergo(list_empty_careful(&handle->owner->locks),
22909+ handle->owner->nr_locks == 0));
22910+ /* remove lock handle from znode's list of owners */
22911+ list_del(&handle->owners_link);
22912+ /* indicates that lock handle is free now */
22913+ handle->node = NULL;
22914+#if REISER4_DEBUG
22915+ INIT_LIST_HEAD(&handle->locks_link);
22916+ INIT_LIST_HEAD(&handle->owners_link);
22917+ handle->owner = NULL;
22918+#endif
22919+}
22920+
22921+/* Actually locks an object knowing that we are able to do this */
22922+static void lock_object(lock_stack * owner)
22923+{
22924+ struct lock_request *request;
22925+ znode *node;
22926+
22927+ request = &owner->request;
22928+ node = request->node;
22929+ assert_spin_locked(&(node->lock.guard));
22930+ if (request->mode == ZNODE_READ_LOCK) {
22931+ node->lock.nr_readers++;
22932+ } else {
22933+		/* check that we didn't switch from a read to a write lock */
22934+ assert("nikita-1840", node->lock.nr_readers <= 0);
22935+ /* We allow recursive locking; a node can be locked several
22936+ times for write by same process */
22937+ node->lock.nr_readers--;
22938+ }
22939+
22940+ link_object(request->handle, owner, node);
22941+
22942+ if (owner->curpri) {
22943+ node->lock.nr_hipri_owners++;
22944+ }
22945+}
22946+
22947+/* Check for recursive write locking */
22948+static int recursive(lock_stack * owner)
22949+{
22950+ int ret;
22951+ znode *node;
22952+ lock_handle *lh;
22953+
22954+ node = owner->request.node;
22955+
22956+ /* Owners list is not empty for a locked node */
22957+ assert("zam-314", !list_empty_careful(&node->lock.owners));
22958+ assert("nikita-1841", owner == get_current_lock_stack());
22959+ assert_spin_locked(&(node->lock.guard));
22960+
22961+ lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
22962+ ret = (lh->owner == owner);
22963+
22964+	/* Recursive read locking should be done the usual way */
22965+ assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
22966+ /* mixing of read/write locks is not allowed */
22967+ assert("zam-341", !ret || znode_is_wlocked(node));
22968+
22969+ return ret;
22970+}
22971+
22972+#if REISER4_DEBUG
22973+/* Returns true if the lock is held by the calling thread. */
22974+int znode_is_any_locked(const znode * node)
22975+{
22976+ lock_handle *handle;
22977+ lock_stack *stack;
22978+ int ret;
22979+
22980+ if (!znode_is_locked(node)) {
22981+ return 0;
22982+ }
22983+
22984+ stack = get_current_lock_stack();
22985+
22986+ spin_lock_stack(stack);
22987+
22988+ ret = 0;
22989+
22990+ list_for_each_entry(handle, &stack->locks, locks_link) {
22991+ if (handle->node == node) {
22992+ ret = 1;
22993+ break;
22994+ }
22995+ }
22996+
22997+ spin_unlock_stack(stack);
22998+
22999+ return ret;
23000+}
23001+
23002+#endif
23003+
23004+/* Returns true if a write lock is held by the calling thread. */
23005+int znode_is_write_locked(const znode * node)
23006+{
23007+ lock_stack *stack;
23008+ lock_handle *handle;
23009+
23010+ assert("jmacd-8765", node != NULL);
23011+
23012+ if (!znode_is_wlocked(node)) {
23013+ return 0;
23014+ }
23015+
23016+ stack = get_current_lock_stack();
23017+
23018+ /*
23019+ * When znode is write locked, all owner handles point to the same lock
23020+ * stack. Get pointer to lock stack from the first lock handle from
23021+ * znode's owner list
23022+ */
23023+ handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
23024+
23025+ return (handle->owner == stack);
23026+}
23027+
23028+/* This "deadlock" condition is the essential part of reiser4 locking
23029+ implementation. This condition is checked explicitly by calling
23030+ check_deadlock_condition() or implicitly in all places where znode lock
23031+ state (set of owners and request queue) is changed. Locking code is
23032+ designed to use this condition to trigger procedure of passing object from
23033+ low priority owner(s) to high priority one(s).
23034+
23035+ The procedure results in passing an event (setting lock_handle->signaled
23036+ flag) and counting this event in nr_signaled field of owner's lock stack
23037+ object and wakeup owner's process.
23038+*/
23039+static inline int check_deadlock_condition(znode * node)
23040+{
23041+ assert_spin_locked(&(node->lock.guard));
23042+ return node->lock.nr_hipri_requests > 0
23043+ && node->lock.nr_hipri_owners == 0;
23044+}
23045+
23046+static int check_livelock_condition(znode * node, znode_lock_mode mode)
23047+{
23048+ zlock * lock = &node->lock;
23049+
23050+ return mode == ZNODE_READ_LOCK &&
23051+ lock->nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
23052+}
23053+
23054+/* checks lock/request compatibility */
23055+static int can_lock_object(lock_stack * owner)
23056+{
23057+ znode *node = owner->request.node;
23058+
23059+ assert_spin_locked(&(node->lock.guard));
23060+
23061+ /* See if the node is disconnected. */
23062+ if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
23063+ return RETERR(-EINVAL);
23064+
23065+ /* Do not ever try to take a lock if we are going in the low priority
23066+ direction and the node has a high priority request without high
23067+ priority owners. */
23068+ if (unlikely(!owner->curpri && check_deadlock_condition(node)))
23069+ return RETERR(-E_REPEAT);
23070+ if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode)))
23071+ return RETERR(-E_REPEAT);
23072+ if (unlikely(!is_lock_compatible(node, owner->request.mode)))
23073+ return RETERR(-E_REPEAT);
23074+ return 0;
23075+}
23076+
23077+/* Sets high priority for the process. It clears the "signaled" flags,
23078+ because a znode locked by a high-priority process can't satisfy our
23079+ "deadlock condition". */
23080+static void set_high_priority(lock_stack * owner)
23081+{
23082+ assert("nikita-1846", owner == get_current_lock_stack());
23083+ /* Do nothing if current priority is already high */
23084+ if (!owner->curpri) {
23085+ /* We don't need locking for owner->locks list, because, this
23086+ * function is only called with the lock stack of the current
23087+ * thread, and no other thread can play with owner->locks list
23088+ * and/or change ->node pointers of lock handles in this list.
23089+ *
23090+ * (Interrupts also are not involved.)
23091+ */
23092+ lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link);
23093+ while (&owner->locks != &item->locks_link) {
23094+ znode *node = item->node;
23095+
23096+ spin_lock_zlock(&node->lock);
23097+
23098+ node->lock.nr_hipri_owners++;
23099+
23100+ /* we can safely set signaled to zero, because the
23101+ previous statement (nr_hipri_owners++) guarantees
23102+ that signaled will never be set again. */
23103+ item->signaled = 0;
23104+ spin_unlock_zlock(&node->lock);
23105+
23106+ item = list_entry(item->locks_link.next, lock_handle, locks_link);
23107+ }
23108+ owner->curpri = 1;
23109+ atomic_set(&owner->nr_signaled, 0);
23110+ }
23111+}
23112+
23113+/* Sets low priority for the process. */
23114+static void set_low_priority(lock_stack * owner)
23115+{
23116+ assert("nikita-3075", owner == get_current_lock_stack());
23117+ /* Do nothing if current priority is already low */
23118+ if (owner->curpri) {
23119+ /* scan all locks (lock handles) held by @owner, which is
23120+ actually the current thread, and check whether we are creating
23121+ a deadlock possibility anywhere.
23122+ */
23123+ lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link);
23124+ while (&owner->locks != &handle->locks_link) {
23125+ znode *node = handle->node;
23126+ spin_lock_zlock(&node->lock);
23127+ /* this thread just was hipri owner of @node, so
23128+ nr_hipri_owners has to be greater than zero. */
23129+ assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23130+ node->lock.nr_hipri_owners--;
23131+ /* If we have the deadlock condition, adjust the nr_signaled
23132+ field. It is enough to set the "signaled" flag only for
23133+ the current process; other low-pri owners will be
23134+ signaled and woken up after the current process unlocks
23135+ this object and any high-priority requestor takes
23136+ control. */
23137+ if (check_deadlock_condition(node)
23138+ && !handle->signaled) {
23139+ handle->signaled = 1;
23140+ atomic_inc(&owner->nr_signaled);
23141+ }
23142+ spin_unlock_zlock(&node->lock);
23143+ handle = list_entry(handle->locks_link.next, lock_handle, locks_link);
23144+ }
23145+ owner->curpri = 0;
23146+ }
23147+}
23148+
23149+static void remove_lock_request(lock_stack * requestor)
23150+{
23151+ zlock * lock = &requestor->request.node->lock;
23152+
23153+ if (requestor->curpri) {
23154+ assert("nikita-1838", lock->nr_hipri_requests > 0);
23155+ lock->nr_hipri_requests--;
23156+ if (requestor->request.mode == ZNODE_WRITE_LOCK)
23157+ lock->nr_hipri_write_requests--;
23158+ }
23159+ list_del(&requestor->requestors_link);
23160+}
23161+
23162+static void invalidate_all_lock_requests(znode * node)
23163+{
23164+ lock_stack *requestor, *tmp;
23165+
23166+ assert_spin_locked(&(node->lock.guard));
23167+
23168+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23169+ remove_lock_request(requestor);
23170+ requestor->request.ret_code = -EINVAL;
23171+ reiser4_wake_up(requestor);
23172+ requestor->request.mode = ZNODE_NO_LOCK;
23173+ }
23174+}
23175+
23176+static void dispatch_lock_requests(znode * node)
23177+{
23178+ lock_stack *requestor, *tmp;
23179+
23180+ assert_spin_locked(&(node->lock.guard));
23181+
23182+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23183+ if (znode_is_write_locked(node))
23184+ break;
23185+ if (!can_lock_object(requestor)) {
23186+ lock_object(requestor);
23187+ remove_lock_request(requestor);
23188+ requestor->request.ret_code = 0;
23189+ reiser4_wake_up(requestor);
23190+ requestor->request.mode = ZNODE_NO_LOCK;
23191+ }
23192+ }
23193+}
23194+
23195+/* release long-term lock, acquired by longterm_lock_znode() */
23196+void longterm_unlock_znode(lock_handle * handle)
23197+{
23198+ znode *node = handle->node;
23199+ lock_stack *oldowner = handle->owner;
23200+ int hipri;
23201+ int readers;
23202+ int rdelta;
23203+ int youdie;
23204+
23205+ /*
23206+ * this is time-critical and highly optimized code. Modify carefully.
23207+ */
23208+
23209+ assert("jmacd-1021", handle != NULL);
23210+ assert("jmacd-1022", handle->owner != NULL);
23211+ assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23212+
23213+ assert("zam-130", oldowner == get_current_lock_stack());
23214+
23215+ LOCK_CNT_DEC(long_term_locked_znode);
23216+
23217+ /*
23218+ * to minimize amount of operations performed under lock, pre-compute
23219+ * all variables used within critical section. This makes code
23220+ * obscure.
23221+ */
23222+
23223+ /* was this lock of hi or lo priority */
23224+ hipri = oldowner->curpri ? 1 : 0;
23225+ /* number of readers */
23226+ readers = node->lock.nr_readers;
23227+ /* +1 if write lock, -1 if read lock */
23228+ rdelta = (readers > 0) ? -1 : +1;
23229+ /* true if node is to die and write lock is released */
23230+ youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23231+
23232+ spin_lock_zlock(&node->lock);
23233+
23234+ assert("zam-101", znode_is_locked(node));
23235+
23236+ /* Adjust a number of high priority owners of this lock */
23237+ assert("nikita-1836", node->lock.nr_hipri_owners >= hipri);
23238+ node->lock.nr_hipri_owners -= hipri;
23239+
23240+ /* Handle znode deallocation on last write-lock release. */
23241+ if (znode_is_wlocked_once(node)) {
23242+ if (youdie) {
23243+ forget_znode(handle);
23244+ assert("nikita-2191", znode_invariant(node));
23245+ zput(node);
23246+ return;
23247+ }
23248+ }
23249+
23250+ if (handle->signaled)
23251+ atomic_dec(&oldowner->nr_signaled);
23252+
23253+ /* Unlocking means owner<->object link deletion */
23254+ unlink_object(handle);
23255+
23256+ /* This is enough to determine whether the object is now completely
23257+ unlocked. */
23258+ node->lock.nr_readers += rdelta;
23259+
23260+ /* If the node is locked it must have an owners list. Likewise, if
23261+ the node is unlocked it must have an empty owners list. */
23262+ assert("zam-319", equi(znode_is_locked(node),
23263+ !list_empty_careful(&node->lock.owners)));
23264+
23265+#if REISER4_DEBUG
23266+ if (!znode_is_locked(node))
23267+ ++node->times_locked;
23268+#endif
23269+
23270+ /* If there are pending lock requests we wake up a requestor */
23271+ if (!znode_is_wlocked(node))
23272+ dispatch_lock_requests(node);
23273+ if (check_deadlock_condition(node))
23274+ wake_up_all_lopri_owners(node);
23275+ spin_unlock_zlock(&node->lock);
23276+
23277+ /* minus one reference from handle->node */
23278+ assert("nikita-2190", znode_invariant(node));
23279+ ON_DEBUG(check_lock_data());
23280+ ON_DEBUG(check_lock_node_data(node));
23281+ zput(node);
23282+}
23283+
23284+/* final portion of longterm-lock */
23285+static int
23286+lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23287+{
23288+ znode *node = owner->request.node;
23289+
23290+ assert_spin_locked(&(node->lock.guard));
23291+
23292+ /* If we broke out with (ok == 0) it means we can lock, so do it now. */
23293+ if (ok == 0) {
23294+ lock_object(owner);
23295+ owner->request.mode = 0;
23296+ /* count a reference from lock_handle->node
23297+
23298+ znode was already referenced at the entry to this function,
23299+ hence taking spin-lock here is not necessary (see comment
23300+ in the zref()).
23301+ */
23302+ zref(node);
23303+
23304+ LOCK_CNT_INC(long_term_locked_znode);
23305+ }
23306+ spin_unlock_zlock(&node->lock);
23307+ ON_DEBUG(check_lock_data());
23308+ ON_DEBUG(check_lock_node_data(node));
23309+ return ok;
23310+}
23311+
23312+/*
23313+ * version of longterm_lock_znode() optimized for the most common case: read
23314+ * lock without any special flags. This is the kind of lock that any tree
23315+ * traversal takes on the root node of the tree, which is very frequent.
23316+ */
23317+static int longterm_lock_tryfast(lock_stack * owner)
23318+{
23319+ int result;
23320+ znode *node;
23321+ zlock *lock;
23322+
23323+ node = owner->request.node;
23324+ lock = &node->lock;
23325+
23326+ assert("nikita-3340", reiser4_schedulable());
23327+ assert("nikita-3341", request_is_deadlock_safe(node,
23328+ ZNODE_READ_LOCK,
23329+ ZNODE_LOCK_LOPRI));
23330+ spin_lock_zlock(lock);
23331+ result = can_lock_object(owner);
23332+ spin_unlock_zlock(lock);
23333+
23334+ if (likely(result != -EINVAL)) {
23335+ spin_lock_znode(node);
23336+ result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23337+ spin_unlock_znode(node);
23338+ spin_lock_zlock(lock);
23339+ if (unlikely(result != 0)) {
23340+ owner->request.mode = 0;
23341+ } else {
23342+ result = can_lock_object(owner);
23343+ if (unlikely(result == -E_REPEAT)) {
23344+ /* fall back to longterm_lock_znode() */
23345+ spin_unlock_zlock(lock);
23346+ return 1;
23347+ }
23348+ }
23349+ return lock_tail(owner, result, ZNODE_READ_LOCK);
23350+ } else
23351+ return 1;
23352+}
23353+
23354+/* locks the given lock object */
23355+int longterm_lock_znode(
23356+ /* local link object (allocated by lock owner thread, usually on its own
23357+ * stack) */
23358+ lock_handle * handle,
23359+ /* znode we want to lock. */
23360+ znode * node,
23361+ /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23362+ znode_lock_mode mode,
23363+ /* {0, -EINVAL, -E_DEADLOCK}, see return codes description. */
23364+ znode_lock_request request) {
23365+ int ret;
23366+ int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23367+ int non_blocking = 0;
23368+ int has_atom;
23369+ txn_capture cap_flags;
23370+ zlock *lock;
23371+ txn_handle *txnh;
23372+ tree_level level;
23373+
23374+ /* Get current process context */
23375+ lock_stack *owner = get_current_lock_stack();
23376+
23377+ /* Check that the lock handle is initialized and isn't already being
23378+ * used. */
23379+ assert("jmacd-808", handle->owner == NULL);
23380+ assert("nikita-3026", reiser4_schedulable());
23381+ assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23382+ assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23383+ /* long term locks are not allowed in the VM contexts (->writepage(),
23384+ * prune_{d,i}cache()).
23385+ *
23386+ * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23387+ * bug caused by d_splice_alias() only working for directories.
23388+ */
23389+ assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23390+ assert ("zam-1055", mode != ZNODE_NO_LOCK);
23391+
23392+ cap_flags = 0;
23393+ if (request & ZNODE_LOCK_NONBLOCK) {
23394+ cap_flags |= TXN_CAPTURE_NONBLOCKING;
23395+ non_blocking = 1;
23396+ }
23397+
23398+ if (request & ZNODE_LOCK_DONT_FUSE)
23399+ cap_flags |= TXN_CAPTURE_DONT_FUSE;
23400+
23401+ /* If we are changing our process priority we must adjust the number
23402+ of high priority owners for each znode that we already hold locked */
23403+ if (hipri) {
23404+ set_high_priority(owner);
23405+ } else {
23406+ set_low_priority(owner);
23407+ }
23408+
23409+ level = znode_get_level(node);
23410+
23411+ /* Fill request structure with our values. */
23412+ owner->request.mode = mode;
23413+ owner->request.handle = handle;
23414+ owner->request.node = node;
23415+
23416+ txnh = get_current_context()->trans;
23417+ lock = &node->lock;
23418+
23419+ if (mode == ZNODE_READ_LOCK && request == 0) {
23420+ ret = longterm_lock_tryfast(owner);
23421+ if (ret <= 0)
23422+ return ret;
23423+ }
23424+
23425+ has_atom = (txnh->atom != NULL);
23426+
23427+ /* Synchronize on node's zlock guard lock. */
23428+ spin_lock_zlock(lock);
23429+
23430+ if (znode_is_locked(node) &&
23431+ mode == ZNODE_WRITE_LOCK && recursive(owner))
23432+ return lock_tail(owner, 0, mode);
23433+
23434+ for (;;) {
23435+ /* Check the lock's availability: if it is unavailable we get
23436+ -E_REPEAT, 0 indicates "can lock", otherwise the node is
23437+ invalid. */
23438+ ret = can_lock_object(owner);
23439+
23440+ if (unlikely(ret == -EINVAL)) {
23441+ /* @node is dying. Leave it alone. */
23442+ break;
23443+ }
23444+
23445+ if (unlikely(ret == -E_REPEAT && non_blocking)) {
23446+ /* either locking of @node by the current thread will
23447+ * lead to a deadlock, or the lock modes are
23448+ * incompatible. */
23449+ break;
23450+ }
23451+
23452+ assert("nikita-1844", (ret == 0)
23453+ || ((ret == -E_REPEAT) && !non_blocking));
23454+ /* If we can get the lock... Try to capture first before
23455+ taking the lock. */
23456+
23457+ /* first handle commonest case where node and txnh are already
23458+ * in the same atom. */
23459+ /* safe to do without taking locks, because:
23460+ *
23461+ * 1. read of aligned word is atomic with respect to writes to
23462+ * this word
23463+ *
23464+ * 2. false negatives are handled in reiser4_try_capture().
23465+ *
23466+ * 3. false positives are impossible.
23467+ *
23468+ * PROOF: left as an exercise to the curious reader.
23469+ *
23470+ * Just kidding. Here is one:
23471+ *
23472+ * At the time T0 txnh->atom is stored in txnh_atom.
23473+ *
23474+ * At the time T1 node->atom is stored in node_atom.
23475+ *
23476+ * At the time T2 we observe that
23477+ *
23478+ * txnh_atom != NULL && node_atom == txnh_atom.
23479+ *
23480+ * Imagine that at this moment we acquire node and txnh spin
23481+ * lock in this order. Suppose that under spin lock we have
23482+ *
23483+ * node->atom != txnh->atom, (S1)
23484+ *
23485+ * at the time T3.
23486+ *
23487+ * txnh->atom != NULL still, because txnh is open by the
23488+ * current thread.
23489+ *
23490+ * Suppose node->atom == NULL, that is, node was un-captured
23491+ * between T1, and T3. But un-capturing of formatted node is
23492+ * always preceded by the call to reiser4_invalidate_lock(),
23493+ * which marks znode as JNODE_IS_DYING under zlock spin
23494+ * lock. Contradiction, because can_lock_object() above checks
23495+ * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23496+ *
23497+ * Suppose that node->atom != node_atom, that is, atom, node
23498+ * belongs to was fused into another atom: node_atom was fused
23499+ * into node->atom. Atom of txnh was equal to node_atom at T2,
23500+ * which means that under spin lock, txnh->atom == node->atom,
23501+ * because txnh->atom can only follow fusion
23502+ * chain. Contradicts S1.
23503+ *
23504+ * The same for hypothesis txnh->atom != txnh_atom. Hence,
23505+ * node->atom == node_atom == txnh_atom == txnh->atom. Again
23506+ * contradicts S1. Hence S1 is false. QED.
23507+ *
23508+ */
23509+
23510+ if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23511+ ;
23512+ } else {
23513+ /*
23514+ * unlock zlock spin lock here. It is possible for
23515+ * longterm_unlock_znode() to sneak in here, but there
23516+ * is no harm: reiser4_invalidate_lock() will mark znode
23517+ * as JNODE_IS_DYING and this will be noted by
23518+ * can_lock_object() below.
23519+ */
23520+ spin_unlock_zlock(lock);
23521+ spin_lock_znode(node);
23522+ ret = reiser4_try_capture(ZJNODE(node), mode, cap_flags);
23523+ spin_unlock_znode(node);
23524+ spin_lock_zlock(lock);
23525+ if (unlikely(ret != 0)) {
23526+ /* In the failure case, the txnmgr releases
23527+ the znode's lock (or in some cases, it was
23528+ released a while ago). There's no need to
23529+ reacquire it, so we should return here and
23530+ avoid releasing the lock. */
23531+ owner->request.mode = 0;
23532+ break;
23533+ }
23534+
23535+ /* Check the lock's availability again -- this is
23536+ because under some circumstances the capture code
23537+ has to release and reacquire the znode spinlock. */
23538+ ret = can_lock_object(owner);
23539+ }
23540+
23541+ /* This time, a return of (ret == 0) means we can lock, so we
23542+ should break out of the loop. */
23543+ if (likely(ret != -E_REPEAT || non_blocking))
23544+ break;
23545+
23546+ /* Lock is unavailable, we have to wait. */
23547+ ret = reiser4_prepare_to_sleep(owner);
23548+ if (unlikely(ret != 0))
23549+ break;
23550+
23551+ assert_spin_locked(&(node->lock.guard));
23552+ if (hipri) {
23553+ /* If we are going in the high priority direction then
23554+ increase the high priority requests counter for the
23555+ node */
23556+ lock->nr_hipri_requests++;
23557+ if (mode == ZNODE_WRITE_LOCK)
23558+ lock->nr_hipri_write_requests++;
23559+ /* If there are no high priority owners for a node,
23560+ then immediately wake up low priority owners, so
23561+ they can detect possible deadlock */
23562+ if (lock->nr_hipri_owners == 0)
23563+ wake_up_all_lopri_owners(node);
23564+ }
23565+ list_add_tail(&owner->requestors_link, &lock->requestors);
23566+
23567+ /* Ok, here we have prepared a lock request, so unlock
23568+ a znode ... */
23569+ spin_unlock_zlock(lock);
23570+ /* ... and sleep */
23571+ reiser4_go_to_sleep(owner);
23572+ if (owner->request.mode == ZNODE_NO_LOCK)
23573+ goto request_is_done;
23574+ spin_lock_zlock(lock);
23575+ if (owner->request.mode == ZNODE_NO_LOCK) {
23576+ spin_unlock_zlock(lock);
23577+ request_is_done:
23578+ if (owner->request.ret_code == 0) {
23579+ LOCK_CNT_INC(long_term_locked_znode);
23580+ zref(node);
23581+ }
23582+ return owner->request.ret_code;
23583+ }
23584+ remove_lock_request(owner);
23585+ }
23586+
23587+ return lock_tail(owner, ret, mode);
23588+}
23589+
23590+/* lock object invalidation means changing the lock object state to `INVALID'
23591+ and waiting for all other processes to cancel their lock requests. */
23592+void reiser4_invalidate_lock(lock_handle * handle /* path to lock
23593+ * owner and lock
23594+ * object is being
23595+ * invalidated. */ )
23596+{
23597+ znode *node = handle->node;
23598+ lock_stack *owner = handle->owner;
23599+
23600+ assert("zam-325", owner == get_current_lock_stack());
23601+ assert("zam-103", znode_is_write_locked(node));
23602+ assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
23603+ assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
23604+ assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
23605+ assert("nikita-3097", znode_is_wlocked_once(node));
23606+ assert_spin_locked(&(node->lock.guard));
23607+
23608+ if (handle->signaled)
23609+ atomic_dec(&owner->nr_signaled);
23610+
23611+ ZF_SET(node, JNODE_IS_DYING);
23612+ unlink_object(handle);
23613+ node->lock.nr_readers = 0;
23614+
23615+ invalidate_all_lock_requests(node);
23616+ spin_unlock_zlock(&node->lock);
23617+}
23618+
23619+/* Initializes lock_stack. */
23620+void init_lock_stack(lock_stack * owner /* pointer to
23621+ * allocated
23622+ * structure. */ )
23623+{
23624+ INIT_LIST_HEAD(&owner->locks);
23625+ INIT_LIST_HEAD(&owner->requestors_link);
23626+ spin_lock_init(&owner->sguard);
23627+ owner->curpri = 1;
23628+ init_waitqueue_head(&owner->wait);
23629+}
23630+
23631+/* Initializes lock object. */
23632+void reiser4_init_lock(zlock * lock /* pointer on allocated
23633+ * uninitialized lock object
23634+ * structure. */ )
23635+{
23636+ memset(lock, 0, sizeof(zlock));
23637+ spin_lock_init(&lock->guard);
23638+ INIT_LIST_HEAD(&lock->requestors);
23639+ INIT_LIST_HEAD(&lock->owners);
23640+}
23641+
23642+/* Transfer a lock handle (presumably so that variables can be moved between stack and
23643+ heap locations). */
23644+static void
23645+move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
23646+{
23647+ znode *node = old->node;
23648+ lock_stack *owner = old->owner;
23649+ int signaled;
23650+
23651+ /* locks_list, modified by link_object() is not protected by
23652+ anything. This is valid because only current thread ever modifies
23653+ locks_list of its lock_stack.
23654+ */
23655+ assert("nikita-1827", owner == get_current_lock_stack());
23656+ assert("nikita-1831", new->owner == NULL);
23657+
23658+ spin_lock_zlock(&node->lock);
23659+
23660+ signaled = old->signaled;
23661+ if (unlink_old) {
23662+ unlink_object(old);
23663+ } else {
23664+ if (node->lock.nr_readers > 0) {
23665+ node->lock.nr_readers += 1;
23666+ } else {
23667+ node->lock.nr_readers -= 1;
23668+ }
23669+ if (signaled) {
23670+ atomic_inc(&owner->nr_signaled);
23671+ }
23672+ if (owner->curpri) {
23673+ node->lock.nr_hipri_owners += 1;
23674+ }
23675+ LOCK_CNT_INC(long_term_locked_znode);
23676+
23677+ zref(node);
23678+ }
23679+ link_object(new, owner, node);
23680+ new->signaled = signaled;
23681+
23682+ spin_unlock_zlock(&node->lock);
23683+}
23684+
23685+void move_lh(lock_handle * new, lock_handle * old)
23686+{
23687+ move_lh_internal(new, old, /*unlink_old */ 1);
23688+}
23689+
23690+void copy_lh(lock_handle * new, lock_handle * old)
23691+{
23692+ move_lh_internal(new, old, /*unlink_old */ 0);
23693+}
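+/* Note: after copy_lh() both handles refer to the same locked znode; the
+ copy takes an extra znode reference (zref) and lock count, so each handle
+ must eventually be released with done_lh(). A minimal sketch:
+
+ copy_lh(&new, &old);
+ ... both handles are now valid ...
+ done_lh(&old);
+ done_lh(&new);
+*/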
23694+
23695+/* after getting -E_DEADLOCK we unlock znodes until this function returns false */
23696+int reiser4_check_deadlock(void)
23697+{
23698+ lock_stack *owner = get_current_lock_stack();
23699+ return atomic_read(&owner->nr_signaled) != 0;
23700+}
23701+
23702+/* Before going to sleep we re-check "release lock" requests which might come
23703+ from high-priority threads. */
23704+int reiser4_prepare_to_sleep(lock_stack * owner)
23705+{
23706+ assert("nikita-1847", owner == get_current_lock_stack());
23707+
23708+ /* We return -E_DEADLOCK if one or more "give me the lock" messages are
23709+ * counted in nr_signaled */
23710+ if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
23711+ assert("zam-959", !owner->curpri);
23712+ return RETERR(-E_DEADLOCK);
23713+ }
23714+ return 0;
23715+}
23716+
23717+/* Wakes up a single thread */
23718+void __reiser4_wake_up(lock_stack * owner)
23719+{
23720+ atomic_set(&owner->wakeup, 1);
23721+ wake_up(&owner->wait);
23722+}
23723+
23724+/* Puts a thread to sleep */
23725+void reiser4_go_to_sleep(lock_stack * owner)
23726+{
23727+ /* Well, we might sleep here, so holding of any spinlocks is no-no */
23728+ assert("nikita-3027", reiser4_schedulable());
23729+
23730+ wait_event(owner->wait, atomic_read(&owner->wakeup));
23731+ atomic_set(&owner->wakeup, 0);
23732+}
23733+
23734+int lock_stack_isclean(lock_stack * owner)
23735+{
23736+ if (list_empty_careful(&owner->locks)) {
23737+ assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
23738+ return 1;
23739+ }
23740+
23741+ return 0;
23742+}
23743+
23744+#if REISER4_DEBUG
23745+
23746+/*
23747+ * debugging functions
23748+ */
23749+
23750+static void list_check(struct list_head *head)
23751+{
23752+ struct list_head *pos;
23753+
23754+ list_for_each(pos, head)
23755+ assert("", (pos->prev != NULL && pos->next != NULL &&
23756+ pos->prev->next == pos && pos->next->prev == pos));
23757+}
23758+
23759+/* check consistency of locking data-structures hanging off the @stack */
23760+static void check_lock_stack(lock_stack * stack)
23761+{
23762+ spin_lock_stack(stack);
23763+ /* check that stack->locks is not corrupted */
23764+ list_check(&stack->locks);
23765+ spin_unlock_stack(stack);
23766+}
23767+
23768+/* check consistency of locking data structures */
23769+void check_lock_data(void)
23770+{
23771+ check_lock_stack(&get_current_context()->stack);
23772+}
23773+
23774+/* check consistency of locking data structures for @node */
23775+void check_lock_node_data(znode * node)
23776+{
23777+ spin_lock_zlock(&node->lock);
23778+ list_check(&node->lock.owners);
23779+ list_check(&node->lock.requestors);
23780+ spin_unlock_zlock(&node->lock);
23781+}
23782+
23783+/* check that given lock request is deadlock safe. This check is, of course,
23784+ * not exhaustive. */
23785+static int
23786+request_is_deadlock_safe(znode * node, znode_lock_mode mode,
23787+ znode_lock_request request)
23788+{
23789+ lock_stack *owner;
23790+
23791+ owner = get_current_lock_stack();
23792+ /*
23793+ * check that hipri lock request is not issued when there are locked
23794+ * nodes at the higher levels.
23795+ */
23796+ if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
23797+ znode_get_level(node) != 0) {
23798+ lock_handle *item;
23799+
23800+ list_for_each_entry(item, &owner->locks, locks_link) {
23801+ znode *other;
23802+
23803+ other = item->node;
23804+
23805+ if (znode_get_level(other) == 0)
23806+ continue;
23807+ if (znode_get_level(other) > znode_get_level(node))
23808+ return 0;
23809+ }
23810+ }
23811+ return 1;
23812+}
23813+
23814+#endif
23815+
23816+/* return pointer to static storage with name of lock_mode. For
23817+ debugging */
23818+const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ )
23819+{
23820+ if (lock == ZNODE_READ_LOCK)
23821+ return "read";
23822+ else if (lock == ZNODE_WRITE_LOCK)
23823+ return "write";
23824+ else {
23825+ static char buf[30];
23826+
23827+ sprintf(buf, "unknown: %i", lock);
23828+ return buf;
23829+ }
23830+}
23831+
23832+/* Make Linus happy.
23833+ Local variables:
23834+ c-indentation-style: "K&R"
23835+ mode-name: "LC"
23836+ c-basic-offset: 8
23837+ tab-width: 8
23838+ fill-column: 79
23839+ End:
23840+*/
23841diff -urN linux-2.6.23.orig/fs/reiser4/lock.h linux-2.6.23/fs/reiser4/lock.h
23842--- linux-2.6.23.orig/fs/reiser4/lock.h 1970-01-01 03:00:00.000000000 +0300
23843+++ linux-2.6.23/fs/reiser4/lock.h 2007-12-04 16:49:30.000000000 +0300
23844@@ -0,0 +1,249 @@
23845+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
23846+
23847+/* Long term locking data structures. See lock.c for details. */
23848+
23849+#ifndef __LOCK_H__
23850+#define __LOCK_H__
23851+
23852+#include "forward.h"
23853+#include "debug.h"
23854+#include "dformat.h"
23855+#include "key.h"
23856+#include "coord.h"
23857+#include "plugin/node/node.h"
23858+#include "txnmgr.h"
23859+#include "readahead.h"
23860+
23861+#include <linux/types.h>
23862+#include <linux/spinlock.h>
23863+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
23864+#include <asm/atomic.h>
23865+#include <linux/wait.h>
23866+
23867+/* Per-znode lock object */
23868+struct zlock {
23869+ spinlock_t guard;
23870+ /* The number of readers if positive; the number of recursively taken
23871+ write locks if negative. Protected by zlock spin lock. */
23872+ int nr_readers;
23873+ /* A number of processes (lock_stacks) that have this object
23874+ locked with high priority */
23875+ unsigned nr_hipri_owners;
23876+ /* A number of attempts to lock znode in high priority direction */
23877+ unsigned nr_hipri_requests;
23878+ /* A number of write lock requests in the high priority direction */
23879+ unsigned nr_hipri_write_requests;
23880+ /* A list of lock_handles of all lock_stacks that have this object locked */
23881+ struct list_head owners;
23882+ /* A linked list of lock_stacks that wait for this lock */
23883+ struct list_head requestors;
23884+};
23885+
23886+static inline void spin_lock_zlock(zlock *lock)
23887+{
23888+ /* check that zlock is not locked */
23889+ assert("", LOCK_CNT_NIL(spin_locked_zlock));
23890+ /* check that spinlocks of lower priorities are not held */
23891+ assert("", LOCK_CNT_NIL(spin_locked_stack));
23892+
23893+ spin_lock(&lock->guard);
23894+
23895+ LOCK_CNT_INC(spin_locked_zlock);
23896+ LOCK_CNT_INC(spin_locked);
23897+}
23898+
23899+static inline void spin_unlock_zlock(zlock *lock)
23900+{
23901+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
23902+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
23903+
23904+ LOCK_CNT_DEC(spin_locked_zlock);
23905+ LOCK_CNT_DEC(spin_locked);
23906+
23907+ spin_unlock(&lock->guard);
23908+}
23909+
23910+#define lock_is_locked(lock) ((lock)->nr_readers != 0)
23911+#define lock_is_rlocked(lock) ((lock)->nr_readers > 0)
23912+#define lock_is_wlocked(lock) ((lock)->nr_readers < 0)
23913+#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1)
23914+#define lock_can_be_rlocked(lock) ((lock)->nr_readers >= 0)
23915+#define lock_mode_compatible(lock, mode) \
23916+ (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
23917+ ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
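+/* A minimal illustration of the nr_readers sign convention encoded by the
+ macros above (a sketch, not a real caller): nr_readers == 3 means three
+ read lockers, nr_readers == -2 means one thread holds the write lock
+ recursively twice, nr_readers == 0 means the lock is free. For example:
+
+ zlock l;
+
+ reiser4_init_lock(&l);
+ l.nr_readers = -1;
+ assert("", lock_is_wlocked(&l) && lock_is_wlocked_once(&l));
+ assert("", !lock_can_be_rlocked(&l));
+ assert("", !lock_mode_compatible(&l, ZNODE_WRITE_LOCK));
+*/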
23918+
23919+/* Since we have R/W znode locks we need additional bidirectional `link'
23920+ objects to implement n<->m relationship between lock owners and lock
23921+ objects. We call them `lock handles'.
23922+
23923+ Locking: see lock.c/"SHORT-TERM LOCKING"
23924+*/
23925+struct lock_handle {
23926+ /* This flag indicates that a signal to yield a lock was passed to
23927+ the lock owner and counted in owner->nr_signaled
23928+
23929+ Locking: this is accessed under spin lock on ->node.
23930+ */
23931+ int signaled;
23932+ /* A link to owner of a lock */
23933+ lock_stack *owner;
23934+ /* A link to znode locked */
23935+ znode *node;
23936+ /* A list of all locks for a process */
23937+ struct list_head locks_link;
23938+ /* A list of all owners for a znode */
23939+ struct list_head owners_link;
23940+};
23941+
23942+struct lock_request {
23943+ /* A pointer to uninitialized link object */
23944+ lock_handle *handle;
23945+ /* A pointer to the object we want to lock */
23946+ znode *node;
23947+ /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
23948+ znode_lock_mode mode;
23949+ /* how dispatch_lock_requests() returns lock request result code */
23950+ int ret_code;
23951+};
23952+
23953+/* A lock stack structure for accumulating locks owned by a process */
23954+struct lock_stack {
23955+ /* A guard lock protecting a lock stack */
23956+ spinlock_t sguard;
23957+ /* number of znodes which were requested by high priority processes */
23958+ atomic_t nr_signaled;
23959+ /* Current priority of a process
23960+
23961+ This is only accessed by the current thread and thus requires no
23962+ locking.
23963+ */
23964+ int curpri;
23965+ /* A list of all locks owned by this process. Elements can be added to
23966+ * this list only by the current thread. ->node pointers in this list
23967+ * can be only changed by the current thread. */
23968+ struct list_head locks;
23969+ /* When lock_stack waits for the lock, it puts itself on double-linked
23970+ requestors list of that lock */
23971+ struct list_head requestors_link;
23972+ /* Current lock request info.
23973+
23974+ This is only accessed by the current thread and thus requires no
23975+ locking.
23976+ */
23977+ struct lock_request request;
23978+ /* the following two fields are the lock stack's
23979+ * synchronization object to use with the standard linux/wait.h
23980+ * interface. See reiser4_go_to_sleep and __reiser4_wake_up for
23981+ * usage details. */
23982+ wait_queue_head_t wait;
23983+ atomic_t wakeup;
23984+#if REISER4_DEBUG
23985+ int nr_locks; /* number of lock handles in the above list */
23986+#endif
23987+};
23988+
23989+/*
23990+ User-visible znode locking functions
23991+*/
23992+
23993+extern int longterm_lock_znode(lock_handle * handle,
23994+ znode * node,
23995+ znode_lock_mode mode,
23996+ znode_lock_request request);
23997+
23998+extern void longterm_unlock_znode(lock_handle * handle);
23999+
24000+extern int reiser4_check_deadlock(void);
24001+
24002+extern lock_stack *get_current_lock_stack(void);
24003+
24004+extern void init_lock_stack(lock_stack * owner);
24005+extern void reiser4_init_lock(zlock * lock);
24006+
24007+static inline void init_lh(lock_handle *lh)
24008+{
24009+#if REISER4_DEBUG
24010+ memset(lh, 0, sizeof *lh);
24011+ INIT_LIST_HEAD(&lh->locks_link);
24012+ INIT_LIST_HEAD(&lh->owners_link);
24013+#else
24014+ lh->node = NULL;
24015+#endif
24016+}
24017+
24018+static inline void done_lh(lock_handle *lh)
24019+{
24020+ assert("zam-342", lh != NULL);
24021+ if (lh->node != NULL)
24022+ longterm_unlock_znode(lh);
24023+}
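+/* A typical usage pattern for the long-term locking API declared in this
+ header (an illustrative sketch; real callers also handle -E_DEADLOCK and
+ -E_REPEAT by releasing held locks and restarting):
+
+ lock_handle lh;
+ int ret;
+
+ init_lh(&lh);
+ ret = longterm_lock_znode(&lh, node, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI);
+ if (ret == 0) {
+ ... use the read-locked node ...
+ done_lh(&lh); // unlocks via longterm_unlock_znode()
+ }
+*/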
24024+
24025+extern void move_lh(lock_handle * new, lock_handle * old);
24026+extern void copy_lh(lock_handle * new, lock_handle * old);
24027+
24028+extern int reiser4_prepare_to_sleep(lock_stack * owner);
24029+extern void reiser4_go_to_sleep(lock_stack * owner);
24030+extern void __reiser4_wake_up(lock_stack * owner);
24031+
24032+extern int lock_stack_isclean(lock_stack * owner);
24033+
24034+/* zlock object state check macros: only used in assertions. Both forms imply that the
24035+ lock is held by the current thread. */
24036+extern int znode_is_write_locked(const znode *);
24037+extern void reiser4_invalidate_lock(lock_handle *);
24038+
24039+/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
24040+#define spin_ordering_pred_stack(stack) \
24041+ (LOCK_CNT_NIL(spin_locked_stack) && \
24042+ LOCK_CNT_NIL(spin_locked_txnmgr) && \
24043+ LOCK_CNT_NIL(spin_locked_inode) && \
24044+ LOCK_CNT_NIL(rw_locked_cbk_cache) && \
24045+ LOCK_CNT_NIL(spin_locked_super_eflush) )
24046+
24047+static inline void spin_lock_stack(lock_stack *stack)
24048+{
24049+ assert("", spin_ordering_pred_stack(stack));
24050+ spin_lock(&(stack->sguard));
24051+ LOCK_CNT_INC(spin_locked_stack);
24052+ LOCK_CNT_INC(spin_locked);
24053+}
24054+
24055+static inline void spin_unlock_stack(lock_stack *stack)
24056+{
24057+ assert_spin_locked(&(stack->sguard));
24058+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
24059+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24060+ LOCK_CNT_DEC(spin_locked_stack);
24061+ LOCK_CNT_DEC(spin_locked);
24062+ spin_unlock(&(stack->sguard));
24063+}
24064+
24065+static inline void reiser4_wake_up(lock_stack * owner)
24066+{
24067+ spin_lock_stack(owner);
24068+ __reiser4_wake_up(owner);
24069+ spin_unlock_stack(owner);
24070+}
24071+
24072+const char *lock_mode_name(znode_lock_mode lock);
24073+
24074+#if REISER4_DEBUG
24075+extern void check_lock_data(void);
24076+extern void check_lock_node_data(znode * node);
24077+#else
24078+#define check_lock_data() noop
24079+#define check_lock_node_data(node) noop
24080+#endif
24081+
24082+/* __LOCK_H__ */
24083+#endif
24084+
24085+/* Make Linus happy.
24086+ Local variables:
24087+ c-indentation-style: "K&R"
24088+ mode-name: "LC"
24089+ c-basic-offset: 8
24090+ tab-width: 8
24091+ fill-column: 120
24092+ End:
24093+*/
24094diff -urN linux-2.6.23.orig/fs/reiser4/Makefile linux-2.6.23/fs/reiser4/Makefile
24095--- linux-2.6.23.orig/fs/reiser4/Makefile 1970-01-01 03:00:00.000000000 +0300
24096+++ linux-2.6.23/fs/reiser4/Makefile 2007-12-04 16:49:30.000000000 +0300
24097@@ -0,0 +1,98 @@
24098+#
24099+# reiser4/Makefile
24100+#
24101+
24102+obj-$(CONFIG_REISER4_FS) += reiser4.o
24103+
24104+reiser4-y := \
24105+ debug.o \
24106+ jnode.o \
24107+ znode.o \
24108+ key.o \
24109+ pool.o \
24110+ tree_mod.o \
24111+ estimate.o \
24112+ carry.o \
24113+ carry_ops.o \
24114+ lock.o \
24115+ tree.o \
24116+ context.o \
24117+ tap.o \
24118+ coord.o \
24119+ block_alloc.o \
24120+ txnmgr.o \
24121+ kassign.o \
24122+ flush.o \
24123+ wander.o \
24124+ eottl.o \
24125+ search.o \
24126+ page_cache.o \
24127+ seal.o \
24128+ dscale.o \
24129+ flush_queue.o \
24130+ ktxnmgrd.o \
24131+ blocknrset.o \
24132+ super.o \
24133+ super_ops.o \
24134+ fsdata.o \
24135+ export_ops.o \
24136+ oid.o \
24137+ tree_walk.o \
24138+ inode.o \
24139+ vfs_ops.o \
24140+ as_ops.o \
24141+ entd.o\
24142+ readahead.o \
24143+ status_flags.o \
24144+ init_super.o \
24145+ safe_link.o \
24146+ \
24147+ plugin/plugin.o \
24148+ plugin/plugin_set.o \
24149+ plugin/node/node.o \
24150+ plugin/object.o \
24151+ plugin/cluster.o \
24152+ plugin/inode_ops.o \
24153+ plugin/inode_ops_rename.o \
24154+ plugin/file_ops.o \
24155+ plugin/file_ops_readdir.o \
24156+ plugin/file_plugin_common.o \
24157+ plugin/file/file.o \
24158+ plugin/file/tail_conversion.o \
24159+ plugin/file/file_conversion.o \
24160+ plugin/file/symlink.o \
24161+ plugin/file/cryptcompress.o \
24162+ plugin/dir_plugin_common.o \
24163+ plugin/dir/hashed_dir.o \
24164+ plugin/dir/seekable_dir.o \
24165+ plugin/node/node40.o \
24166+ \
24167+ plugin/crypto/cipher.o \
24168+ plugin/crypto/digest.o \
24169+ \
24170+ plugin/compress/compress.o \
24171+ plugin/compress/compress_mode.o \
24172+ \
24173+ plugin/item/static_stat.o \
24174+ plugin/item/sde.o \
24175+ plugin/item/cde.o \
24176+ plugin/item/blackbox.o \
24177+ plugin/item/internal.o \
24178+ plugin/item/tail.o \
24179+ plugin/item/ctail.o \
24180+ plugin/item/extent.o \
24181+ plugin/item/extent_item_ops.o \
24182+ plugin/item/extent_file_ops.o \
24183+ plugin/item/extent_flush_ops.o \
24184+ \
24185+ plugin/hash.o \
24186+ plugin/fibration.o \
24187+ plugin/tail_policy.o \
24188+ plugin/item/item.o \
24189+ \
24190+ plugin/security/perm.o \
24191+ plugin/space/bitmap.o \
24192+ \
24193+ plugin/disk_format/disk_format40.o \
24194+ plugin/disk_format/disk_format.o
24195+
24196diff -urN linux-2.6.23.orig/fs/reiser4/oid.c linux-2.6.23/fs/reiser4/oid.c
24197--- linux-2.6.23.orig/fs/reiser4/oid.c 1970-01-01 03:00:00.000000000 +0300
24198+++ linux-2.6.23/fs/reiser4/oid.c 2007-12-04 16:49:30.000000000 +0300
24199@@ -0,0 +1,141 @@
24200+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24201+
24202+#include "debug.h"
24203+#include "super.h"
24204+#include "txnmgr.h"
24205+
24206+/* we used to have an oid allocation plugin. It was removed because it
24207+ was recognized as providing an unneeded level of abstraction. If one
24208+ ever finds it useful, look at yet_unneeded_abstractions/oid
24209+*/
24210+
24211+/*
24212+ * initialize in-memory data for oid allocator at @super. @nr_files and @next
24213+ * are provided by disk format plugin that reads them from the disk during
24214+ * mount.
24215+ */
24216+int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24217+{
24218+ reiser4_super_info_data *sbinfo;
24219+
24220+ sbinfo = get_super_private(super);
24221+
24222+ sbinfo->next_to_use = next;
24223+ sbinfo->oids_in_use = nr_files;
24224+ return 0;
24225+}
24226+
24227+/*
24228+ * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
24229+ * runs out of oids.
24230+ */
24231+oid_t oid_allocate(struct super_block * super)
24232+{
24233+ reiser4_super_info_data *sbinfo;
24234+ oid_t oid;
24235+
24236+ sbinfo = get_super_private(super);
24237+
24238+ spin_lock_reiser4_super(sbinfo);
24239+ if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
24240+ oid = sbinfo->next_to_use++;
24241+ sbinfo->oids_in_use++;
24242+ } else
24243+ oid = ABSOLUTE_MAX_OID;
24244+ spin_unlock_reiser4_super(sbinfo);
24245+ return oid;
24246+}
24247+
24248+/*
24249+ * Tell oid allocator that @oid is now free.
24250+ */
24251+int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24252+{
24253+ reiser4_super_info_data *sbinfo;
24254+
24255+ sbinfo = get_super_private(super);
24256+
24257+ spin_lock_reiser4_super(sbinfo);
24258+ sbinfo->oids_in_use--;
24259+ spin_unlock_reiser4_super(sbinfo);
24260+ return 0;
24261+}
24262+
24263+/*
24264+ * return next @oid that would be allocated (i.e., returned by oid_allocate())
24265+ * without actually allocating it. This is used by disk format plugin to save
24266+ * oid allocator state on the disk.
24267+ */
24268+oid_t oid_next(const struct super_block * super)
24269+{
24270+ reiser4_super_info_data *sbinfo;
24271+ oid_t oid;
24272+
24273+ sbinfo = get_super_private(super);
24274+
24275+ spin_lock_reiser4_super(sbinfo);
24276+ oid = sbinfo->next_to_use;
24277+ spin_unlock_reiser4_super(sbinfo);
24278+ return oid;
24279+}
24280+
24281+/*
24282+ * returns number of currently used oids. This is used by statfs(2) to report
24283+ * number of "inodes" and by disk format plugin to save oid allocator state on
24284+ * the disk.
24285+ */
24286+long oids_used(const struct super_block *super)
24287+{
24288+ reiser4_super_info_data *sbinfo;
24289+ oid_t used;
24290+
24291+ sbinfo = get_super_private(super);
24292+
24293+ spin_lock_reiser4_super(sbinfo);
24294+ used = sbinfo->oids_in_use;
24295+ spin_unlock_reiser4_super(sbinfo);
24296+ if (used < (__u64) ((long)~0) >> 1)
24297+ return (long)used;
24298+ else
24299+ return (long)-1;
24300+}
24301+
24302+/*
24303+ * Count oid as allocated in atom. This is done after call to oid_allocate()
24304+ * at the point when we are irrevocably committed to creation of the new file
24305+ * (i.e., when oid allocation cannot be any longer rolled back due to some
24306+ * error).
24307+ */
24308+void oid_count_allocated(void)
24309+{
24310+ txn_atom *atom;
24311+
24312+ atom = get_current_atom_locked();
24313+ atom->nr_objects_created++;
24314+ spin_unlock_atom(atom);
24315+}
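+/* How oid_allocate() and oid_count_allocated() pair up on a file-creation
+ path (an illustrative sketch; the error value is hypothetical):
+
+ oid = oid_allocate(super);
+ if (oid == ABSOLUTE_MAX_OID)
+ return RETERR(-EOVERFLOW); // hypothetical: allocator exhausted
+ ... insert stat data for the new object; roll back on failure ...
+ oid_count_allocated(); // creation is now irrevocable
+*/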
24316+
24317+/*
24318+ * Count oid as free in atom. This is done after call to oid_release() at the
24319+ * point when we are irrevocably committed to the deletion of the file (i.e.,
24320+ * when oid release cannot be any longer rolled back due to some error).
24321+ */
24322+void oid_count_released(void)
24323+{
24324+ txn_atom *atom;
24325+
24326+ atom = get_current_atom_locked();
24327+ atom->nr_objects_deleted++;
24328+ spin_unlock_atom(atom);
24329+}
24330+
24331+/*
24332+ Local variables:
24333+ c-indentation-style: "K&R"
24334+ mode-name: "LC"
24335+ c-basic-offset: 8
24336+ tab-width: 8
24337+ fill-column: 120
24338+ scroll-step: 1
24339+ End:
24340+*/
24341diff -urN linux-2.6.23.orig/fs/reiser4/page_cache.c linux-2.6.23/fs/reiser4/page_cache.c
24342--- linux-2.6.23.orig/fs/reiser4/page_cache.c 1970-01-01 03:00:00.000000000 +0300
24343+++ linux-2.6.23/fs/reiser4/page_cache.c 2007-12-04 21:05:55.806810005 +0300
24344@@ -0,0 +1,730 @@
24345+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24346+ * reiser4/README */
24347+
24348+/* Memory pressure hooks. Fake inodes handling. */
24349+
24350+/* GLOSSARY
24351+
24352+ . Formatted and unformatted nodes.
24353+ Elements of reiser4 balanced tree to store data and metadata.
24354+ Unformatted nodes are pointed to by extent pointers. Such nodes
24355+ are used to store data of large objects. Unlike unformatted nodes,
24356+ formatted ones have associated format described by node4X plugin.
24357+
24358+ . Jnode (or journal node)
24359+ The in-memory header which is used to track formatted and unformatted
24360+ nodes, bitmap nodes, etc. In particular, jnodes are used to track
24361+ transactional information associated with each block(see reiser4/jnode.c
24362+ for details).
24363+
24364+ . Znode
24365+ The in-memory header which is used to track formatted nodes. Contains
24366+ embedded jnode (see reiser4/znode.c for details).
24367+*/
24368+
24369+/* We store all file system meta data (and data, of course) in the page cache.
24370+
24371+ What does this mean? Instead of using bread/brelse we create a special
24372+ "fake" inode (one per super block) and store the content of formatted nodes
24373+ in pages bound to this inode in the page cache. In newer kernels bread()
24374+ already uses the inode attached to the block device (bd_inode). The advantage
24375+ of having our own fake inode is that we can install appropriate methods in its
24376+ address_space operations. Such methods are called by VM on memory pressure
24377+ (or during background page flushing) and we can use them to react
24378+ appropriately.
24379+
24380+ In initial version we only support one block per page. Support for multiple
24381+ blocks per page is complicated by relocation.
24382+
24383+ To each page, used by reiser4, jnode is attached. jnode is analogous to
24384+ buffer head. Difference is that jnode is bound to the page permanently:
24385+ jnode cannot be removed from memory until its backing page is.
24386+
24387+ A jnode contains a pointer to its page (->pg field) and the page contains a
24388+ pointer to the jnode in its ->private field. The pointer from jnode to page
24389+ is protected by the jnode's spinlock and the pointer from page to jnode is
24390+ protected by the page lock (PG_locked bit). Lock ordering is: first take the
24391+ page lock, then the jnode spin lock. To go in the reverse direction use the
24392+ jnode_lock_page() function, which uses the standard try-lock-and-release device.
24393+
24394+ Properties:
24395+
24396+ 1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24397+ reference counter is increased.
24398+
24399+ 2. when jnode-to-page mapping is destroyed (by page_clear_jnode()), page
24400+ reference counter is decreased.
24401+
24402+ 3. on jload() reference counter on jnode page is increased, page is
24403+ kmapped and `referenced'.
24404+
24405+ 4. on jrelse() inverse operations are performed.
24406+
24407+ 5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24408+
24409+ DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24410+ historically.]
24411+
24412+ [In the following discussion, `lock' invariably means long term lock on
24413+ znode.] (What about page locks?)
24414+
24415+ There is some special class of deadlock possibilities related to memory
24416+ pressure. Locks acquired by other reiser4 threads are accounted for in the
24417+ deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
24418+ invoked an additional hidden arc is added to the locking graph: the thread
24419+ that tries to allocate memory waits for ->vm_writeback() to finish. If this
24420+ thread holds a lock and ->vm_writeback() tries to acquire it, deadlock
24421+ prevention is useless.
24422+
24423+ Another related problem is the possibility for ->vm_writeback() to run out
24424+ of memory itself. This is not a problem for ext2 and friends, because their
24425+ ->vm_writeback() doesn't allocate much memory, but reiser4 flush is
24426+ definitely able to allocate huge amounts of memory.
24427+
24428+ It seems that there is no reliable way to cope with the problems above.
24429+ Instead it was decided that ->vm_writeback() (as invoked in the kswapd
24430+ context) wouldn't perform any flushing itself, but rather should just wake
24431+ up some auxiliary thread dedicated for this purpose (or, the same thread
24432+ that does periodic commit of old atoms (ktxnmgrd.c)).
24433+
24434+ Details:
24435+
24436+ 1. Page is called `reclaimable' against particular reiser4 mount F if this
24437+ page can be ultimately released by try_to_free_pages() under presumptions
24438+ that:
24439+
24440+ a. ->vm_writeback() for F is no-op, and
24441+
24442+ b. none of the threads accessing F are making any progress, and
24443+
24444+ c. other reiser4 mounts obey the same memory reservation protocol as F
24445+ (described below).
24446+
24447+ For example, clean un-pinned page, or page occupied by ext2 data are
24448+ reclaimable against any reiser4 mount.
24449+
24450+ When there is more than one reiser4 mount in a system, condition (c) makes
24451+ reclaim-ability not easily verifiable beyond trivial cases mentioned above.
24452+
24453+ THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24454+
24455+ The fake inode is used to bind formatted nodes, and each node is indexed
24456+ within the fake inode by its block number. If the block size is smaller than
24457+ the page size, it may so happen that a block mapped to a page with a
24458+ formatted node is occupied by an unformatted node or is unallocated. This
24459+ leads to some complications, because flushing the whole page can incorrectly
24460+ overwrite an unformatted node that, moreover, can be cached in some other
24461+ place as part of a file body. To avoid this, buffers for unformatted nodes
24462+ are never marked dirty. Also, pages in the fake inode are never marked
24463+ dirty. This rules out usage of ->writepage() as a memory pressure hook.
24464+ Instead ->releasepage() is used.
24465+
24466+ Josh is concerned that page->buffer is going to die. This should not pose a
24467+ significant problem though, because we need to add some data structures to
24468+ the page anyway (jnode) and all necessary bookkeeping can be put there.
24469+
24470+*/
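+/* The page <-> jnode lock ordering described above, as a sketch (this
+ assumes the spin_lock_jnode()/spin_unlock_jnode() helpers used elsewhere
+ in this patch, and jprivate() to fetch the jnode attached to a page):
+
+ lock_page(page); // 1: page lock first
+ node = jprivate(page);
+ spin_lock_jnode(node); // 2: then jnode spin lock
+ ... inspect or modify the node <-> page linkage ...
+ spin_unlock_jnode(node);
+ unlock_page(page);
+
+ Going from a jnode to its page uses jnode_lock_page() instead, which
+ retries with try-lock to avoid inverting this order.
+*/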
24471+
24472+/* Life cycle of pages/nodes.
24473+
24474+ A jnode contains a reference to its page and the page contains a reference
24475+ back to the jnode. This reference is counted in page ->count. Thus, a page
24476+ bound to a jnode cannot be released back into the free pool.
24477+
24478+ 1. Formatted nodes.
24479+
24480+ 1. formatted node is represented by znode. When new znode is created its
24481+ ->pg pointer is NULL initially.
24482+
24483+ 2. when node content is loaded into znode (by call to zload()) for the
24484+ first time, the following happens (in a call to ->read_node() or
24485+ ->allocate_node()):
24486+
24487+ 1. new page is added to the page cache.
24488+
24489+ 2. this page is attached to znode and its ->count is increased.
24490+
24491+ 3. page is kmapped.
24492+
24493+ 3. if more calls to zload() follow (without corresponding zrelses), the page
24494+ counter is left intact and instead ->d_count is increased in the znode.
24495+
24496+ 4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
24497+ ->release_node() is called and the page is kunmapped as a result.
24498+
24499+ 5. at some moment node can be captured by a transaction. Its ->x_count
24500+ is then increased by transaction manager.
24501+
24502+ 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
24503+ bit set) following will happen (also see comment at the top of znode.c):
24504+
24505+ 1. when the last lock is released, the node will be uncaptured from the
24506+ transaction. This releases the reference that the transaction manager
24507+ acquired at step 5.
24508+
24509+ 2. when last reference is released, zput() detects that node is
24510+ actually deleted and calls ->delete_node()
24511+ operation. page_cache_delete_node() implementation detaches jnode from
24512+ page and releases page.
24513+
24514+ 7. otherwise (node wasn't removed from the tree), the last reference to
24515+ the znode will be released after the transaction manager has committed
24516+ the transaction the node was in. This implies squallocing of this node
24517+ (see flush.c). Nothing special happens at this point. Znode is still in the
24518+ hash table and page is still attached to it.
24519+
24520+ 8. the znode is actually removed from memory because of memory
24521+ pressure, or during umount (znodes_tree_done()). Either way, the znode is
24522+ removed by the call to zdrop(). At this moment, the page is detached from
24523+ znode and removed from the inode address space.
24524+
24525+*/
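+/* The zload()/zrelse() pairing from steps 1.2-1.4 above, as a sketch:
+
+ ret = zload(node); // first call attaches and kmaps the page
+ if (ret == 0) {
+ ... access node contents ...
+ zrelse(node); // drops ->d_count; kunmaps on the last release
+ }
+*/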
24526+
24527+#include "debug.h"
24528+#include "dformat.h"
24529+#include "key.h"
24530+#include "txnmgr.h"
24531+#include "jnode.h"
24532+#include "znode.h"
24533+#include "block_alloc.h"
24534+#include "tree.h"
24535+#include "vfs_ops.h"
24536+#include "inode.h"
24537+#include "super.h"
24538+#include "entd.h"
24539+#include "page_cache.h"
24540+#include "ktxnmgrd.h"
24541+
24542+#include <linux/types.h>
24543+#include <linux/fs.h>
24544+#include <linux/mm.h> /* for struct page */
24545+#include <linux/swap.h> /* for struct page */
24546+#include <linux/pagemap.h>
24547+#include <linux/bio.h>
24548+#include <linux/writeback.h>
24549+#include <linux/blkdev.h>
24550+
24551+static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp);
24552+
24553+static struct address_space_operations formatted_fake_as_ops;
24554+
24555+static const oid_t fake_ino = 0x1;
24556+static const oid_t bitmap_ino = 0x2;
24557+static const oid_t cc_ino = 0x3;
24558+
24559+static void
24560+init_fake_inode(struct super_block *super, struct inode *fake,
24561+ struct inode **pfake)
24562+{
24563+ assert("nikita-2168", fake->i_state & I_NEW);
24564+ fake->i_mapping->a_ops = &formatted_fake_as_ops;
24565+ *pfake = fake;
24566+ /* NOTE-NIKITA something else? */
24567+ unlock_new_inode(fake);
24568+}
24569+
24570+/**
24571+ * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps
24572+ * @super: super block to init fake inode for
24573+ *
24574+ * Initializes fake inode to which formatted nodes are bound in the page cache
24575+ * and inode for bitmaps.
24576+ */
24577+int reiser4_init_formatted_fake(struct super_block *super)
24578+{
24579+ struct inode *fake;
24580+ struct inode *bitmap;
24581+ struct inode *cc;
24582+ reiser4_super_info_data *sinfo;
24583+
24584+ assert("nikita-1703", super != NULL);
24585+
24586+ sinfo = get_super_private_nocheck(super);
24587+ fake = iget_locked(super, oid_to_ino(fake_ino));
24588+
24589+ if (fake != NULL) {
24590+ init_fake_inode(super, fake, &sinfo->fake);
24591+
24592+ bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
24593+ if (bitmap != NULL) {
24594+ init_fake_inode(super, bitmap, &sinfo->bitmap);
24595+
24596+ cc = iget_locked(super, oid_to_ino(cc_ino));
24597+ if (cc != NULL) {
24598+ init_fake_inode(super, cc, &sinfo->cc);
24599+ return 0;
24600+ } else {
24601+ iput(sinfo->fake);
24602+ iput(sinfo->bitmap);
24603+ sinfo->fake = NULL;
24604+ sinfo->bitmap = NULL;
24605+ }
24606+ } else {
24607+ iput(sinfo->fake);
24608+ sinfo->fake = NULL;
24609+ }
24610+ }
24611+ return RETERR(-ENOMEM);
24612+}
24613+
24614+/**
24615+ * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps
24616+ * @super: super block to init fake inode for
24617+ *
24618+ * Releases inodes which were used as address spaces of bitmap and formatted
24619+ * nodes.
24620+ */
24621+void reiser4_done_formatted_fake(struct super_block *super)
24622+{
24623+ reiser4_super_info_data *sinfo;
24624+
24625+ sinfo = get_super_private_nocheck(super);
24626+
24627+ if (sinfo->fake != NULL) {
24628+ iput(sinfo->fake);
24629+ sinfo->fake = NULL;
24630+ }
24631+
24632+ if (sinfo->bitmap != NULL) {
24633+ iput(sinfo->bitmap);
24634+ sinfo->bitmap = NULL;
24635+ }
24636+
24637+ if (sinfo->cc != NULL) {
24638+ iput(sinfo->cc);
24639+ sinfo->cc = NULL;
24640+ }
24641+ return;
24642+}
24643+
24644+void reiser4_wait_page_writeback(struct page *page)
24645+{
24646+ assert("zam-783", PageLocked(page));
24647+
24648+ do {
24649+ unlock_page(page);
24650+ wait_on_page_writeback(page);
24651+ lock_page(page);
24652+ } while (PageWriteback(page));
24653+}
24654+
24655+/* return tree @page is in */
24656+reiser4_tree *reiser4_tree_by_page(const struct page *page /* page to query */ )
24657+{
24658+ assert("nikita-2461", page != NULL);
24659+ return &get_super_private(page->mapping->host->i_sb)->tree;
24660+}
24661+
24662+/* completion handler for a single-page bio-based read.
24663+
24664+ mpage_end_io_read() would do as well, but it's static.
24665+*/
24667+static int
24668+end_bio_single_page_read(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24669+ int err UNUSED_ARG)
24670+{
24671+ struct page *page;
24672+
24673+ if (bio->bi_size != 0) {
24674+ warning("nikita-3332", "Truncated single page read: %i",
24675+ bio->bi_size);
24676+ return 1;
24677+ }
24678+
24679+ page = bio->bi_io_vec[0].bv_page;
24680+
24681+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
24682+ SetPageUptodate(page);
24683+ } else {
24684+ ClearPageUptodate(page);
24685+ SetPageError(page);
24686+ }
24687+ unlock_page(page);
24688+ bio_put(bio);
24689+ return 0;
24690+}
24691+
24692+/* completion handler for a single-page bio-based write.
24693+
24694+ mpage_end_io_write() would do as well, but it's static.
24695+*/
24697+static int
24698+end_bio_single_page_write(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24699+ int err UNUSED_ARG)
24700+{
24701+ struct page *page;
24702+
24703+ if (bio->bi_size != 0) {
24704+ warning("nikita-3333", "Truncated single page write: %i",
24705+ bio->bi_size);
24706+ return 1;
24707+ }
24708+
24709+ page = bio->bi_io_vec[0].bv_page;
24710+
24711+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
24712+ SetPageError(page);
24713+ end_page_writeback(page);
24714+ bio_put(bio);
24715+ return 0;
24716+}
24717+
24718+/* ->readpage() method for formatted nodes */
24719+static int formatted_readpage(struct file *f UNUSED_ARG,
24720+ struct page *page /* page to read */ )
24721+{
24722+ assert("nikita-2412", PagePrivate(page) && jprivate(page));
24723+ return reiser4_page_io(page, jprivate(page), READ,
24724+ reiser4_ctx_gfp_mask_get());
24725+}
24726+
24727+/**
24728+ * reiser4_page_io - submit single-page bio request
24729+ * @page: page to perform io for
24730+ * @node: jnode of page
24731+ * @rw: read or write
24732+ * @gfp: gfp mask for bio allocation
24733+ *
24734+ * Submits single page read or write.
24735+ */
24736+int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
24737+{
24738+ struct bio *bio;
24739+ int result;
24740+
24741+ assert("nikita-2094", page != NULL);
24742+ assert("nikita-2226", PageLocked(page));
24743+ assert("nikita-2634", node != NULL);
24744+ assert("nikita-2893", rw == READ || rw == WRITE);
24745+
24746+ if (rw == WRITE) {
24747+ if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
24748+ unlock_page(page);
24749+ return 0;
24750+ }
24751+ }
24752+
24753+ bio = page_bio(page, node, rw, gfp);
24754+ if (!IS_ERR(bio)) {
24755+ if (rw == WRITE) {
24756+ set_page_writeback(page);
24757+ unlock_page(page);
24758+ }
24759+ reiser4_submit_bio(rw, bio);
24760+ result = 0;
24761+ } else {
24762+ unlock_page(page);
24763+ result = PTR_ERR(bio);
24764+ }
24765+
24766+ return result;
24767+}
24768+
24769+/* helper function to construct bio for page */
24770+static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
24771+{
24772+ struct bio *bio;
24773+ assert("nikita-2092", page != NULL);
24774+ assert("nikita-2633", node != NULL);
24775+
24776+ /* Simple implementation under the assumption that blocksize == pagesize.
24777+
24778+ We only have to submit one block, but submit_bh() would allocate a bio
24779+ anyway, so let's use all the bells and whistles of the bio code.
24780+ */
24781+
24782+ bio = bio_alloc(gfp, 1);
24783+ if (bio != NULL) {
24784+ int blksz;
24785+ struct super_block *super;
24786+ reiser4_block_nr blocknr;
24787+
24788+ super = page->mapping->host->i_sb;
24789+ assert("nikita-2029", super != NULL);
24790+ blksz = super->s_blocksize;
24791+ assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
24792+
24793+ spin_lock_jnode(node);
24794+ blocknr = *jnode_get_io_block(node);
24795+ spin_unlock_jnode(node);
24796+
24797+ assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
24798+ assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr));
24799+
24800+ bio->bi_bdev = super->s_bdev;
24801+ /* fill bio->bi_sector before calling bio_add_page(), because
24802+ * q->merge_bvec_fn may want to inspect it (see
24803+ * drivers/md/linear.c:linear_mergeable_bvec() for example). */
24804+ bio->bi_sector = blocknr * (blksz >> 9);
24805+
24806+ if (!bio_add_page(bio, page, blksz, 0)) {
24807+ warning("nikita-3452",
24808+ "Single page bio cannot be constructed");
24809+ return ERR_PTR(RETERR(-EINVAL));
24810+ }
24811+
24812+ /* bio -> bi_idx is filled by bio_init() */
24813+ bio->bi_end_io = (rw == READ) ?
24814+ end_bio_single_page_read : end_bio_single_page_write;
24815+
24816+ return bio;
24817+ } else
24818+ return ERR_PTR(RETERR(-ENOMEM));
24819+}
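+
+/* A worked example of the sector arithmetic above, assuming a 4096-byte
+ block size: blksz >> 9 == 8 512-byte sectors per block, so block 100
+ starts at sector 800. The helper below merely restates that mapping for
+ illustration and is not part of the reiser4 interface. */
+static inline sector_t blocknr_to_sector(reiser4_block_nr blocknr, int blksz)
+{
+	return (sector_t)(blocknr * (blksz >> 9));
+}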
24820+
24821+/* this function is internally called by jnode_make_dirty() */
24822+int reiser4_set_page_dirty_internal(struct page *page)
24823+{
24824+ struct address_space *mapping;
24825+
24826+ mapping = page->mapping;
24827+ BUG_ON(mapping == NULL);
24828+
24829+ if (!TestSetPageDirty(page)) {
24830+ if (mapping_cap_account_dirty(mapping))
24831+ inc_zone_page_state(page, NR_FILE_DIRTY);
24832+
24833+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
24834+ }
24835+
24836+ /* pages of the fake inode (formatted nodes) must have dirty jnodes */
24837+ if (mapping->host == reiser4_get_super_fake(mapping->host->i_sb))
24838+ assert("", JF_ISSET(jprivate(page), JNODE_DIRTY));
24839+ return 0;
24840+}
24841+
24842+#if 0
24843+static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
24844+{
24845+ if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
24846+ return 1;
24847+ if (ctx->super != s)
24848+ return 1;
24849+ if (get_super_private(s)->entd.tsk == current)
24850+ return 0;
24851+ if (!lock_stack_isclean(&ctx->stack))
24852+ return 0;
24853+ if (ctx->trans->atom != NULL)
24854+ return 0;
24855+ return 1;
24856+}
24857+#endif
24858+
24859+/**
24860+ * reiser4_writepage - writepage of struct address_space_operations
24861+ * @page: page to write
24862+ * @wbc: writeback control
24863+ *
24864+ * Common memory pressure notification: hands the page over to the entd
24865+ * thread for writeout.
24866+ */
24867+int reiser4_writepage(struct page *page,
24868+ struct writeback_control *wbc)
24869+{
24870+ struct super_block *s;
24871+ reiser4_context *ctx;
24872+
24873+ assert("vs-828", PageLocked(page));
24874+
24875+ s = page->mapping->host->i_sb;
24876+ ctx = get_current_context_check();
24877+
24878+ //assert("", can_hit_entd(ctx, s));
24879+ return write_page_by_ent(page, wbc);
24880+}
24881+
24882+/* ->set_page_dirty() of the formatted address_space; must never be called */
24883+static int formatted_set_page_dirty(struct page *page)
24884+{
24885+ assert("nikita-2173", page != NULL);
24886+ BUG();
24887+ return __set_page_dirty_nobuffers(page);
24888+}
24889+
24890+/* the writepages method of reiser4 address space operations is used to pull
24891+ pages dirtied via mmap into transactions. Only regular files can have such
24892+ pages. The fake inode is used to access formatted nodes via the page
24893+ cache. As formatted nodes can never be mmapped, the fake inode's
24894+ writepages has nothing to do */
24895+static int
24896+writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
24897+{
24898+ return 0;
24899+}
24900+
24901+/* address space operations for the fake inode */
24902+static struct address_space_operations formatted_fake_as_ops = {
24903+ /* Perform a writeback of a single page as a memory-freeing
24904+ * operation. */
24905+ .writepage = reiser4_writepage,
24906+ /* this is called to read formatted node */
24907+ .readpage = formatted_readpage,
24908+ /* ->sync_page() method of the fake inode address space operations.
24909+
24910+ This is a most annoyingly misnamed method: it is actually called
24911+ from wait_on_page_bit() and lock_page(), and its purpose is to
24912+ start io by jabbing the device drivers.
24913+ */
24914+ .sync_page = block_sync_page,
24915+ /* Write back some dirty pages from this mapping; called during
24916+ sync (pdflush) */
24918+ .writepages = writepages_fake,
24919+ /* Set a page dirty */
24920+ .set_page_dirty = formatted_set_page_dirty,
24921+ /* used for read-ahead. Not applicable */
24922+ .readpages = NULL,
24923+ .prepare_write = NULL,
24924+ .commit_write = NULL,
24925+ .bmap = NULL,
24926+ /* called just before page is being detached from inode mapping and
24927+ removed from memory. Called on truncate, cut/squeeze, and
24928+ umount. */
24929+ .invalidatepage = reiser4_invalidatepage,
24930+ /* this is called by shrink_cache() so that the file system can try to
24931+ release objects (jnodes, buffers, journal heads) attached to the page
24932+ and thereby, possibly, make the page itself freeable.
24933+ */
24934+ .releasepage = reiser4_releasepage,
24935+ .direct_IO = NULL
24936+};
24937+
24938+/* called just before page is released (no longer used by reiser4). Callers:
24939+ jdelete() and extent2tail(). */
24940+void reiser4_drop_page(struct page *page)
24941+{
24942+ assert("nikita-2181", PageLocked(page));
24943+ clear_page_dirty_for_io(page);
24944+ ClearPageUptodate(page);
24945+#if defined(PG_skipped)
24946+ ClearPageSkipped(page);
24947+#endif
24948+ unlock_page(page);
24949+}
24950+
24951+#define JNODE_GANG_SIZE (16)
24952+
24953+/* find all jnodes from range specified and invalidate them */
24954+static int
24955+truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
24956+{
24957+ reiser4_inode *info;
24958+ int truncated_jnodes;
24959+ reiser4_tree *tree;
24960+ unsigned long index;
24961+ unsigned long end;
24962+
24963+ if (inode_file_plugin(inode) ==
24964+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
24965+ /*
24966+ * No need to get rid of jnodes here: if the single jnode of
24967+ * page cluster did not have page, then it was found and killed
24968+ * before in
24969+ * truncate_complete_page_cluster()->jput()->jput_final(),
24970+ * otherwise it will be dropped by reiser4_invalidatepage()
24971+ */
24972+ return 0;
24973+ truncated_jnodes = 0;
24974+
24975+ info = reiser4_inode_data(inode);
24976+ tree = reiser4_tree_by_inode(inode);
24977+
24978+ index = from;
24979+ end = from + count;
24980+
24981+ while (1) {
24982+ jnode *gang[JNODE_GANG_SIZE];
24983+ int taken;
24984+ int i;
24985+ jnode *node;
24986+
24987+ assert("nikita-3466", index <= end);
24988+
24989+ read_lock_tree(tree);
24990+ taken =
24991+ radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
24992+ (void **)gang, index,
24993+ JNODE_GANG_SIZE);
24994+ for (i = 0; i < taken; ++i) {
24995+ node = gang[i];
24996+ if (index_jnode(node) < end)
24997+ jref(node);
24998+ else
24999+ gang[i] = NULL;
25000+ }
25001+ read_unlock_tree(tree);
25002+
25003+ for (i = 0; i < taken; ++i) {
25004+ node = gang[i];
25005+ if (node != NULL) {
25006+ index = max(index, index_jnode(node));
25007+ spin_lock_jnode(node);
25008+ assert("edward-1457", node->pg == NULL);
25009+ /* this is always called after
25010+ truncate_inode_pages_range(). Therefore the
25011+ jnode can not have a page here. New pages can
25012+ not be created either, because
25013+ truncate_jnodes_range() runs with exclusive
25014+ access to the file, whereas page creation
25015+ requires non-exclusive access */
25016+ JF_SET(node, JNODE_HEARD_BANSHEE);
25017+ reiser4_uncapture_jnode(node);
25018+ unhash_unformatted_jnode(node);
25019+ truncated_jnodes++;
25020+ jput(node);
25021+ } else
25022+ break;
25023+ }
25024+ if (i != taken || taken == 0)
25025+ break;
25026+ }
25027+ return truncated_jnodes;
25028+}
25029+
25030+/* Truncating files in reiser4: problems and solutions.
25031+
25032+ VFS calls the fs's truncate after it has called truncate_inode_pages()
25033+ to get rid of the pages corresponding to the part of the file being
25034+ truncated. In reiser4 this may leave unallocated extents which have no
25035+ jnodes, and the flush code does not expect that. The solution is
25036+ straightforward: as the VFS's truncate is implemented via the setattr
25037+ operation, it seems reasonable to have a ->setattr() that cuts the
25038+ file body. However, the flush code also does not expect dirty pages
25039+ without parent items, so it is impossible to first cut all items and
25040+ then truncate all pages in two separate steps. We resolve this by
25041+ cutting items one by one. Each such fine-grained step, performed
25042+ under a longterm znode lock, finishes by calling the ->kill_hook()
25043+ method of the killed item to remove the pages and jnodes bound to it.
25044+
25045+ The following function is a common part of the mentioned kill hooks.
25046+ It is also called before tail-to-extent conversion (to avoid managing
25047+ several copies of the data).
25048+*/
25049+void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
25050+ unsigned long count, int even_cows)
25051+{
25052+ loff_t from_bytes, count_bytes;
25053+
25054+ if (count == 0)
25055+ return;
25056+ from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
25057+ count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
25058+
25059+ unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
25060+ truncate_inode_pages_range(mapping, from_bytes,
25061+ from_bytes + count_bytes - 1);
25062+ truncate_jnodes_range(mapping->host, from, count);
25063+}
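+
+/* A condensed sketch of the cut-one-item-at-a-time scheme described
+ above (pseudocode only; the real work is done by the tree cut code and
+ the per-item ->kill_hook() methods):
+
+	while (file size > new size) {
+		take a longterm znode lock on the last item;
+		cut the item (or its tail) from the tree;
+		the item's ->kill_hook() calls
+		reiser4_invalidate_pages() for the cut region;
+		release the lock;
+	}
+
+ After each iteration the tree, the page cache and the jnodes remain
+ mutually consistent, which is the invariant the flush code relies on. */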
25064+
25065+/*
25066+ * Local variables:
25067+ * c-indentation-style: "K&R"
25068+ * mode-name: "LC"
25069+ * c-basic-offset: 8
25070+ * tab-width: 8
25071+ * fill-column: 120
25072+ * scroll-step: 1
25073+ * End:
25074+ */
25075diff -urN linux-2.6.23.orig/fs/reiser4/page_cache.h linux-2.6.23/fs/reiser4/page_cache.h
25076--- linux-2.6.23.orig/fs/reiser4/page_cache.h 1970-01-01 03:00:00.000000000 +0300
25077+++ linux-2.6.23/fs/reiser4/page_cache.h 2007-12-04 16:49:30.000000000 +0300
25078@@ -0,0 +1,68 @@
25079+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25080+ * reiser4/README */
25081+/* Memory pressure hooks; handling of fake inodes. See page_cache.c. */
25082+
25083+#if !defined( __REISER4_PAGE_CACHE_H__ )
25084+#define __REISER4_PAGE_CACHE_H__
25085+
25086+#include "forward.h"
25087+#include "context.h" /* for reiser4_ctx_gfp_mask_get() */
25088+
25089+#include <linux/fs.h> /* for struct super_block, address_space */
25090+#include <linux/mm.h> /* for struct page */
25091+#include <linux/pagemap.h> /* for lock_page() */
25092+#include <linux/vmalloc.h> /* for __vmalloc() */
25093+
25094+extern int reiser4_init_formatted_fake(struct super_block *);
25095+extern void reiser4_done_formatted_fake(struct super_block *);
25096+
25097+extern reiser4_tree *reiser4_tree_by_page(const struct page *);
25098+
25099+extern int reiser4_set_page_dirty_internal(struct page *);
25100+
25101+#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25102+
25103+extern void reiser4_wait_page_writeback(struct page *);
25104+static inline void lock_and_wait_page_writeback(struct page *page)
25105+{
25106+ lock_page(page);
25107+ if (unlikely(PageWriteback(page)))
25108+ reiser4_wait_page_writeback(page);
25109+}
25110+
25111+#define jprivate(page) ((jnode *)page_private(page))
25112+
25113+extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
25114+extern void reiser4_drop_page(struct page *);
25115+extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25116+ unsigned long count, int even_cows);
25117+extern void capture_reiser4_inodes(struct super_block *,
25118+ struct writeback_control *);
25119+static inline void * reiser4_vmalloc (unsigned long size)
25120+{
25121+ return __vmalloc(size,
25122+ reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM,
25123+ PAGE_KERNEL);
25124+}
25125+
25126+#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25127+
25128+#if REISER4_DEBUG
25129+extern void print_page(const char *prefix, struct page *page);
25130+#else
25131+#define print_page(prf, p) noop
25132+#endif
25133+
25134+/* __REISER4_PAGE_CACHE_H__ */
25135+#endif
25136+
25137+/* Make Linus happy.
25138+ Local variables:
25139+ c-indentation-style: "K&R"
25140+ mode-name: "LC"
25141+ c-basic-offset: 8
25142+ tab-width: 8
25143+ fill-column: 120
25144+ scroll-step: 1
25145+ End:
25146+*/
25147diff -urN linux-2.6.23.orig/fs/reiser4/plugin/cluster.c linux-2.6.23/fs/reiser4/plugin/cluster.c
25148--- linux-2.6.23.orig/fs/reiser4/plugin/cluster.c 1970-01-01 03:00:00.000000000 +0300
25149+++ linux-2.6.23/fs/reiser4/plugin/cluster.c 2007-12-04 16:49:30.000000000 +0300
25150@@ -0,0 +1,71 @@
25151+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25152+
25153+/* Contains reiser4 cluster plugins (see
25154+ http://www.namesys.com/cryptcompress_design.html
25155+ "Concepts of clustering" for details). */
25156+
25157+#include "plugin_header.h"
25158+#include "plugin.h"
25159+#include "../inode.h"
25160+
25161+static int change_cluster(struct inode *inode,
25162+ reiser4_plugin * plugin,
25163+ pset_member memb)
25164+{
25165+ assert("edward-1324", inode != NULL);
25166+ assert("edward-1325", plugin != NULL);
25167+ assert("edward-1326", is_reiser4_inode(inode));
25168+ assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25169+
25170+ /* Can't change the cluster plugin of an already existing regular file. */
25171+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25172+ return RETERR(-EINVAL);
25173+
25174+ /* If it matches the current cluster plugin, nothing to change. */
25175+ if (inode_cluster_plugin(inode) != NULL &&
25176+ inode_cluster_plugin(inode)->h.id == plugin->h.id)
25177+ return 0;
25178+
25179+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25180+ PSET_CLUSTER, plugin);
25181+}
25182+
25183+static reiser4_plugin_ops cluster_plugin_ops = {
25184+ .init = NULL,
25185+ .load = NULL,
25186+ .save_len = NULL,
25187+ .save = NULL,
25188+ .change = &change_cluster
25189+};
25190+
25191+#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \
25192+ [CLUSTER_ ## ID ## _ID] = { \
25193+ .h = { \
25194+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25195+ .id = CLUSTER_ ## ID ## _ID, \
25196+ .pops = &cluster_plugin_ops, \
25197+ .label = LABEL, \
25198+ .desc = DESC, \
25199+ .linkage = {NULL, NULL} \
25200+ }, \
25201+ .shift = SHIFT \
25202+ }
25203+
25204+cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25205+ SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25206+ SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25207+ SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25208+ SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25209+ SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25210+};
25211+
25212+/*
25213+ Local variables:
25214+ c-indentation-style: "K&R"
25215+ mode-name: "LC"
25216+ c-basic-offset: 8
25217+ tab-width: 8
25218+ fill-column: 120
25219+ scroll-step: 1
25220+ End:
25221+*/
25222diff -urN linux-2.6.23.orig/fs/reiser4/plugin/cluster.h linux-2.6.23/fs/reiser4/plugin/cluster.h
25223--- linux-2.6.23.orig/fs/reiser4/plugin/cluster.h 1970-01-01 03:00:00.000000000 +0300
25224+++ linux-2.6.23/fs/reiser4/plugin/cluster.h 2007-12-04 16:49:30.000000000 +0300
25225@@ -0,0 +1,395 @@
25226+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25227+
25228+/* This file contains size/offset translators, modulators
25229+ and other helper functions. */
25230+
25231+#if !defined( __FS_REISER4_CLUSTER_H__ )
25232+#define __FS_REISER4_CLUSTER_H__
25233+
25234+#include "../inode.h"
25235+
25236+static inline int inode_cluster_shift(struct inode *inode)
25237+{
25238+ assert("edward-92", inode != NULL);
25239+ assert("edward-93", reiser4_inode_data(inode) != NULL);
25240+
25241+ return inode_cluster_plugin(inode)->shift;
25242+}
25243+
25244+static inline unsigned cluster_nrpages_shift(struct inode *inode)
25245+{
25246+ return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25247+}
25248+
25249+/* cluster size in page units */
25250+static inline unsigned cluster_nrpages(struct inode *inode)
25251+{
25252+ return 1U << cluster_nrpages_shift(inode);
25253+}
25254+
25255+static inline size_t inode_cluster_size(struct inode *inode)
25256+{
25257+ assert("edward-96", inode != NULL);
25258+
25259+ return 1U << inode_cluster_shift(inode);
25260+}
25261+
25262+static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25263+{
25264+ return idx >> cluster_nrpages_shift(inode);
25265+}
25266+
25267+static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25268+{
25269+ return idx << cluster_nrpages_shift(inode);
25270+}
25271+
25272+static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25273+{
25274+ return clust_to_pg(pg_to_clust(idx, inode), inode);
25275+}
25276+
25277+static inline pgoff_t off_to_pg(loff_t off)
25278+{
25279+ return (off >> PAGE_CACHE_SHIFT);
25280+}
25281+
25282+static inline loff_t pg_to_off(pgoff_t idx)
25283+{
25284+ return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25285+}
25286+
25287+static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25288+{
25289+ return off >> inode_cluster_shift(inode);
25290+}
25291+
25292+static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25293+{
25294+ return (loff_t) idx << inode_cluster_shift(inode);
25295+}
25296+
25297+static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25298+{
25299+ return clust_to_off(off_to_clust(off, inode), inode);
25300+}
25301+
25302+static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25303+{
25304+ return clust_to_pg(off_to_clust(off, inode), inode);
25305+}
25306+
25307+static inline unsigned off_to_pgoff(loff_t off)
25308+{
25309+ return off & (PAGE_CACHE_SIZE - 1);
25310+}
25311+
25312+static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25313+{
25314+ return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25315+}
25316+
25317+static inline pgoff_t offset_in_clust(struct page * page)
25318+{
25319+ assert("edward-1488", page != NULL);
25320+ assert("edward-1489", page->mapping != NULL);
25321+
25322+ return page_index(page) & ((cluster_nrpages(page->mapping->host)) - 1);
25323+}
25324+
25325+static inline int first_page_in_cluster(struct page * page)
25326+{
25327+ return offset_in_clust(page) == 0;
25328+}
25329+
25330+static inline int last_page_in_cluster(struct page * page)
25331+{
25332+ return offset_in_clust(page) ==
25333+ cluster_nrpages(page->mapping->host) - 1;
25334+}
25335+
25336+static inline unsigned
25337+pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25338+{
25339+ return off_to_cloff(pg_to_off(idx), inode);
25340+}
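+
+/* A worked example of the translators above, assuming 4K pages
+ (PAGE_CACHE_SHIFT == 12) and the 64K cluster plugin (shift == 16):
+
+	cluster_nrpages_shift(inode) == 16 - 12 == 4
+	cluster_nrpages(inode) == 1 << 4 == 16 pages per cluster
+	pg_to_clust(37, inode) == 37 >> 4 == 2
+	clust_to_pg(2, inode) == 2 << 4 == 32
+	off_to_cloff(70000, inode) == 70000 & 65535 == 4464
+*/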
25341+
25342+/*********************** Size translators **************************/
25343+
25344+/* Translate linear size.
25345+ * New units are (1 << @blkbits) times larger than the old ones.
25346+ * In other words, calculate the number of logical blocks occupied
25347+ * by @count elements.
25348+ */
25349+static inline unsigned long size_in_blocks(loff_t count, unsigned blkbits)
25350+{
25351+ return (count + (1UL << blkbits) - 1) >> blkbits;
25352+}
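+
+/* e.g. size_in_blocks(5000, 12) == (5000 + 4095) >> 12 == 2:
+ 5000 bytes occupy two 4K blocks, rounding up */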
25353+
25354+/* size in pages */
25355+static inline pgoff_t size_in_pages(loff_t size)
25356+{
25357+ return size_in_blocks(size, PAGE_CACHE_SHIFT);
25358+}
25359+
25360+/* size in logical clusters */
25361+static inline cloff_t size_in_lc(loff_t size, struct inode *inode)
25362+{
25363+ return size_in_blocks(size, inode_cluster_shift(inode));
25364+}
25365+
25366+/* convert a size in pages to a size in page clusters */
25367+static inline cloff_t sp_to_spcl(pgoff_t size, struct inode *inode)
25368+{
25369+ return size_in_blocks(size, cluster_nrpages_shift(inode));
25370+}
25371+
25372+/*********************** Size modulators ***************************/
25373+
25374+/*
25375+ Modulate a linear size by the nominated block size and offset.
25376+
25377+ This is a "finite" function (zero almost everywhere): it returns the
25378+ height of the figure at position @pos when a staircase of height
25379+ (1 << @blkbits) and total area @size is laid out block by block.
25380+
25381+ ******
25382+ *******
25383+ *******
25384+ *******
25385+ ----------> pos
25386+*/
25388+static inline unsigned __mbb(loff_t size, unsigned long pos, int blkbits)
25389+{
25390+ unsigned end = size >> blkbits;
25391+ if (pos < end)
25392+ return 1U << blkbits;
25393+ if (unlikely(pos > end))
25394+ return 0;
25395+ return size & ~(~0ull << blkbits);
25396+}
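+
+/* e.g. with size == 10000 and blkbits == 12 (4K blocks), end == 2, so:
+
+	__mbb(10000, 0, 12) == 4096 (full block)
+	__mbb(10000, 1, 12) == 4096 (full block)
+	__mbb(10000, 2, 12) == 10000 & 4095 == 1808 (partial tail)
+	__mbb(10000, 3, 12) == 0 (past the end)
+*/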
25397+
25398+/* the same as above, but block size is page size */
25399+static inline unsigned __mbp(loff_t size, pgoff_t pos)
25400+{
25401+ return __mbb(size, pos, PAGE_CACHE_SHIFT);
25402+}
25403+
25404+/* number of file's bytes in the nominated logical cluster */
25405+static inline unsigned lbytes(cloff_t index, struct inode * inode)
25406+{
25407+ return __mbb(i_size_read(inode), index, inode_cluster_shift(inode));
25408+}
25409+
25410+/* number of file's bytes in the nominated page */
25411+static inline unsigned pbytes(pgoff_t index, struct inode * inode)
25412+{
25413+ return __mbp(i_size_read(inode), index);
25414+}
25415+
25416+/* return true if the logical cluster is not occupied by the file */
25417+static inline int new_logical_cluster(struct cluster_handle * clust,
25418+ struct inode *inode)
25419+{
25420+ return clust_to_off(clust->index, inode) >= i_size_read(inode);
25421+}
25422+
25423+/* return true if pages @p1 and @p2 belong to the same page cluster */
25424+static inline int same_page_cluster(struct page * p1, struct page * p2)
25425+{
25426+ assert("edward-1490", p1 != NULL);
25427+ assert("edward-1491", p2 != NULL);
25428+ assert("edward-1492", p1->mapping != NULL);
25429+ assert("edward-1493", p2->mapping != NULL);
25430+
25431+ return (pg_to_clust(page_index(p1), p1->mapping->host) ==
25432+ pg_to_clust(page_index(p2), p2->mapping->host));
25433+}
25434+
25435+static inline int cluster_is_complete(struct cluster_handle * clust,
25436+ struct inode * inode)
25437+{
25438+ return clust->tc.lsize == inode_cluster_size(inode);
25439+}
25440+
25441+static inline void reiser4_slide_init(struct reiser4_slide * win)
25442+{
25443+ assert("edward-1084", win != NULL);
25444+ memset(win, 0, sizeof *win);
25445+}
25446+
25447+static inline tfm_action
25448+cluster_get_tfm_act(struct tfm_cluster * tc)
25449+{
25450+ assert("edward-1356", tc != NULL);
25451+ return tc->act;
25452+}
25453+
25454+static inline void
25455+cluster_set_tfm_act(struct tfm_cluster * tc, tfm_action act)
25456+{
25457+ assert("edward-1356", tc != NULL);
25458+ tc->act = act;
25459+}
25460+
25461+static inline void cluster_init_act(struct cluster_handle * clust,
25462+ tfm_action act,
25463+ struct reiser4_slide * window)
25464+{
25465+ assert("edward-84", clust != NULL);
25466+ memset(clust, 0, sizeof *clust);
25467+ cluster_set_tfm_act(&clust->tc, act);
25468+ clust->dstat = INVAL_DISK_CLUSTER;
25469+ clust->win = window;
25470+}
25471+
25472+static inline void cluster_init_read(struct cluster_handle * clust,
25473+ struct reiser4_slide * window)
25474+{
25475+ cluster_init_act (clust, TFMA_READ, window);
25476+}
25477+
25478+static inline void cluster_init_write(struct cluster_handle * clust,
25479+ struct reiser4_slide * window)
25480+{
25481+ cluster_init_act (clust, TFMA_WRITE, window);
25482+}
25483+
25484+/* true if @p1 and @p2 are items of the same disk cluster */
25485+static inline int same_disk_cluster(const coord_t * p1, const coord_t * p2)
25486+{
25487+ /* drop this if you have other items to aggregate */
25488+ assert("edward-1494", item_id_by_coord(p1) == CTAIL_ID);
25489+
25490+ return item_plugin_by_coord(p1)->b.mergeable(p1, p2);
25491+}
25492+
25493+static inline int dclust_get_extension_dsize(hint_t * hint)
25494+{
25495+ return hint->ext_coord.extension.ctail.dsize;
25496+}
25497+
25498+static inline void dclust_set_extension_dsize(hint_t * hint, int dsize)
25499+{
25500+ hint->ext_coord.extension.ctail.dsize = dsize;
25501+}
25502+
25503+static inline int dclust_get_extension_shift(hint_t * hint)
25504+{
25505+ return hint->ext_coord.extension.ctail.shift;
25506+}
25507+
25508+static inline int dclust_get_extension_ncount(hint_t * hint)
25509+{
25510+ return hint->ext_coord.extension.ctail.ncount;
25511+}
25512+
25513+static inline void dclust_inc_extension_ncount(hint_t * hint)
25514+{
25515+ hint->ext_coord.extension.ctail.ncount ++;
25516+}
25517+
25518+static inline void dclust_init_extension(hint_t * hint)
25519+{
25520+ memset(&hint->ext_coord.extension.ctail, 0,
25521+ sizeof(hint->ext_coord.extension.ctail));
25522+}
25523+
25524+static inline int hint_is_unprepped_dclust(hint_t * hint)
25525+{
25526+ assert("edward-1451", hint_is_valid(hint));
25527+ return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT;
25528+}
25529+
25530+static inline void coord_set_between_clusters(coord_t * coord)
25531+{
25532+#if REISER4_DEBUG
25533+ int result;
25534+ result = zload(coord->node);
25535+ assert("edward-1296", !result);
25536+#endif
25537+ if (!coord_is_between_items(coord)) {
25538+ coord->between = AFTER_ITEM;
25539+ coord->unit_pos = 0;
25540+ }
25541+#if REISER4_DEBUG
25542+ zrelse(coord->node);
25543+#endif
25544+}
25545+
25546+int reiser4_inflate_cluster(struct cluster_handle *, struct inode *);
25547+int find_disk_cluster(struct cluster_handle *, struct inode *, int read,
25548+ znode_lock_mode mode);
25549+int checkout_logical_cluster(struct cluster_handle *, jnode *, struct inode *);
25550+int reiser4_deflate_cluster(struct cluster_handle *, struct inode *);
25551+void truncate_complete_page_cluster(struct inode *inode, cloff_t start,
25552+ int even_cows);
25553+void invalidate_hint_cluster(struct cluster_handle * clust);
25554+int get_disk_cluster_locked(struct cluster_handle * clust, struct inode * inode,
25555+ znode_lock_mode lock_mode);
25556+void reset_cluster_params(struct cluster_handle * clust);
25557+int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
25558+ int count);
25559+int prepare_page_cluster(struct inode *inode, struct cluster_handle * clust,
25560+ rw_op rw);
25561+void put_page_cluster(struct cluster_handle * clust,
25562+ struct inode * inode, rw_op rw);
25563+void put_cluster_handle(struct cluster_handle * clust);
25564+int grab_tfm_stream(struct inode *inode, struct tfm_cluster * tc, tfm_stream_id id);
25565+int tfm_cluster_is_uptodate(struct tfm_cluster * tc);
25566+void tfm_cluster_set_uptodate(struct tfm_cluster * tc);
25567+void tfm_cluster_clr_uptodate(struct tfm_cluster * tc);
25568+
25569+/* move cluster handle to the target position
25570+ specified by the page of index @pgidx */
25571+static inline void move_cluster_forward(struct cluster_handle * clust,
25572+ struct inode *inode,
25573+ pgoff_t pgidx)
25574+{
25575+ assert("edward-1297", clust != NULL);
25576+ assert("edward-1298", inode != NULL);
25577+
25578+ reset_cluster_params(clust);
25579+ if (clust->index_valid &&
25580+ /* Hole in the indices: the hint became invalid and can not
25581+ be used by find_cluster_item() even if the seal/node
25582+ versions coincide */
25583+ pg_to_clust(pgidx, inode) != clust->index + 1) {
25584+ reiser4_unset_hint(clust->hint);
25585+ invalidate_hint_cluster(clust);
25586+ }
25587+ clust->index = pg_to_clust(pgidx, inode);
25588+ clust->index_valid = 1;
25589+}
25590+
25591+static inline int alloc_clust_pages(struct cluster_handle * clust,
25592+ struct inode *inode)
25593+{
25594+ assert("edward-791", clust != NULL);
25595+ assert("edward-792", inode != NULL);
25596+ clust->pages = /* one page pointer per page of the cluster */
25597+ kmalloc(sizeof(*clust->pages) << cluster_nrpages_shift(inode),
25598+ reiser4_ctx_gfp_mask_get());
25599+ if (!clust->pages)
25600+ return -ENOMEM;
25601+ return 0;
25602+}
25603+
25604+static inline void free_clust_pages(struct cluster_handle * clust)
25605+{
25606+ kfree(clust->pages);
25607+}
25608+
25609+#endif /* __FS_REISER4_CLUSTER_H__ */
25610+
25611+/* Make Linus happy.
25612+ Local variables:
25613+ c-indentation-style: "K&R"
25614+ mode-name: "LC"
25615+ c-basic-offset: 8
25616+ tab-width: 8
25617+ fill-column: 120
25618+ scroll-step: 1
25619+ End:
25620+*/
25621diff -urN linux-2.6.23.orig/fs/reiser4/plugin/compress/compress.c linux-2.6.23/fs/reiser4/plugin/compress/compress.c
25622--- linux-2.6.23.orig/fs/reiser4/plugin/compress/compress.c 1970-01-01 03:00:00.000000000 +0300
25623+++ linux-2.6.23/fs/reiser4/plugin/compress/compress.c 2007-12-04 16:49:30.000000000 +0300
25624@@ -0,0 +1,367 @@
25625+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25626+/* reiser4 compression transform plugins */
25627+
25628+#include "../../debug.h"
25629+#include "../../inode.h"
25630+#include "../plugin.h"
25631+
25632+#include <linux/lzo.h>
25633+#include <linux/zlib.h>
25634+#include <linux/types.h>
25635+#include <linux/hardirq.h>
25636+
25637+static int change_compression(struct inode *inode,
25638+ reiser4_plugin * plugin,
25639+ pset_member memb)
25640+{
25641+ assert("edward-1316", inode != NULL);
25642+ assert("edward-1317", plugin != NULL);
25643+ assert("edward-1318", is_reiser4_inode(inode));
25644+ assert("edward-1319",
25645+ plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
25646+
25647+ /* cannot change compression plugin of already existing regular object */
25648+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25649+ return RETERR(-EINVAL);
25650+
25651+ /* If it matches the current compression plugin, nothing to change. */
25652+ if (inode_compression_plugin(inode) != NULL &&
25653+ inode_compression_plugin(inode)->h.id == plugin->h.id)
25654+ return 0;
25655+
25656+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25657+ PSET_COMPRESSION, plugin);
25658+}
25659+
25660+static reiser4_plugin_ops compression_plugin_ops = {
25661+ .init = NULL,
25662+ .load = NULL,
25663+ .save_len = NULL,
25664+ .save = NULL,
25665+ .change = &change_compression
25666+};
25667+
25668+/******************************************************************************/
25669+/* gzip1 compression */
25670+/******************************************************************************/
25671+
25672+#define GZIP1_DEF_LEVEL Z_BEST_SPEED
25673+#define GZIP1_DEF_WINBITS 15
25674+#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL
25675+
25676+static int gzip1_init(void)
25677+{
25678+ int ret = -EINVAL;
25679+#if REISER4_ZLIB
25680+ ret = 0;
25681+#endif
25682+ if (ret == -EINVAL)
25683+ warning("edward-1337", "Zlib not compiled into kernel");
25684+ return ret;
25685+}
25686+
25687+static int gzip1_overrun(unsigned src_len UNUSED_ARG)
25688+{
25689+ return 0;
25690+}
25691+
25692+static coa_t gzip1_alloc(tfm_action act)
25693+{
25694+ coa_t coa = NULL;
25695+#if REISER4_ZLIB
25696+ int ret = 0;
25697+ switch (act) {
25698+ case TFMA_WRITE: /* compress */
25699+ coa = reiser4_vmalloc(zlib_deflate_workspacesize());
25700+ if (!coa) {
25701+ ret = -ENOMEM;
25702+ break;
25703+ }
25704+ break;
25705+ case TFMA_READ: /* decompress */
25706+ coa = reiser4_vmalloc(zlib_inflate_workspacesize());
25707+ if (!coa) {
25708+ ret = -ENOMEM;
25709+ break;
25710+ }
25711+ break;
25712+ default:
25713+ impossible("edward-767",
25714+ "trying to alloc workspace for unknown tfm action");
25715+ }
25716+ if (ret) {
25717+ warning("edward-768",
25718+ "alloc workspace for gzip1 (tfm action = %d) failed\n",
25719+ act);
25720+ return ERR_PTR(ret);
25721+ }
25722+#endif
25723+ return coa;
25724+}
25725+
25726+static void gzip1_free(coa_t coa, tfm_action act)
25727+{
25728+ assert("edward-769", coa != NULL);
25729+
25730+ switch (act) {
25731+ case TFMA_WRITE: /* compress */
25732+ vfree(coa);
25733+ break;
25734+ case TFMA_READ: /* decompress */
25735+ vfree(coa);
25736+ break;
25737+ default:
25738+ impossible("edward-770", "unknown tfm action");
25739+ }
25740+ return;
25741+}
25742+
25743+static int gzip1_min_size_deflate(void)
25744+{
25745+ return 64;
25746+}
25747+
25748+static void
25749+gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25750+ __u8 * dst_first, unsigned *dst_len)
25751+{
25752+#if REISER4_ZLIB
25753+ int ret = 0;
25754+ struct z_stream_s stream;
25755+
25756+ assert("edward-842", coa != NULL);
25757+ assert("edward-875", src_len != 0);
25758+
25759+ stream.workspace = coa;
25760+ ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
25761+ -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
25762+ Z_DEFAULT_STRATEGY);
25763+ if (ret != Z_OK) {
25764+ warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
25765+ goto rollback;
25766+ }
25767+ ret = zlib_deflateReset(&stream);
25768+ if (ret != Z_OK) {
25769+ warning("edward-772", "zlib_deflateReset returned %d\n", ret);
25770+ goto rollback;
25771+ }
25772+ stream.next_in = src_first;
25773+ stream.avail_in = src_len;
25774+ stream.next_out = dst_first;
25775+ stream.avail_out = *dst_len;
25776+
25777+ ret = zlib_deflate(&stream, Z_FINISH);
25778+ if (ret != Z_STREAM_END) {
25779+ if (ret != Z_OK)
25780+ warning("edward-773",
25781+ "zlib_deflate returned %d\n", ret);
25782+ goto rollback;
25783+ }
25784+ *dst_len = stream.total_out;
25785+ return;
25786+ rollback:
25787+ *dst_len = src_len;
25788+#endif
25789+ return;
25790+}
25791+
25792+static void
25793+gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25794+ __u8 * dst_first, unsigned *dst_len)
25795+{
25796+#if REISER4_ZLIB
25797+ int ret = 0;
25798+ struct z_stream_s stream;
25799+
25800+ assert("edward-843", coa != NULL);
25801+ assert("edward-876", src_len != 0);
25802+
25803+ stream.workspace = coa;
25804+ ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
25805+ if (ret != Z_OK) {
25806+ warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
25807+ return;
25808+ }
25809+ ret = zlib_inflateReset(&stream);
25810+ if (ret != Z_OK) {
25811+ warning("edward-775", "zlib_inflateReset returned %d\n", ret);
25812+ return;
25813+ }
25814+
25815+ stream.next_in = src_first;
25816+ stream.avail_in = src_len;
25817+ stream.next_out = dst_first;
25818+ stream.avail_out = *dst_len;
25819+
25820+ ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
25821+ /*
25822+ * Work around a bug in zlib, which sometimes wants to taste an extra
25823+ * byte when being used in the (undocumented) raw deflate mode.
25824+ * (From USAGI).
25825+ */
25826+ if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
25827+ u8 zerostuff = 0;
25828+ stream.next_in = &zerostuff;
25829+ stream.avail_in = 1;
25830+ ret = zlib_inflate(&stream, Z_FINISH);
25831+ }
25832+ if (ret != Z_STREAM_END) {
25833+ warning("edward-776", "zlib_inflate returned %d\n", ret);
25834+ return;
25835+ }
25836+ *dst_len = stream.total_out;
25837+#endif
25838+ return;
25839+}
25840+
25841+/******************************************************************************/
25842+/* lzo1 compression */
25843+/******************************************************************************/
25844+
25845+static int lzo1_init(void)
25846+{
25847+ return 0;
25848+}
25849+
25850+static int lzo1_overrun(unsigned in_len)
25851+{
25852+ return in_len / 64 + 16 + 3;
25853+}
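+
+/* Worst-case expansion allowance used here for lzo1x: e.g. a 64K input
+ is granted 65536/64 + 16 + 3 == 1043 extra output bytes, and callers
+ size the destination buffer as src_len + overrun(src_len). */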
25854+
25855+static coa_t lzo1_alloc(tfm_action act)
25856+{
25857+ int ret = 0;
25858+ coa_t coa = NULL;
25859+
25860+ switch (act) {
25861+ case TFMA_WRITE: /* compress */
25862+ coa = reiser4_vmalloc(LZO1X_1_MEM_COMPRESS);
25863+ if (!coa) {
25864+ ret = -ENOMEM;
25865+ break;
25866+ }
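+ /* fall through: decompression needs no workspace */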
25867+ case TFMA_READ: /* decompress */
25868+ break;
25869+ default:
25870+ impossible("edward-877",
25871+ "trying to alloc workspace for unknown tfm action");
25872+ }
25873+ if (ret) {
25874+ warning("edward-878",
25875+ "alloc workspace for lzo1 (tfm action = %d) failed\n",
25876+ act);
25877+ return ERR_PTR(ret);
25878+ }
25879+ return coa;
25880+}
25881+
25882+static void lzo1_free(coa_t coa, tfm_action act)
25883+{
25884+ assert("edward-879", coa != NULL);
25885+
25886+ switch (act) {
25887+ case TFMA_WRITE: /* compress */
25888+ vfree(coa);
25889+ break;
25890+ case TFMA_READ: /* decompress */
25891+ impossible("edward-1304", "trying to free non-allocated workspace");
25892+ break;
25893+ default:
25894+ impossible("edward-880", "unknown tfm action");
25895+ }
25896+ return;
25897+}
25898+
25899+static int lzo1_min_size_deflate(void)
25900+{
25901+ return 256;
25902+}
25903+
25904+static void
25905+lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25906+ __u8 * dst_first, unsigned *dst_len)
25907+{
25908+ int result;
25909+
25910+ assert("edward-846", coa != NULL);
25911+ assert("edward-847", src_len != 0);
25912+
25913+ result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa);
25914+ if (unlikely(result != LZO_E_OK)) {
25915+ warning("edward-849", "lzo1x_1_compress failed\n");
25916+ goto out;
25917+ }
25918+ if (*dst_len >= src_len) {
25919+ //warning("edward-850", "lzo1x_1_compress: incompressible data\n");
25920+ goto out;
25921+ }
25922+ return;
25923+ out:
25924+ *dst_len = src_len;
25925+ return;
25926+}
25927+
25928+static void
25929+lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25930+ __u8 * dst_first, unsigned *dst_len)
25931+{
25932+ int result;
25933+
25934+ assert("edward-851", coa == NULL);
25935+ assert("edward-852", src_len != 0);
25936+
25937+ result = lzo1x_decompress_safe(src_first, src_len, dst_first, dst_len);
25938+ if (result != LZO_E_OK)
25939+ warning("edward-853", "lzo1x_1_decompress failed\n");
25940+ return;
25941+}
25942+
25943+compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
25944+ [LZO1_COMPRESSION_ID] = {
25945+ .h = {
25946+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25947+ .id = LZO1_COMPRESSION_ID,
25948+ .pops = &compression_plugin_ops,
25949+ .label = "lzo1",
25950+ .desc = "lzo1 compression transform",
25951+ .linkage = {NULL, NULL}
25952+ },
25953+ .init = lzo1_init,
25954+ .overrun = lzo1_overrun,
25955+ .alloc = lzo1_alloc,
25956+ .free = lzo1_free,
25957+ .min_size_deflate = lzo1_min_size_deflate,
25958+ .checksum = reiser4_adler32,
25959+ .compress = lzo1_compress,
25960+ .decompress = lzo1_decompress
25961+ },
25962+ [GZIP1_COMPRESSION_ID] = {
25963+ .h = {
25964+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25965+ .id = GZIP1_COMPRESSION_ID,
25966+ .pops = &compression_plugin_ops,
25967+ .label = "gzip1",
25968+ .desc = "gzip1 compression transform",
25969+ .linkage = {NULL, NULL}
25970+ },
25971+ .init = gzip1_init,
25972+ .overrun = gzip1_overrun,
25973+ .alloc = gzip1_alloc,
25974+ .free = gzip1_free,
25975+ .min_size_deflate = gzip1_min_size_deflate,
25976+ .checksum = reiser4_adler32,
25977+ .compress = gzip1_compress,
25978+ .decompress = gzip1_decompress
25979+ }
25980+};
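+
+/* A typical calling sequence for one of the plugins above (a sketch with
+ error handling elided; "cplug" stands for a pointer into the
+ compression_plugins[] table):
+
+	coa_t coa = cplug->alloc(TFMA_WRITE);
+	unsigned dlen = src_len + cplug->overrun(src_len);
+	cplug->compress(coa, src, src_len, dst, &dlen);
+	if (dlen >= src_len)
+		store the cluster uncompressed;
+	cplug->free(coa, TFMA_WRITE);
+
+ Note the convention visible in both ->compress() methods above: on any
+ failure, and on incompressible data, they set *dst_len back to src_len. */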
25981+
25982+/*
25983+ Local variables:
25984+ c-indentation-style: "K&R"
25985+ mode-name: "LC"
25986+ c-basic-offset: 8
25987+ tab-width: 8
25988+ fill-column: 120
25989+ scroll-step: 1
25990+ End:
25991+*/
25992diff -urN linux-2.6.23.orig/fs/reiser4/plugin/compress/compress.h linux-2.6.23/fs/reiser4/plugin/compress/compress.h
25993--- linux-2.6.23.orig/fs/reiser4/plugin/compress/compress.h 1970-01-01 03:00:00.000000000 +0300
25994+++ linux-2.6.23/fs/reiser4/plugin/compress/compress.h 2007-12-04 16:49:30.000000000 +0300
25995@@ -0,0 +1,43 @@
25996+#if !defined( __FS_REISER4_COMPRESS_H__ )
25997+#define __FS_REISER4_COMPRESS_H__
25998+
25999+#include <linux/types.h>
26000+#include <linux/string.h>
26001+
26002+/* transform direction */
26003+typedef enum {
26004+ TFMA_READ, /* decrypt, decompress */
26005+ TFMA_WRITE, /* encrypt, compress */
26006+ TFMA_LAST
26007+} tfm_action;
26008+
26009+/* supported compression algorithms */
26010+typedef enum {
26011+ LZO1_COMPRESSION_ID,
26012+ GZIP1_COMPRESSION_ID,
26013+ LAST_COMPRESSION_ID,
26014+} reiser4_compression_id;
26015+
26016+/* the same as pgoff_t, but the units are page clusters */
26017+typedef unsigned long cloff_t;
26018+
26019+/* working data of a (de)compression algorithm */
26020+typedef void *coa_t;
26021+
26022+/* table for all supported (de)compression algorithms */
26023+typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST];
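+
+/* e.g. per-context transform workspaces could be kept as
+
+	coa_set coa;
+	coa[LZO1_COMPRESSION_ID][TFMA_WRITE] =
+		compression_plugins[LZO1_COMPRESSION_ID].alloc(TFMA_WRITE);
+
+ (an illustrative sketch; the real user of coa_set is the cryptcompress
+ file plugin) */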
26024+
26025+__u32 reiser4_adler32(char *data, __u32 len);
26026+
26027+#endif /* __FS_REISER4_COMPRESS_H__ */
26028+
26029+/* Make Linus happy.
26030+ Local variables:
26031+ c-indentation-style: "K&R"
26032+ mode-name: "LC"
26033+ c-basic-offset: 8
26034+ tab-width: 8
26035+ fill-column: 120
26036+ scroll-step: 1
26037+ End:
26038+*/
26039diff -urN linux-2.6.23.orig/fs/reiser4/plugin/compress/compress_mode.c linux-2.6.23/fs/reiser4/plugin/compress/compress_mode.c
26040--- linux-2.6.23.orig/fs/reiser4/plugin/compress/compress_mode.c 1970-01-01 03:00:00.000000000 +0300
26041+++ linux-2.6.23/fs/reiser4/plugin/compress/compress_mode.c 2007-12-04 16:49:30.000000000 +0300
26042@@ -0,0 +1,162 @@
26043+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26044+/* This file contains Reiser4 compression mode plugins.
26045+
26046+ A compression mode plugin is a set of handlers called by the
26047+ compressor at flush time; they implement heuristics, including ones
26048+ that avoid compressing incompressible data. See
26049+ http://www.namesys.com/cryptcompress_design.html for more details.
26050+*/
26051+#include "../../inode.h"
26052+#include "../plugin.h"
26053+
26054+static int should_deflate_none(struct inode * inode, cloff_t index)
26055+{
26056+ return 0;
26057+}
26058+
26059+static int should_deflate_common(struct inode * inode, cloff_t index)
26060+{
26061+ return compression_is_on(cryptcompress_inode_data(inode));
26062+}
26063+
26064+static int discard_hook_ultim(struct inode *inode, cloff_t index)
26065+{
26066+ turn_off_compression(cryptcompress_inode_data(inode));
26067+ return 0;
26068+}
26069+
26070+static int discard_hook_lattd(struct inode *inode, cloff_t index)
26071+{
26072+ struct cryptcompress_info * info = cryptcompress_inode_data(inode);
26073+
26074+ assert("edward-1462",
26075+ get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
26076+ get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
26077+
26078+ turn_off_compression(info);
26079+ if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
26080+ set_lattice_factor(info, get_lattice_factor(info) << 1);
26081+ return 0;
26082+}
26083+
26084+static int accept_hook_lattd(struct inode *inode, cloff_t index)
26085+{
26086+ turn_on_compression(cryptcompress_inode_data(inode));
26087+ set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR);
26088+ return 0;
26089+}
26090+
26091+/* "Check on dynamic lattice" is the adaptive compression mode which
26092+ defines the following behavior:
26093+
26094+ Compression is on: try to compress everything, and turn it off
26095+ as soon as a cluster proves incompressible.
26096+
26097+ Compression is off: try to compress only clusters of indexes
26098+ k * FACTOR (k = 0, 1, 2, ...) and turn it back on if one of them
26099+ is compressible; if they stay incompressible, increase FACTOR */
26100+
26101+/* check if @index belongs to the one-dimensional lattice
26102+ with sparseness factor @factor */
26103+static int is_on_lattice(cloff_t index, int factor)
26104+{
26105+ return (factor ? index % factor == 0 : index == 0);
26106+}
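+
+/* e.g. with factor == 4 only clusters 0, 4, 8, ... are probed while
+ compression is off; each call of discard_hook_lattd() doubles the
+ factor (up to MAX_LATTICE_FACTOR), halving the density of the probe
+ points */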
26107+
26108+static int should_deflate_lattd(struct inode * inode, cloff_t index)
26109+{
26110+ return should_deflate_common(inode, index) ||
26111+ is_on_lattice(index,
26112+ get_lattice_factor
26113+ (cryptcompress_inode_data(inode)));
26114+}
26115+
26116+/* compression mode_plugins */
26117+compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26118+ [NONE_COMPRESSION_MODE_ID] = {
26119+ .h = {
26120+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26121+ .id = NONE_COMPRESSION_MODE_ID,
26122+ .pops = NULL,
26123+ .label = "none",
26124+ .desc = "Compress nothing",
26125+ .linkage = {NULL, NULL}
26126+ },
26127+ .should_deflate = should_deflate_none,
26128+ .accept_hook = NULL,
26129+ .discard_hook = NULL
26130+ },
26131+ /* Check-on-dynamic-lattice adaptive compression mode */
26132+ [LATTD_COMPRESSION_MODE_ID] = {
26133+ .h = {
26134+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26135+ .id = LATTD_COMPRESSION_MODE_ID,
26136+ .pops = NULL,
26137+ .label = "lattd",
26138+ .desc = "Check on dynamic lattice",
26139+ .linkage = {NULL, NULL}
26140+ },
26141+ .should_deflate = should_deflate_lattd,
26142+ .accept_hook = accept_hook_lattd,
26143+ .discard_hook = discard_hook_lattd
26144+ },
26145+ /* Check-ultimately compression mode:
26146+ Turn off compression forever as soon as we meet
26147+ incompressible data */
26148+ [ULTIM_COMPRESSION_MODE_ID] = {
26149+ .h = {
26150+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26151+ .id = ULTIM_COMPRESSION_MODE_ID,
26152+ .pops = NULL,
26153+ .label = "ultim",
26154+ .desc = "Check ultimately",
26155+ .linkage = {NULL, NULL}
26156+ },
26157+ .should_deflate = should_deflate_common,
26158+ .accept_hook = NULL,
26159+ .discard_hook = discard_hook_ultim
26160+ },
26161+ /* Force-to-compress-everything compression mode */
26162+ [FORCE_COMPRESSION_MODE_ID] = {
26163+ .h = {
26164+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26165+ .id = FORCE_COMPRESSION_MODE_ID,
26166+ .pops = NULL,
26167+ .label = "force",
26168+ .desc = "Force to compress everything",
26169+ .linkage = {NULL, NULL}
26170+ },
26171+ .should_deflate = NULL,
26172+ .accept_hook = NULL,
26173+ .discard_hook = NULL
26174+ },
26175+ /* Convert-to-extent compression mode.
26176+ In this mode items will be converted to extents and management
26177+ will be passed to (classic) unix file plugin as soon as ->write()
26178+ detects that the first complete logical cluster (of index #0) is
26179+ incompressible. */
26180+ [CONVX_COMPRESSION_MODE_ID] = {
26181+ .h = {
26182+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26183+ .id = CONVX_COMPRESSION_MODE_ID,
26184+ .pops = NULL,
26185+ .label = "conv",
26186+ .desc = "Convert to extent",
26187+ .linkage = {NULL, NULL}
26188+ },
26189+ .should_deflate = should_deflate_common,
26190+ .accept_hook = NULL,
26191+ .discard_hook = NULL
26192+ }
26193+};
26194+
26195+/*
26196+ Local variables:
26197+ c-indentation-style: "K&R"
26198+ mode-name: "LC"
26199+ c-basic-offset: 8
26200+ tab-width: 8
26201+ fill-column: 120
26202+ scroll-step: 1
26203+ End:
26204+*/
26205diff -urN linux-2.6.23.orig/fs/reiser4/plugin/compress/Makefile linux-2.6.23/fs/reiser4/plugin/compress/Makefile
26206--- linux-2.6.23.orig/fs/reiser4/plugin/compress/Makefile 1970-01-01 03:00:00.000000000 +0300
26207+++ linux-2.6.23/fs/reiser4/plugin/compress/Makefile 2007-12-04 16:49:30.000000000 +0300
26208@@ -0,0 +1,5 @@
26209+obj-$(CONFIG_REISER4_FS) += compress_plugins.o
26210+
26211+compress_plugins-objs := \
26212+ compress.o \
26213+ compress_mode.o
26214diff -urN linux-2.6.23.orig/fs/reiser4/plugin/crypto/cipher.c linux-2.6.23/fs/reiser4/plugin/crypto/cipher.c
26215--- linux-2.6.23.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 03:00:00.000000000 +0300
26216+++ linux-2.6.23/fs/reiser4/plugin/crypto/cipher.c 2007-12-04 16:49:30.000000000 +0300
26217@@ -0,0 +1,37 @@
26218+/* Copyright 2001, 2002, 2003 by Hans Reiser,
26219+ licensing governed by reiser4/README */
26220+/* Reiser4 cipher transform plugins */
26221+
26222+#include "../../debug.h"
26223+#include "../plugin.h"
26224+
26225+cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
26226+ [NONE_CIPHER_ID] = {
26227+ .h = {
26228+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
26229+ .id = NONE_CIPHER_ID,
26230+ .pops = NULL,
26231+ .label = "none",
26232+ .desc = "no cipher transform",
26233+ .linkage = {NULL, NULL}
26234+ },
26235+ .alloc = NULL,
26236+ .free = NULL,
26237+ .scale = NULL,
26238+ .align_stream = NULL,
26239+ .setkey = NULL,
26240+ .encrypt = NULL,
26241+ .decrypt = NULL
26242+ }
26243+};
26244+
26245+/* Make Linus happy.
26246+ Local variables:
26247+ c-indentation-style: "K&R"
26248+ mode-name: "LC"
26249+ c-basic-offset: 8
26250+ tab-width: 8
26251+ fill-column: 120
26252+ scroll-step: 1
26253+ End:
26254+*/
26255diff -urN linux-2.6.23.orig/fs/reiser4/plugin/crypto/cipher.h linux-2.6.23/fs/reiser4/plugin/crypto/cipher.h
26256--- linux-2.6.23.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 03:00:00.000000000 +0300
26257+++ linux-2.6.23/fs/reiser4/plugin/crypto/cipher.h 2007-12-04 16:49:30.000000000 +0300
26258@@ -0,0 +1,55 @@
26259+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26260+/* This file contains definitions for the objects operated
26261+ by reiser4 key manager, which is something like keyring
26262+ wrapped by appropriate reiser4 plugin */
26263+
26264+#if !defined( __FS_REISER4_CRYPT_H__ )
26265+#define __FS_REISER4_CRYPT_H__
26266+
26267+#include <linux/crypto.h>
26268+
26269+/* key info imported from user space */
26270+struct reiser4_crypto_data {
26271+ int keysize; /* uninstantiated key size */
26272+ __u8 * key; /* uninstantiated key */
26273+ int keyid_size; /* size of passphrase */
26274+ __u8 * keyid; /* passphrase */
26275+};
26276+
26277+/* This object contains all the infrastructure needed to implement a
26278+ cipher transform. It is operated on (allocated, inherited,
26279+ validated, bound to a host inode, etc.) by the reiser4 key manager.
26280+
26281+ This info can be allocated in two cases:
26282+ 1. importing a key from user space;
26283+ 2. reading an inode from disk */
26284+struct reiser4_crypto_info {
26285+ struct inode * host;
26286+ struct crypto_hash * digest;
26287+ struct crypto_blkcipher * cipher;
26288+#if 0
26289+ cipher_key_plugin * kplug; /* key manager */
26290+#endif
26291+ __u8 * keyid; /* key fingerprint, created by digest plugin,
26292+ using uninstantiated key and passphrase.
26293+ supposed to be stored in disk stat-data */
26294+ int inst; /* this indicates if the cipher key is
26295+ instantiated (case 1 above) */
26296+ int keysize; /* uninstantiated key size (bytes), supposed
26297+ to be stored in disk stat-data */
26298+ int keyload_count; /* number of objects that have this
26299+ crypto-stat attached */
26300+};
26301+
26302+#endif /* __FS_REISER4_CRYPT_H__ */
26303+
26304+/*
26305+ Local variables:
26306+ c-indentation-style: "K&R"
26307+ mode-name: "LC"
26308+ c-basic-offset: 8
26309+ tab-width: 8
26310+ fill-column: 120
26311+ scroll-step: 1
26312+ End:
26313+*/
26314diff -urN linux-2.6.23.orig/fs/reiser4/plugin/crypto/digest.c linux-2.6.23/fs/reiser4/plugin/crypto/digest.c
26315--- linux-2.6.23.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 03:00:00.000000000 +0300
26316+++ linux-2.6.23/fs/reiser4/plugin/crypto/digest.c 2007-12-04 16:49:30.000000000 +0300
26317@@ -0,0 +1,58 @@
26318+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26319+
26320+/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */
26321+/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
26322+#include "../../debug.h"
26323+#include "../plugin_header.h"
26324+#include "../plugin.h"
26325+#include "../file/cryptcompress.h"
26326+
26327+#include <linux/types.h>
26328+
26329+extern digest_plugin digest_plugins[LAST_DIGEST_ID];
26330+
26331+static struct crypto_hash * alloc_sha256 (void)
26332+{
26333+#if REISER4_SHA256
26334+ return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC);
26335+#else
26336+ warning("edward-1418", "sha256 unsupported");
26337+ return ERR_PTR(-EINVAL);
26338+#endif
26339+}
26340+
26341+static void free_sha256 (struct crypto_hash * tfm)
26342+{
26343+#if REISER4_SHA256
26344+ crypto_free_hash(tfm);
26345+#endif
26346+ return;
26347+}
26348+
26349+/* digest plugins */
26350+digest_plugin digest_plugins[LAST_DIGEST_ID] = {
26351+ [SHA256_32_DIGEST_ID] = {
26352+ .h = {
26353+ .type_id = REISER4_DIGEST_PLUGIN_TYPE,
26354+ .id = SHA256_32_DIGEST_ID,
26355+ .pops = NULL,
26356+ .label = "sha256_32",
26357+ .desc = "sha256_32 digest transform",
26358+ .linkage = {NULL, NULL}
26359+ },
26360+ .fipsize = sizeof(__u32),
26361+ .alloc = alloc_sha256,
26362+ .free = free_sha256
26363+ }
26364+};
26365+
26366+/*
26367+ Local variables:
26368+ c-indentation-style: "K&R"
26369+ mode-name: "LC"
26370+ c-basic-offset: 8
26371+ tab-width: 8
26372+ fill-column: 120
26373+ scroll-step: 1
26374+ End:
26375+*/
26376diff -urN linux-2.6.23.orig/fs/reiser4/plugin/dir/dir.h linux-2.6.23/fs/reiser4/plugin/dir/dir.h
26377--- linux-2.6.23.orig/fs/reiser4/plugin/dir/dir.h 1970-01-01 03:00:00.000000000 +0300
26378+++ linux-2.6.23/fs/reiser4/plugin/dir/dir.h 2007-12-04 16:49:30.000000000 +0300
26379@@ -0,0 +1,36 @@
26380+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
26381+ * reiser4/README */
26382+
26383+/* this file contains declarations of methods implementing directory plugins */
26384+
26385+#if !defined( __REISER4_DIR_H__ )
26386+#define __REISER4_DIR_H__
26387+
26388+/*#include "../../key.h"
26389+
26390+#include <linux/fs.h>*/
26391+
26392+/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
26393+
26394+/* "hashed" directory methods of dir plugin */
26395+void build_entry_key_hashed(const struct inode *, const struct qstr *,
26396+ reiser4_key *);
26397+
26398+/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
26399+
26400+/* "seekable" directory methods of dir plugin */
26401+void build_entry_key_seekable(const struct inode *, const struct qstr *,
26402+ reiser4_key *);
26403+
26404+/* __REISER4_DIR_H__ */
26405+#endif
26406+
26407+/*
26408+ Local variables:
26409+ c-indentation-style: "K&R"
26410+ mode-name: "LC"
26411+ c-basic-offset: 8
26412+ tab-width: 8
26413+ fill-column: 120
26414+ End:
26415+*/
26416diff -urN linux-2.6.23.orig/fs/reiser4/plugin/dir/hashed_dir.c linux-2.6.23/fs/reiser4/plugin/dir/hashed_dir.c
26417--- linux-2.6.23.orig/fs/reiser4/plugin/dir/hashed_dir.c 1970-01-01 03:00:00.000000000 +0300
26418+++ linux-2.6.23/fs/reiser4/plugin/dir/hashed_dir.c 2007-12-04 16:49:30.000000000 +0300
26419@@ -0,0 +1,81 @@
26420+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
26421+ * reiser4/README */
26422+
26423+/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
26424+ names to the files. */
26425+
26426+/*
26427+ * A hashed directory logically consists of persistent directory
26428+ * entries. A directory entry is a pair of a file name and the key of the
26429+ * stat-data of the file that has this name in the given directory.
26430+ *
26431+ * Directory entries are stored in the tree in the form of directory
26432+ * items. Directory item should implement dir_entry_ops portion of item plugin
26433+ * interface (see plugin/item/item.h). Hashed directory interacts with
26434+ * directory item plugin exclusively through dir_entry_ops operations.
26435+ *
26436+ * Currently there are two implementations of directory items: "simple
26437+ * directory item" (plugin/item/sde.[ch]), and "compound directory item"
26438+ * (plugin/item/cde.[ch]) with the latter being the default.
26439+ *
26440+ * There is, however, one delicate way in which directory code interacts
26441+ * with the item plugin: key assignment policy. A key for a directory item is
26442+ * chosen by the directory code and, as described in kassign.c, this key
26443+ * contains a portion of the file name. The directory item uses this knowledge
26444+ * to avoid storing that portion of the file name twice: once in the key and
26445+ * once in the directory item body.
26445+ *
26446+ */
26447+
26448+#include "../../inode.h"
26449+
26450+void complete_entry_key(const struct inode *, const char *name,
26451+ int len, reiser4_key * result);
26452+
26453+/* this is implementation of build_entry_key method of dir
26454+ plugin for HASHED_DIR_PLUGIN_ID
26455+ */
26456+void build_entry_key_hashed(const struct inode *dir, /* directory where entry is
26457+ * (or will be) in.*/
26458+ const struct qstr *qname, /* name of file referenced
26459+ * by this entry */
26460+ reiser4_key * result /* resulting key of directory
26461+ * entry */ )
26462+{
26463+ const char *name;
26464+ int len;
26465+
26466+ assert("nikita-1139", dir != NULL);
26467+ assert("nikita-1140", qname != NULL);
26468+ assert("nikita-1141", qname->name != NULL);
26469+ assert("nikita-1142", result != NULL);
26470+
26471+ name = qname->name;
26472+ len = qname->len;
26473+
26474+ assert("nikita-2867", strlen(name) == len);
26475+
26476+ reiser4_key_init(result);
26477+ /* locality of directory entry's key is objectid of parent
26478+ directory */
26479+ set_key_locality(result, get_inode_oid(dir));
26480+ /* minor packing locality is constant */
26481+ set_key_type(result, KEY_FILE_NAME_MINOR);
26482+ /* dot is a special case---we always want it to be the first
26483+ entry in a directory. Actually, we just want it to have the
26484+ smallest directory entry key.
26485+ */
26486+ if (len == 1 && name[0] == '.')
26487+ return;
26488+
26489+ /* initialize part of entry key which depends on file name */
26490+ complete_entry_key(dir, name, len, result);
26491+}
26492+
26493+/* Local variables:
26494+ c-indentation-style: "K&R"
26495+ mode-name: "LC"
26496+ c-basic-offset: 8
26497+ tab-width: 8
26498+ fill-column: 120
26499+ End:
26500+*/
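A standalone sketch of the key-assembly idea in build_entry_key_hashed(): the parent's object id becomes the key locality, "." keeps the all-zero (smallest) name part, and any other name is folded into the key by a hash. The three-field layout and the djb2 hash below are illustrative stand-ins, not the actual reiser4 key format or hash plugin:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_key { uint64_t locality, objectid, offset; };

static uint64_t toy_hash(const char *name, size_t len)
{
        uint64_t h = 5381;                     /* djb2, stand-in hash */

        while (len--)
                h = h * 33 + (unsigned char)*name++;
        return h;
}

static void toy_build_entry_key(uint64_t dir_oid, const char *name,
                                struct toy_key *key)
{
        memset(key, 0, sizeof(*key));
        key->locality = dir_oid;               /* group entries by parent */
        if (strcmp(name, ".") == 0)
                return;                        /* "." keeps the smallest key */
        key->objectid = toy_hash(name, strlen(name));
}

int main(void)
{
        struct toy_key k;

        toy_build_entry_key(42, "README", &k);
        printf("locality=%llu objectid=%#llx\n",
               (unsigned long long)k.locality,
               (unsigned long long)k.objectid);
        return 0;
}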
26501diff -urN linux-2.6.23.orig/fs/reiser4/plugin/dir/Makefile linux-2.6.23/fs/reiser4/plugin/dir/Makefile
26502--- linux-2.6.23.orig/fs/reiser4/plugin/dir/Makefile 1970-01-01 03:00:00.000000000 +0300
26503+++ linux-2.6.23/fs/reiser4/plugin/dir/Makefile 2007-12-04 16:49:30.000000000 +0300
26504@@ -0,0 +1,5 @@
26505+obj-$(CONFIG_REISER4_FS) += dir_plugins.o
26506+
26507+dir_plugins-objs := \
26508+ hashed_dir.o \
26509+ seekable_dir.o
26510diff -urN linux-2.6.23.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-2.6.23/fs/reiser4/plugin/dir/seekable_dir.c
26511--- linux-2.6.23.orig/fs/reiser4/plugin/dir/seekable_dir.c 1970-01-01 03:00:00.000000000 +0300
26512+++ linux-2.6.23/fs/reiser4/plugin/dir/seekable_dir.c 2007-12-04 16:49:30.000000000 +0300
26513@@ -0,0 +1,46 @@
26514+/* Copyright 2005 by Hans Reiser, licensing governed by
26515+ * reiser4/README */
26516+
26517+#include "../../inode.h"
26518+
26519+/* this is implementation of build_entry_key method of dir
26520+ plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID
26521+ This is for directories where we want repeatable and restartable readdir()
26522+ even in case 32bit user level struct dirent (readdir(3)).
26523+*/
26524+void
26525+build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
26526+ reiser4_key * result)
26527+{
26528+ oid_t objectid;
26529+
26530+ assert("nikita-2283", dir != NULL);
26531+ assert("nikita-2284", name != NULL);
26532+ assert("nikita-2285", name->name != NULL);
26533+ assert("nikita-2286", result != NULL);
26534+
26535+ reiser4_key_init(result);
26536+ /* locality of directory entry's key is objectid of parent
26537+ directory */
26538+ set_key_locality(result, get_inode_oid(dir));
26539+ /* minor packing locality is constant */
26540+ set_key_type(result, KEY_FILE_NAME_MINOR);
26541+ /* dot is a special case---we always want it to be the first
26542+ entry in a directory. Actually, we just want it to have the
26543+ smallest directory entry key.
26544+ */
26545+ if ((name->len == 1) && (name->name[0] == '.'))
26546+ return;
26547+
26548+ /* objectid of key is 31 lowest bits of hash. */
26549+ objectid =
26550+ inode_hash_plugin(dir)->hash(name->name,
26551+ (int)name->len) & 0x7fffffff;
26552+
26553+ assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
26554+ set_key_objectid(result, objectid);
26555+
26556+ /* offset is always 0. */
26557+ set_key_offset(result, (__u64) 0);
26558+ return;
26559+}
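Why the "& 0x7fffffff" above: a seekable directory needs the hashed objectid to fit in a positive 31-bit value so that 32-bit userland readdir()/telldir() cookies survive a round trip. A standalone illustration (constants here are illustrative, not KEY_OBJECTID_MASK itself):

#include <stdint.h>
#include <stdio.h>

#define TOY_OBJECTID_MASK 0x7fffffffu  /* 31 usable bits */

int main(void)
{
        uint64_t hash = 0xdeadbeefcafef00dULL;  /* any 64-bit name hash */
        uint32_t objectid = (uint32_t)(hash & TOY_OBJECTID_MASK);

        printf("hash %#llx -> objectid %#x (non-negative as a 32-bit int)\n",
               (unsigned long long)hash, objectid);
        return 0;
}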
26560diff -urN linux-2.6.23.orig/fs/reiser4/plugin/dir_plugin_common.c linux-2.6.23/fs/reiser4/plugin/dir_plugin_common.c
26561--- linux-2.6.23.orig/fs/reiser4/plugin/dir_plugin_common.c 1970-01-01 03:00:00.000000000 +0300
26562+++ linux-2.6.23/fs/reiser4/plugin/dir_plugin_common.c 2007-12-04 16:49:30.000000000 +0300
26563@@ -0,0 +1,872 @@
26564+/* Copyright 2005 by Hans Reiser, licensing governed by
26565+ reiser4/README */
26566+
26567+/* this file contains typical implementations for most methods of
26568+ the directory plugin
26569+*/
26570+
26571+#include "../inode.h"
26572+
26573+int reiser4_find_entry(struct inode *dir, struct dentry *name,
26574+ lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *);
26575+int reiser4_lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key);
26576+void check_light_weight(struct inode *inode, struct inode *parent);
26577+
26578+/* this is the common implementation of the get_parent method of the dir
26579+ plugin; it is used by the NFS kernel server to "climb" up the directory
26580+ tree to check permissions
26581+ */
26582+struct dentry *get_parent_common(struct inode *child)
26583+{
26584+ struct super_block *s;
26585+ struct inode *parent;
26586+ struct dentry dotdot;
26587+ struct dentry *dentry;
26588+ reiser4_key key;
26589+ int result;
26590+
26591+ /*
26592+ * lookup dotdot entry.
26593+ */
26594+
26595+ s = child->i_sb;
26596+ memset(&dotdot, 0, sizeof(dotdot));
26597+ dotdot.d_name.name = "..";
26598+ dotdot.d_name.len = 2;
26599+ dotdot.d_op = &get_super_private(s)->ops.dentry;
26600+
26601+ result = reiser4_lookup_name(child, &dotdot, &key);
26602+ if (result != 0)
26603+ return ERR_PTR(result);
26604+
26605+ parent = reiser4_iget(s, &key, 1);
26606+ if (!IS_ERR(parent)) {
26607+ /*
26608+ * FIXME-NIKITA dubious: attributes are inherited from @child
26609+ * to @parent. But:
26610+ *
26611+ * (*) this is the only thing we can do
26612+ *
26613+ * (*) attributes of a light-weight object are inherited
26614+ * from the parent through which the object was first looked up,
26615+ * so it is ambiguous anyway.
26616+ *
26617+ */
26618+ check_light_weight(parent, child);
26619+ reiser4_iget_complete(parent);
26620+ dentry = d_alloc_anon(parent);
26621+ if (dentry == NULL) {
26622+ iput(parent);
26623+ dentry = ERR_PTR(RETERR(-ENOMEM));
26624+ } else
26625+ dentry->d_op = &get_super_private(s)->ops.dentry;
26626+ } else if (PTR_ERR(parent) == -ENOENT)
26627+ dentry = ERR_PTR(RETERR(-ESTALE));
26628+ else
26629+ dentry = (void *)parent;
26630+ return dentry;
26631+}
26632+
26633+/* this is common implementation of is_name_acceptable method of dir
26634+ plugin
26635+ */
26636+int is_name_acceptable_common(const struct inode *inode, /* directory to check */
26637+ const char *name UNUSED_ARG, /* name to check */
26638+ int len /* @name's length */ )
26639+{
26640+ assert("nikita-733", inode != NULL);
26641+ assert("nikita-734", name != NULL);
26642+ assert("nikita-735", len > 0);
26643+
26644+ return len <= reiser4_max_filename_len(inode);
26645+}
26646+
26647+/* there is no common implementation of build_entry_key method of dir
26648+ plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
26649+ plugin/dir/seekable_dir.c:build_entry_key_seekable() for an example
26650+*/
26651+
26652+/* this is common implementation of build_readdir_key method of dir
26653+ plugin
26654+ see reiser4_readdir_common for more details
26655+*/
26656+int build_readdir_key_common(struct file *dir /* directory being read */ ,
26657+ reiser4_key * result /* where to store key */ )
26658+{
26659+ reiser4_file_fsdata *fdata;
26660+ struct inode *inode;
26661+
26662+ assert("nikita-1361", dir != NULL);
26663+ assert("nikita-1362", result != NULL);
26664+ assert("nikita-1363", dir->f_dentry != NULL);
26665+ inode = dir->f_dentry->d_inode;
26666+ assert("nikita-1373", inode != NULL);
26667+
26668+ fdata = reiser4_get_file_fsdata(dir);
26669+ if (IS_ERR(fdata))
26670+ return PTR_ERR(fdata);
26671+ assert("nikita-1364", fdata != NULL);
26672+ return extract_key_from_de_id(get_inode_oid(inode),
26673+ &fdata->dir.readdir.position.
26674+ dir_entry_key, result);
26675+
26676+}
26677+
26678+void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
26679+ int adj);
26680+
26681+/* this is common implementation of add_entry method of dir plugin
26682+*/
26683+int reiser4_add_entry_common(struct inode *object, /* directory to add new name
26684+ * in */
26685+ struct dentry *where, /* new name */
26686+ reiser4_object_create_data * data, /* parameters of
26687+ * new object */
26688+ reiser4_dir_entry_desc * entry /* parameters of
26689+ * new directory
26690+ * entry */)
26691+{
26692+ int result;
26693+ coord_t *coord;
26694+ lock_handle lh;
26695+ struct reiser4_dentry_fsdata *fsdata;
26696+ reiser4_block_nr reserve;
26697+
26698+ assert("nikita-1114", object != NULL);
26699+ assert("nikita-1250", where != NULL);
26700+
26701+ fsdata = reiser4_get_dentry_fsdata(where);
26702+ if (unlikely(IS_ERR(fsdata)))
26703+ return PTR_ERR(fsdata);
26704+
26705+ reserve = inode_dir_plugin(object)->estimate.add_entry(object);
26706+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
26707+ return RETERR(-ENOSPC);
26708+
26709+ init_lh(&lh);
26710+ coord = &fsdata->dec.entry_coord;
26711+ coord_clear_iplug(coord);
26712+
26713+ /* check for this entry in a directory. This is plugin method. */
26714+ result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
26715+ entry);
26716+ if (likely(result == -ENOENT)) {
26717+ /* add new entry. Just pass control to the directory
26718+ item plugin. */
26719+ assert("nikita-1709", inode_dir_item_plugin(object));
26720+ assert("nikita-2230", coord->node == lh.node);
26721+ reiser4_seal_done(&fsdata->dec.entry_seal);
26722+ result =
26723+ inode_dir_item_plugin(object)->s.dir.add_entry(object,
26724+ coord, &lh,
26725+ where,
26726+ entry);
26727+ if (result == 0) {
26728+ reiser4_adjust_dir_file(object, where,
26729+ fsdata->dec.pos + 1, +1);
26730+ INODE_INC_FIELD(object, i_size);
26731+ }
26732+ } else if (result == 0) {
26733+ assert("nikita-2232", coord->node == lh.node);
26734+ result = RETERR(-EEXIST);
26735+ }
26736+ done_lh(&lh);
26737+
26738+ return result;
26739+}
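reiser4_add_entry_common() follows a reserve-then-modify discipline: estimate the worst-case block cost, grab that much space up front (failing with -ENOSPC before the tree is touched), and only then search and insert. A toy standalone version of the same discipline (all toy_* names are hypothetical):

#include <stdio.h>

static long toy_free_blocks = 10;      /* stand-in for the space allocator */

static int toy_grab_space(long blocks)
{
        if (blocks > toy_free_blocks)
                return -1;             /* would be RETERR(-ENOSPC) */
        toy_free_blocks -= blocks;
        return 0;
}

static int toy_add_entry(long estimate)
{
        if (toy_grab_space(estimate))
                return -1;             /* fail before modifying anything */
        /* ... find insertion point, insert unit, bump i_size ... */
        return 0;
}

int main(void)
{
        printf("add #1: %d (left %ld)\n", toy_add_entry(4), toy_free_blocks);
        printf("add #2: %d (left %ld)\n", toy_add_entry(8), toy_free_blocks);
        return 0;
}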
26740+
26741+/**
26742+ * rem_entry - remove entry from directory item
26743+ * @dir: directory the entry is being removed from
26744+ * @dentry: name that is being removed
26745+ * @entry: description of the entry being removed
26746+ * @coord: coordinate of the directory entry in the tree
26747+ * @lh: lock handle held on the node @coord points into
26748+ *
26749+ * Checks that coordinate @coord is set properly and calls item plugin
26750+ * method to cut entry.
26751+ */
26752+static int
26753+rem_entry(struct inode *dir, struct dentry *dentry,
26754+ reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh)
26755+{
26756+ item_plugin *iplug;
26757+ struct inode *child;
26758+
26759+ iplug = inode_dir_item_plugin(dir);
26760+ child = dentry->d_inode;
26761+ assert("nikita-3399", child != NULL);
26762+
26763+ /* check that we are really destroying an entry for @child */
26764+ if (REISER4_DEBUG) {
26765+ int result;
26766+ reiser4_key key;
26767+
26768+ result = iplug->s.dir.extract_key(coord, &key);
26769+ if (result != 0)
26770+ return result;
26771+ if (get_key_objectid(&key) != get_inode_oid(child)) {
26772+ warning("nikita-3397",
26773+ "rem_entry: %#llx != %#llx\n",
26774+ get_key_objectid(&key),
26775+ (unsigned long long)get_inode_oid(child));
26776+ return RETERR(-EIO);
26777+ }
26778+ }
26779+ return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
26780+}
26781+
26782+/**
26783+ * reiser4_rem_entry_common - remove entry from a directory
26784+ * @dir: directory to remove entry from
26785+ * @dentry: name that is being removed
26786+ * @entry: description of entry being removed
26787+ *
26788+ * This is common implementation of rem_entry method of dir plugin.
26789+ */
26790+int reiser4_rem_entry_common(struct inode *dir,
26791+ struct dentry *dentry,
26792+ reiser4_dir_entry_desc *entry)
26793+{
26794+ int result;
26795+ coord_t *coord;
26796+ lock_handle lh;
26797+ struct reiser4_dentry_fsdata *fsdata;
26798+ __u64 tograb;
26799+
26800+ assert("nikita-1124", dir != NULL);
26801+ assert("nikita-1125", dentry != NULL);
26802+
26803+ tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
26804+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
26805+ if (result != 0)
26806+ return RETERR(-ENOSPC);
26807+
26808+ init_lh(&lh);
26809+
26810+ /* check for this entry in a directory. This is plugin method. */
26811+ result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
26812+ fsdata = reiser4_get_dentry_fsdata(dentry);
26813+ if (IS_ERR(fsdata)) {
26814+ done_lh(&lh);
26815+ return PTR_ERR(fsdata);
26816+ }
26817+
26818+ coord = &fsdata->dec.entry_coord;
26819+
26820+ assert("nikita-3404",
26821+ get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
26822+ dir->i_size <= 1);
26823+
26824+ coord_clear_iplug(coord);
26825+ if (result == 0) {
26826+ /* remove entry. Just pass control to the directory item
26827+ plugin. */
26828+ assert("vs-542", inode_dir_item_plugin(dir));
26829+ reiser4_seal_done(&fsdata->dec.entry_seal);
26830+ reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
26831+ result =
26832+ WITH_COORD(coord,
26833+ rem_entry(dir, dentry, entry, coord, &lh));
26834+ if (result == 0) {
26835+ if (dir->i_size >= 1)
26836+ INODE_DEC_FIELD(dir, i_size);
26837+ else {
26838+ warning("nikita-2509", "Dir %llu is runt",
26839+ (unsigned long long)
26840+ get_inode_oid(dir));
26841+ result = RETERR(-EIO);
26842+ }
26843+
26844+ assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
26845+ dentry->d_inode->i_size != 2 ||
26846+ inode_dir_plugin(dentry->d_inode) == NULL);
26847+ }
26848+ }
26849+ done_lh(&lh);
26850+
26851+ return result;
26852+}
26853+
26854+static reiser4_block_nr estimate_init(struct inode *parent,
26855+ struct inode *object);
26856+static int create_dot_dotdot(struct inode *object, struct inode *parent);
26857+
26858+/* this is common implementation of init method of dir plugin
26859+ create "." and ".." entries
26860+*/
26861+int reiser4_dir_init_common(struct inode *object, /* new directory */
26862+ struct inode *parent, /* parent directory */
26863+ reiser4_object_create_data * data /* info passed
26864+ * to us, this
26865+ * is filled by
26866+ * reiser4()
26867+ * syscall in
26868+ * particular */)
26869+{
26870+ reiser4_block_nr reserve;
26871+
26872+ assert("nikita-680", object != NULL);
26873+ assert("nikita-681", S_ISDIR(object->i_mode));
26874+ assert("nikita-682", parent != NULL);
26875+ assert("nikita-684", data != NULL);
26876+ assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
26877+ assert("nikita-687", object->i_mode & S_IFDIR);
26878+
26879+ reserve = estimate_init(parent, object);
26880+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
26881+ return RETERR(-ENOSPC);
26882+
26883+ return create_dot_dotdot(object, parent);
26884+}
26885+
26886+/* this is common implementation of done method of dir plugin
26887+ remove "." entry
26888+*/
26889+int reiser4_dir_done_common(struct inode *object /* object being deleted */ )
26890+{
26891+ int result;
26892+ reiser4_block_nr reserve;
26893+ struct dentry goodby_dots;
26894+ reiser4_dir_entry_desc entry;
26895+
26896+ assert("nikita-1449", object != NULL);
26897+
26898+ if (reiser4_inode_get_flag(object, REISER4_NO_SD))
26899+ return 0;
26900+
26901+ /* of course, this can be rewritten to sweep everything in one
26902+ reiser4_cut_tree(). */
26903+ memset(&entry, 0, sizeof entry);
26904+
26905+ /* FIXME: this done method is called from reiser4_delete_dir_common which
26906+ * reserved space already */
26907+ reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
26908+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
26909+ return RETERR(-ENOSPC);
26910+
26911+ memset(&goodby_dots, 0, sizeof goodby_dots);
26912+ entry.obj = goodby_dots.d_inode = object;
26913+ goodby_dots.d_name.name = ".";
26914+ goodby_dots.d_name.len = 1;
26915+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
26916+ reiser4_free_dentry_fsdata(&goodby_dots);
26917+ if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
26918+ /* only worth a warning
26919+
26920+ "values of \ eB\ f will give rise to dom!\n"
26921+ -- v6src/s2/mv.c:89
26922+ */
26923+ warning("nikita-2252", "Cannot remove dot of %lli: %i",
26924+ (unsigned long long)get_inode_oid(object), result);
26925+ return 0;
26926+}
26927+
26928+/* this is common implementation of attach method of dir plugin
26929+*/
26930+int reiser4_attach_common(struct inode *child UNUSED_ARG,
26931+ struct inode *parent UNUSED_ARG)
26932+{
26933+ assert("nikita-2647", child != NULL);
26934+ assert("nikita-2648", parent != NULL);
26935+
26936+ return 0;
26937+}
26938+
26939+/* this is common implementation of detach method of dir plugin
26940+ remove "..", decrease nlink on parent
26941+*/
26942+int reiser4_detach_common(struct inode *object, struct inode *parent)
26943+{
26944+ int result;
26945+ struct dentry goodby_dots;
26946+ reiser4_dir_entry_desc entry;
26947+
26948+ assert("nikita-2885", object != NULL);
26949+ assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD));
26950+
26951+ memset(&entry, 0, sizeof entry);
26952+
26953+ /* NOTE-NIKITA this only works if @parent is -the- parent of
26954+ @object, viz. object whose key is stored in dotdot
26955+ entry. Wouldn't work with hard-links on directories. */
26956+ memset(&goodby_dots, 0, sizeof goodby_dots);
26957+ entry.obj = goodby_dots.d_inode = parent;
26958+ goodby_dots.d_name.name = "..";
26959+ goodby_dots.d_name.len = 2;
26960+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
26961+ reiser4_free_dentry_fsdata(&goodby_dots);
26962+ if (result == 0) {
26963+ /* the dot should be the only entry remaining at this time... */
26964+ assert("nikita-3400",
26965+ object->i_size == 1 && object->i_nlink <= 2);
26966+#if 0
26967+ /* and, together with the only name a directory can have, they
26968+ * provide for the last 2 remaining references. If we get
26969+ * here as part of error handling during mkdir, @object
26970+ * possibly has no name yet, so its nlink == 1. If we get here
26971+ * from rename (targeting an empty directory), it has already
26972+ * lost its name, so its nlink == 1. */
26973+ assert("nikita-3401",
26974+ object->i_nlink == 2 || object->i_nlink == 1);
26975+#endif
26976+
26977+ /* decrement nlink of the directory the removed ".."
26978+ pointed to */
26979+ reiser4_del_nlink(parent, NULL, 0);
26980+ }
26981+ return result;
26982+}
26983+
26984+/* this is common implementation of estimate.add_entry method of
26985+ dir plugin
26986+ estimates adding an entry on the assumption that the entry is
26987+ inserted as a unit into an existing item
26988+*/
26989+reiser4_block_nr estimate_add_entry_common(const struct inode * inode)
26990+{
26991+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
26992+}
26993+
26994+/* this is common implementation of estimate.rem_entry method of dir
26995+ plugin
26996+*/
26997+reiser4_block_nr estimate_rem_entry_common(const struct inode * inode)
26998+{
26999+ return estimate_one_item_removal(reiser4_tree_by_inode(inode));
27000+}
27001+
27002+/* this is common implementation of estimate.unlink method of dir
27003+ plugin
27004+*/
27005+reiser4_block_nr
27006+dir_estimate_unlink_common(const struct inode * parent,
27007+ const struct inode * object)
27008+{
27009+ reiser4_block_nr res;
27010+
27011+ /* hashed_rem_entry(object) */
27012+ res = inode_dir_plugin(object)->estimate.rem_entry(object);
27013+ /* del_nlink(parent) */
27014+ res += 2 * inode_file_plugin(parent)->estimate.update(parent);
27015+
27016+ return res;
27017+}
27018+
27019+/*
27020+ * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
27021+ * methods: if @inode is a light-weight file, setup its credentials
27022+ * that are not stored in the stat-data in this case
27023+ */
27024+void check_light_weight(struct inode *inode, struct inode *parent)
27025+{
27026+ if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
27027+ inode->i_uid = parent->i_uid;
27028+ inode->i_gid = parent->i_gid;
27029+ /* clear the light-weight flag. If the inode were later read
27030+ via any other name, the [ug]id wouldn't change. */
27031+ reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
27032+ }
27033+}
27034+
27035+/* looks for the name specified in @dentry in directory @parent; if the name
27036+ is found, the key of the object the found entry points to is stored in @key */
27037+int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup for
27038+ * name in */
27039+ struct dentry *dentry, /* name to look for */
27040+ reiser4_key * key /* place to store key */ )
27041+{
27042+ int result;
27043+ coord_t *coord;
27044+ lock_handle lh;
27045+ const char *name;
27046+ int len;
27047+ reiser4_dir_entry_desc entry;
27048+ struct reiser4_dentry_fsdata *fsdata;
27049+
27050+ assert("nikita-1247", parent != NULL);
27051+ assert("nikita-1248", dentry != NULL);
27052+ assert("nikita-1123", dentry->d_name.name != NULL);
27053+ assert("vs-1486",
27054+ dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
27055+
27056+ name = dentry->d_name.name;
27057+ len = dentry->d_name.len;
27058+
27059+ if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
27060+ /* some arbitrary error code to return */
27061+ return RETERR(-ENAMETOOLONG);
27062+
27063+ fsdata = reiser4_get_dentry_fsdata(dentry);
27064+ if (IS_ERR(fsdata))
27065+ return PTR_ERR(fsdata);
27066+
27067+ coord = &fsdata->dec.entry_coord;
27068+ coord_clear_iplug(coord);
27069+ init_lh(&lh);
27070+
27071+ /* find entry in a directory. This is plugin method. */
27072+ result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK,
27073+ &entry);
27074+ if (result == 0) {
27075+ /* entry was found, extract object key from it. */
27076+ result =
27077+ WITH_COORD(coord,
27078+ item_plugin_by_coord(coord)->s.dir.
27079+ extract_key(coord, key));
27080+ }
27081+ done_lh(&lh);
27082+ return result;
27083+
27084+}
27085+
27086+/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
27087+static reiser4_block_nr
27088+estimate_init(struct inode *parent, struct inode *object)
27089+{
27090+ reiser4_block_nr res = 0;
27091+
27092+ assert("vpf-321", parent != NULL);
27093+ assert("vpf-322", object != NULL);
27094+
27095+ /* hashed_add_entry(object) */
27096+ res += inode_dir_plugin(object)->estimate.add_entry(object);
27097+ /* reiser4_add_nlink(object) */
27098+ res += inode_file_plugin(object)->estimate.update(object);
27099+ /* hashed_add_entry(object) */
27100+ res += inode_dir_plugin(object)->estimate.add_entry(object);
27101+ /* reiser4_add_nlink(parent) */
27102+ res += inode_file_plugin(parent)->estimate.update(parent);
27103+
27104+ return res;
27105+}
27106+
27107+/* helper function for reiser4_dir_init_common(). Create "." and ".." */
27108+static int create_dot_dotdot(struct inode *object /* object to create dot and
27109+ * dotdot for */ ,
27110+ struct inode *parent /* parent of @object */)
27111+{
27112+ int result;
27113+ struct dentry dots_entry;
27114+ reiser4_dir_entry_desc entry;
27115+
27116+ assert("nikita-688", object != NULL);
27117+ assert("nikita-689", S_ISDIR(object->i_mode));
27118+ assert("nikita-691", parent != NULL);
27119+
27120+ /* We store dot and dotdot as normal directory entries. This is
27121+ not necessary, because almost all information stored in them
27122+ is already in the stat-data of the directory; the only missing
27123+ piece is the objectid of the grand-parent directory, which
27124+ could easily be added there as an extension.
27125+
27126+ But it is done the way it is done, because not storing dot
27127+ and dotdot would lead to the following complications:
27128+
27129+ . special case handling in ->lookup().
27130+ . addition of another extension to the sd.
27131+ . dependency on key allocation policy for stat data.
27132+
27133+ */
27134+
27135+ memset(&entry, 0, sizeof entry);
27136+ memset(&dots_entry, 0, sizeof dots_entry);
27137+ entry.obj = dots_entry.d_inode = object;
27138+ dots_entry.d_name.name = ".";
27139+ dots_entry.d_name.len = 1;
27140+ result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
27141+ reiser4_free_dentry_fsdata(&dots_entry);
27142+
27143+ if (result == 0) {
27144+ result = reiser4_add_nlink(object, object, 0);
27145+ if (result == 0) {
27146+ entry.obj = dots_entry.d_inode = parent;
27147+ dots_entry.d_name.name = "..";
27148+ dots_entry.d_name.len = 2;
27149+ result = reiser4_add_entry_common(object,
27150+ &dots_entry, NULL, &entry);
27151+ reiser4_free_dentry_fsdata(&dots_entry);
27152+ /* if creation of ".." failed, iput() will delete
27153+ object with ".". */
27154+ if (result == 0) {
27155+ result = reiser4_add_nlink(parent, object, 0);
27156+ if (result != 0)
27157+ /*
27158+ * if we failed to bump i_nlink, try
27159+ * to remove ".."
27160+ */
27161+ reiser4_detach_common(object, parent);
27162+ }
27163+ }
27164+ }
27165+
27166+ if (result != 0) {
27167+ /*
27168+ * in the case of error, at least update the stat-data so that
27169+ * ->i_nlink updates are not left lingering.
27170+ */
27171+ reiser4_update_sd(object);
27172+ reiser4_update_sd(parent);
27173+ }
27174+
27175+ return result;
27176+}
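create_dot_dotdot() is a ladder of steps where each failure undoes only what already succeeded: a failed ".." relies on iput() to reap the object with ".", and a failed nlink bump removes ".." again via reiser4_detach_common(). The shape of that ladder in a standalone toy (names illustrative only):

#include <stdio.h>

static int toy_step(const char *what, int fail)
{
        printf("%-28s %s\n", what, fail ? "FAILED" : "ok");
        return fail ? -1 : 0;
}

int main(void)
{
        if (toy_step("add \".\"", 0))
                return 1;                      /* nothing to undo yet */
        if (toy_step("nlink++ on self", 0))
                return 1;                      /* iput() would reap "." */
        if (toy_step("add \"..\"", 0) == 0 &&
            toy_step("nlink++ on parent", 1) != 0)
                toy_step("detach \"..\" (rollback)", 0);
        return 0;
}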
27177+
27178+/*
27179+ * return 0 iff @coord contains a directory entry for the file with the name
27180+ * @name.
27181+ */
27182+static int
27183+check_item(const struct inode *dir, const coord_t * coord, const char *name)
27184+{
27185+ item_plugin *iplug;
27186+ char buf[DE_NAME_BUF_LEN];
27187+
27188+ iplug = item_plugin_by_coord(coord);
27189+ if (iplug == NULL) {
27190+ warning("nikita-1135", "Cannot get item plugin");
27191+ print_coord("coord", coord, 1);
27192+ return RETERR(-EIO);
27193+ } else if (item_id_by_coord(coord) !=
27194+ item_id_by_plugin(inode_dir_item_plugin(dir))) {
27195+ /* item id of the current item does not match the id of the
27196+ items the directory is built of */
27197+ warning("nikita-1136", "Wrong item plugin");
27198+ print_coord("coord", coord, 1);
27199+ return RETERR(-EIO);
27200+ }
27201+ assert("nikita-1137", iplug->s.dir.extract_name);
27202+
27203+ /* Compare name stored in this entry with name we are looking for.
27204+
27205+ NOTE-NIKITA Here should go code for support of something like
27206+ unicode, code tables, etc.
27207+ */
27208+ return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
27209+}
27210+
27211+static int
27212+check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name)
27213+{
27214+ return WITH_COORD(coord, check_item(dir, coord, name->name));
27215+}
27216+
27217+/*
27218+ * argument package used by entry_actor to scan entries with identical keys.
27219+ */
27220+struct entry_actor_args {
27221+ /* name we are looking for */
27222+ const char *name;
27223+ /* key of directory entry. entry_actor() scans through sequence of
27224+ * items/units having the same key */
27225+ reiser4_key *key;
27226+ /* how many entries with duplicate keys have been scanned so far */
27227+ int non_uniq;
27228+#if REISER4_USE_COLLISION_LIMIT
27229+ /* scan limit */
27230+ int max_non_uniq;
27231+#endif
27232+ /* return parameter: set to true, if ->name wasn't found */
27233+ int not_found;
27234+ /* what type of lock to take when moving to the next node during
27235+ * scan */
27236+ znode_lock_mode mode;
27237+
27238+ /* last coord that was visited during scan */
27239+ coord_t last_coord;
27240+ /* last node locked during scan */
27241+ lock_handle last_lh;
27242+ /* inode of directory */
27243+ const struct inode *inode;
27244+};
27245+
27246+/* Function called by reiser4_find_entry() to look for given name
27247+ in the directory. */
27248+static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
27249+ coord_t * coord /* current coord */ ,
27250+ lock_handle * lh /* current lock handle */ ,
27251+ void *entry_actor_arg /* argument to scan */ )
27252+{
27253+ reiser4_key unit_key;
27254+ struct entry_actor_args *args;
27255+
27256+ assert("nikita-1131", tree != NULL);
27257+ assert("nikita-1132", coord != NULL);
27258+ assert("nikita-1133", entry_actor_arg != NULL);
27259+
27260+ args = entry_actor_arg;
27261+ ++args->non_uniq;
27262+#if REISER4_USE_COLLISION_LIMIT
27263+ if (args->non_uniq > args->max_non_uniq) {
27264+ args->not_found = 1;
27265+ /* hash collision overflow. */
27266+ return RETERR(-EBUSY);
27267+ }
27268+#endif
27269+
27270+ /*
27271+ * did we just reach the end of the sequence of items/units with
27272+ * identical keys?
27273+ */
27274+ if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
27275+ assert("nikita-1791",
27276+ keylt(args->key, unit_key_by_coord(coord, &unit_key)));
27277+ args->not_found = 1;
27278+ args->last_coord.between = AFTER_UNIT;
27279+ return 0;
27280+ }
27281+
27282+ coord_dup(&args->last_coord, coord);
27283+ /*
27284+ * did the scan just move to the next node?
27285+ */
27286+ if (args->last_lh.node != lh->node) {
27287+ int lock_result;
27288+
27289+ /*
27290+ * if so, lock new node with the mode requested by the caller
27291+ */
27292+ done_lh(&args->last_lh);
27293+ assert("nikita-1896", znode_is_any_locked(lh->node));
27294+ lock_result = longterm_lock_znode(&args->last_lh, lh->node,
27295+ args->mode, ZNODE_LOCK_HIPRI);
27296+ if (lock_result != 0)
27297+ return lock_result;
27298+ }
27299+ return check_item(args->inode, coord, args->name);
27300+}
27301+
27302+/* Look for given @name within directory @dir.
27303+
27304+ This is called during lookup, creation and removal of directory
27305+ entries and on reiser4_rename_common
27306+
27307+ First calculate key that directory entry for @name would have. Search
27308+ for this key in the tree. If such key is found, scan all items with
27309+ the same key, checking name in each directory entry along the way.
27310+*/
27311+int reiser4_find_entry(struct inode *dir, /* directory to scan */
27312+ struct dentry *de, /* name to search for */
27313+ lock_handle * lh, /* resulting lock handle */
27314+ znode_lock_mode mode, /* required lock mode */
27315+ reiser4_dir_entry_desc * entry /* parameters of found
27316+ directory entry */)
27317+{
27318+ const struct qstr *name;
27319+ seal_t *seal;
27320+ coord_t *coord;
27321+ int result;
27322+ __u32 flags;
27323+ struct de_location *dec;
27324+ struct reiser4_dentry_fsdata *fsdata;
27325+
27326+ assert("nikita-1130", lh != NULL);
27327+ assert("nikita-1128", dir != NULL);
27328+
27329+ name = &de->d_name;
27330+ assert("nikita-1129", name != NULL);
27331+
27332+ /* dentry private data doesn't require a lock, because dentry
27333+ manipulations are protected by i_mutex on the parent.
27334+
27335+ This is not so for inodes, because there is no single -the-
27336+ parent in the inode case.
27337+ */
27338+ fsdata = reiser4_get_dentry_fsdata(de);
27339+ if (IS_ERR(fsdata))
27340+ return PTR_ERR(fsdata);
27341+ dec = &fsdata->dec;
27342+
27343+ coord = &dec->entry_coord;
27344+ coord_clear_iplug(coord);
27345+ seal = &dec->entry_seal;
27346+ /* compose key of directory entry for @name */
27347+ inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
27348+
27349+ if (reiser4_seal_is_set(seal)) {
27350+ /* check seal */
27351+ result = reiser4_seal_validate(seal, coord, &entry->key,
27352+ lh, mode, ZNODE_LOCK_LOPRI);
27353+ if (result == 0) {
27354+ /* key was found. Check that it is really item we are
27355+ looking for. */
27356+ result = check_entry(dir, coord, name);
27357+ if (result == 0)
27358+ return 0;
27359+ }
27360+ }
27361+ flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
27362+ /*
27363+ * find place in the tree where directory item should be located.
27364+ */
27365+ result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode,
27366+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
27367+ flags, NULL /*ra_info */ );
27368+ if (result == CBK_COORD_FOUND) {
27369+ struct entry_actor_args arg;
27370+
27371+ /* fast path: no hash collisions */
27372+ result = check_entry(dir, coord, name);
27373+ if (result == 0) {
27374+ reiser4_seal_init(seal, coord, &entry->key);
27375+ dec->pos = 0;
27376+ } else if (result > 0) {
27377+ /* Iterate through all units with the same keys. */
27378+ arg.name = name->name;
27379+ arg.key = &entry->key;
27380+ arg.not_found = 0;
27381+ arg.non_uniq = 0;
27382+#if REISER4_USE_COLLISION_LIMIT
27383+ arg.max_non_uniq = max_hash_collisions(dir);
27384+ assert("nikita-2851", arg.max_non_uniq > 1);
27385+#endif
27386+ arg.mode = mode;
27387+ arg.inode = dir;
27388+ coord_init_zero(&arg.last_coord);
27389+ init_lh(&arg.last_lh);
27390+
27391+ result = reiser4_iterate_tree
27392+ (reiser4_tree_by_inode(dir),
27393+ coord, lh,
27394+ entry_actor, &arg, mode, 1);
27395+ /* the end of the tree or of an extent was reached
27396+ during scanning */
27397+ if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
27398+ /* step back */
27399+ done_lh(lh);
27400+
27401+ result = zload(arg.last_coord.node);
27402+ if (result == 0) {
27403+ coord_clear_iplug(&arg.last_coord);
27404+ coord_dup(coord, &arg.last_coord);
27405+ move_lh(lh, &arg.last_lh);
27406+ result = RETERR(-ENOENT);
27407+ zrelse(arg.last_coord.node);
27408+ --arg.non_uniq;
27409+ }
27410+ }
27411+
27412+ done_lh(&arg.last_lh);
27413+ if (result == 0)
27414+ reiser4_seal_init(seal, coord, &entry->key);
27415+
27416+ if (result == 0 || result == -ENOENT) {
27417+ assert("nikita-2580", arg.non_uniq > 0);
27418+ dec->pos = arg.non_uniq - 1;
27419+ }
27420+ }
27421+ } else
27422+ dec->pos = -1;
27423+ return result;
27424+}
27425+
27426+/*
27427+ Local variables:
27428+ c-indentation-style: "K&R"
27429+ mode-name: "LC"
27430+ c-basic-offset: 8
27431+ tab-width: 8
27432+ fill-column: 120
27433+ scroll-step: 1
27434+ End:
27435+*/
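The seal fast path in reiser4_find_entry() amounts to a version-stamped location cache: remember where the entry was found last time, trust that spot only if the node is provably unchanged, and otherwise fall back to a full tree lookup. A self-contained toy with the same shape (this is not the real seal_t API):

#include <stdio.h>

struct toy_node { unsigned version; };                 /* bumped on change */
struct toy_seal { struct toy_node *node; unsigned version; };

static void toy_seal_set(struct toy_seal *s, struct toy_node *n)
{
        s->node = n;
        s->version = n->version;
}

static int toy_seal_valid(const struct toy_seal *s)
{
        return s->node != NULL && s->node->version == s->version;
}

int main(void)
{
        struct toy_node node = { .version = 1 };
        struct toy_seal seal = { NULL, 0 };

        toy_seal_set(&seal, &node);
        printf("%s\n", toy_seal_valid(&seal) ? "fast path" : "full lookup");
        node.version++;                        /* node was modified */
        printf("%s\n", toy_seal_valid(&seal) ? "fast path" : "full lookup");
        return 0;
}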
27436diff -urN linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format40.c
27437--- linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format40.c 1970-01-01 03:00:00.000000000 +0300
27438+++ linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format40.c 2007-12-04 16:49:30.000000000 +0300
27439@@ -0,0 +1,655 @@
27440+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
27441+
27442+#include "../../debug.h"
27443+#include "../../dformat.h"
27444+#include "../../key.h"
27445+#include "../node/node.h"
27446+#include "../space/space_allocator.h"
27447+#include "disk_format40.h"
27448+#include "../plugin.h"
27449+#include "../../txnmgr.h"
27450+#include "../../jnode.h"
27451+#include "../../tree.h"
27452+#include "../../super.h"
27453+#include "../../wander.h"
27454+#include "../../inode.h"
27455+#include "../../ktxnmgrd.h"
27456+#include "../../status_flags.h"
27457+
27458+#include <linux/types.h> /* for __u?? */
27459+#include <linux/fs.h> /* for struct super_block */
27460+#include <linux/buffer_head.h>
27461+
27462+/* reiser 4.0 default disk layout */
27463+
27464+/* Number of free blocks needed to perform release_format40 when the fs gets
27465+ mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
27466+ & tx record. */
27467+#define RELEASE_RESERVED 4
27468+
27469+/* The greatest supported format40 version number */
27470+#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION
27471+
27472+/* This flag indicates that backup should be updated
27473+ (the update is performed by fsck) */
27474+#define FORMAT40_UPDATE_BACKUP (1 << 31)
27475+
27476+/* functions to access fields of format40_disk_super_block */
27477+static __u64 get_format40_block_count(const format40_disk_super_block * sb)
27478+{
27479+ return le64_to_cpu(get_unaligned(&sb->block_count));
27480+}
27481+
27482+static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
27483+{
27484+ return le64_to_cpu(get_unaligned(&sb->free_blocks));
27485+}
27486+
27487+static __u64 get_format40_root_block(const format40_disk_super_block * sb)
27488+{
27489+ return le64_to_cpu(get_unaligned(&sb->root_block));
27490+}
27491+
27492+static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
27493+{
27494+ return le16_to_cpu(get_unaligned(&sb->tree_height));
27495+}
27496+
27497+static __u64 get_format40_file_count(const format40_disk_super_block * sb)
27498+{
27499+ return le64_to_cpu(get_unaligned(&sb->file_count));
27500+}
27501+
27502+static __u64 get_format40_oid(const format40_disk_super_block * sb)
27503+{
27504+ return le64_to_cpu(get_unaligned(&sb->oid));
27505+}
27506+
27507+static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
27508+{
27509+ return le32_to_cpu(get_unaligned(&sb->mkfs_id));
27510+}
27511+
27512+static __u64 get_format40_flags(const format40_disk_super_block * sb)
27513+{
27514+ return le64_to_cpu(get_unaligned(&sb->flags));
27515+}
27516+
27517+static __u32 get_format40_version(const format40_disk_super_block * sb)
27518+{
27519+ return le32_to_cpu(get_unaligned(&sb->version)) &
27520+ ~FORMAT40_UPDATE_BACKUP;
27521+}
27522+
27523+static int update_backup_version(const format40_disk_super_block * sb)
27524+{
27525+ return (le32_to_cpu(get_unaligned(&sb->version)) &
27526+ FORMAT40_UPDATE_BACKUP);
27527+}
27528+
27529+static int update_disk_version(const format40_disk_super_block * sb)
27530+{
27531+ return (get_format40_version(sb) < FORMAT40_VERSION);
27532+}
27533+
27534+static int incomplete_compatibility(const format40_disk_super_block * sb)
27535+{
27536+ return (get_format40_version(sb) > FORMAT40_VERSION);
27537+}
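The version word read by the accessors above is shared between a number and a flag: bit 31 is the FORMAT40_UPDATE_BACKUP marker for fsck, and the remaining bits hold the format version, which is why get_format40_version() masks the flag off. Standalone illustration (TOY_UPDATE_BACKUP mirrors the in-kernel constant):

#include <stdint.h>
#include <stdio.h>

#define TOY_UPDATE_BACKUP (1u << 31)   /* mirrors FORMAT40_UPDATE_BACKUP */

int main(void)
{
        uint32_t on_disk = 5u | TOY_UPDATE_BACKUP;  /* v5, backup stale */

        printf("version %u, backup update needed: %s\n",
               on_disk & ~TOY_UPDATE_BACKUP,
               (on_disk & TOY_UPDATE_BACKUP) ? "yes" : "no");
        return 0;
}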
27538+
27539+static format40_super_info *get_sb_info(struct super_block *super)
27540+{
27541+ return &get_super_private(super)->u.format40;
27542+}
27543+
27544+static int consult_diskmap(struct super_block *s)
27545+{
27546+ format40_super_info *info;
27547+ journal_location *jloc;
27548+
27549+ info = get_sb_info(s);
27550+ jloc = &get_super_private(s)->jloc;
27551+ /* Default format-specific locations, if there is nothing in
27552+ * diskmap */
27553+ jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
27554+ jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
27555+ info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
27556+#ifdef CONFIG_REISER4_BADBLOCKS
27557+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
27558+ &jloc->footer);
27559+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
27560+ &jloc->header);
27561+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
27562+ &info->loc.super);
27563+#endif
27564+ return 0;
27565+}
27566+
27567+/* find any valid super block of disk_format40 (even if the first
27568+ super block is destroyed); this will change the block numbers of the
27569+ actual journal header/footer (jh/jf) if needed */
27570+static struct buffer_head *find_a_disk_format40_super_block(struct super_block
27571+ *s)
27572+{
27573+ struct buffer_head *super_bh;
27574+ format40_disk_super_block *disk_sb;
27575+ format40_super_info *info;
27576+
27577+ assert("umka-487", s != NULL);
27578+
27579+ info = get_sb_info(s);
27580+
27581+ super_bh = sb_bread(s, info->loc.super);
27582+ if (super_bh == NULL)
27583+ return ERR_PTR(RETERR(-EIO));
27584+
27585+ disk_sb = (format40_disk_super_block *) super_bh->b_data;
27586+ if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
27587+ brelse(super_bh);
27588+ return ERR_PTR(RETERR(-EINVAL));
27589+ }
27590+
27591+ reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)));
27592+ reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) -
27593+ le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
27594+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
27595+
27596+ return super_bh;
27597+}
27598+
27599+/* find the most recent version of super block. This is called after journal is
27600+ replayed */
27601+static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
27602+{
27603+ /* Here the most recent superblock copy has to be read. However, as
27604+ journal replay isn't complete, we use the
27605+ find_a_disk_format40_super_block() function. */
27606+ return find_a_disk_format40_super_block(s);
27607+}
27608+
27609+static int get_super_jnode(struct super_block *s)
27610+{
27611+ reiser4_super_info_data *sbinfo = get_super_private(s);
27612+ jnode *sb_jnode;
27613+ int ret;
27614+
27615+ sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super);
27616+
27617+ ret = jload(sb_jnode);
27618+
27619+ if (ret) {
27620+ reiser4_drop_io_head(sb_jnode);
27621+ return ret;
27622+ }
27623+
27624+ pin_jnode_data(sb_jnode);
27625+ jrelse(sb_jnode);
27626+
27627+ sbinfo->u.format40.sb_jnode = sb_jnode;
27628+
27629+ return 0;
27630+}
27631+
27632+static void done_super_jnode(struct super_block *s)
27633+{
27634+ jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
27635+
27636+ if (sb_jnode) {
27637+ unpin_jnode_data(sb_jnode);
27638+ reiser4_drop_io_head(sb_jnode);
27639+ }
27640+}
27641+
27642+typedef enum format40_init_stage {
27643+ NONE_DONE = 0,
27644+ CONSULT_DISKMAP,
27645+ FIND_A_SUPER,
27646+ INIT_JOURNAL_INFO,
27647+ INIT_STATUS,
27648+ JOURNAL_REPLAY,
27649+ READ_SUPER,
27650+ KEY_CHECK,
27651+ INIT_OID,
27652+ INIT_TREE,
27653+ JOURNAL_RECOVER,
27654+ INIT_SA,
27655+ INIT_JNODE,
27656+ ALL_DONE
27657+} format40_init_stage;
27658+
27659+static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
27660+{
27661+ format40_disk_super_block *sb_copy;
27662+
27663+ sb_copy = kmalloc(sizeof(format40_disk_super_block),
27664+ reiser4_ctx_gfp_mask_get());
27665+ if (sb_copy == NULL)
27666+ return ERR_PTR(RETERR(-ENOMEM));
27667+ memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data),
27668+ sizeof(format40_disk_super_block));
27669+ return sb_copy;
27670+}
27671+
27672+static int check_key_format(const format40_disk_super_block *sb_copy)
27673+{
27674+ if (!equi(REISER4_LARGE_KEY,
27675+ get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
27676+ warning("nikita-3228", "Key format mismatch. "
27677+ "Only %s keys are supported.",
27678+ REISER4_LARGE_KEY ? "large" : "small");
27679+ return RETERR(-EINVAL);
27680+ }
27681+ return 0;
27682+}
27683+
27684+/**
27685+ * try_init_format40
27686+ * @super:
27687+ * @stage:
27688+ *
27689+ */
27690+static int try_init_format40(struct super_block *super,
27691+ format40_init_stage *stage)
27692+{
27693+ int result;
27694+ struct buffer_head *super_bh;
27695+ reiser4_super_info_data *sbinfo;
27696+ format40_disk_super_block *sb_copy;
27697+ tree_level height;
27698+ reiser4_block_nr root_block;
27699+ node_plugin *nplug;
27700+
27701+ assert("vs-475", super != NULL);
27702+ assert("vs-474", get_super_private(super));
27703+
27704+ *stage = NONE_DONE;
27705+
27706+ result = consult_diskmap(super);
27707+ if (result)
27708+ return result;
27709+ *stage = CONSULT_DISKMAP;
27710+
27711+ super_bh = find_a_disk_format40_super_block(super);
27712+ if (IS_ERR(super_bh))
27713+ return PTR_ERR(super_bh);
27714+ brelse(super_bh);
27715+ *stage = FIND_A_SUPER;
27716+
27717+ /* ok, we are sure that filesystem format is a format40 format */
27718+
27719+ /* map jnodes for journal control blocks (header, footer) to disk */
27720+ result = reiser4_init_journal_info(super);
27721+ if (result)
27722+ return result;
27723+ *stage = INIT_JOURNAL_INFO;
27724+
27726+ /* Now check its state */
27727+ result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
27728+ if (result != 0 && result != -EINVAL)
27729+ /* -EINVAL means there is no magic, so probably just old
27730+ * fs. */
27731+ return result;
27732+ *stage = INIT_STATUS;
27733+
27734+ result = reiser4_status_query(NULL, NULL);
27735+ if (result == REISER4_STATUS_MOUNT_WARN)
27736+ notice("vpf-1363", "Warning: mounting %s with errors.",
27737+ super->s_id);
27738+ if (result == REISER4_STATUS_MOUNT_RO)
27739+ notice("vpf-1364", "Warning: mounting %s with fatal errors,"
27740+ " forcing read-only mount.", super->s_id);
27741+ result = reiser4_journal_replay(super);
27742+ if (result)
27743+ return result;
27744+ *stage = JOURNAL_REPLAY;
27745+
27746+ super_bh = read_super_block(super);
27747+ if (IS_ERR(super_bh))
27748+ return PTR_ERR(super_bh);
27749+ *stage = READ_SUPER;
27750+
27751+ /* allocate and make a copy of format40_disk_super_block */
27752+ sb_copy = copy_sb(super_bh);
27753+ brelse(super_bh);
27754+
27755+ if (IS_ERR(sb_copy))
27756+ return PTR_ERR(sb_copy);
27757+ printk("reiser4: %s: found disk format 4.0.%u.\n",
27758+ super->s_id,
27759+ get_format40_version(sb_copy));
27760+ if (incomplete_compatibility(sb_copy))
27761+ printk("reiser4: Warning: The last completely supported "
27762+ "version of disk format40 is %u. Some objects of "
27763+ "the semantic tree can be unaccessible.\n",
27764+ FORMAT40_VERSION);
27765+ /* make sure that key format of kernel and filesystem match */
27766+ result = check_key_format(sb_copy);
27767+ if (result) {
27768+ kfree(sb_copy);
27769+ return result;
27770+ }
27771+ *stage = KEY_CHECK;
27772+
27773+ result = oid_init_allocator(super, get_format40_file_count(sb_copy),
27774+ get_format40_oid(sb_copy));
27775+ if (result) {
27776+ kfree(sb_copy);
27777+ return result;
27778+ }
27779+ *stage = INIT_OID;
27780+
27781+ /* get things necessary to init reiser4_tree */
27782+ root_block = get_format40_root_block(sb_copy);
27783+ height = get_format40_tree_height(sb_copy);
27784+ nplug = node_plugin_by_id(NODE40_ID);
27785+
27786+ /* initialize reiser4_super_info_data */
27787+ sbinfo = get_super_private(super);
27788+ assert("", sbinfo->tree.super == super);
27789+ /* init reiser4_tree for the filesystem */
27790+ result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug);
27791+ if (result) {
27792+ kfree(sb_copy);
27793+ return result;
27794+ }
27795+ *stage = INIT_TREE;
27796+
27797+ /*
27798+ * initialize reiser4_super_info_data with data from format40 super
27799+ * block
27800+ */
27801+ sbinfo->default_uid = 0;
27802+ sbinfo->default_gid = 0;
27803+ sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
27804+ /* number of blocks in filesystem and reserved space */
27805+ reiser4_set_block_count(super, get_format40_block_count(sb_copy));
27806+ sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
27807+ sbinfo->version = get_format40_version(sb_copy);
27808+
27809+ if (update_backup_version(sb_copy))
27810+ printk("reiser4: Warning: metadata backup is not updated. "
27811+ "Please run 'fsck.reiser4 --fix' on %s.\n",
27812+ super->s_id);
27813+ kfree(sb_copy); /* freed only after its last use above */
27814+
27815+ sbinfo->fsuid = 0;
27816+ sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
27817+ * are not supported */
27818+ sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in
27819+ * layout 40 are
27820+ * of one
27821+ * plugin */
27822+ /* sbinfo->tmgr is initialized already */
27823+
27824+ /* recover sb data which were logged separately from sb block */
27825+
27826+ /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
27827+ * oid_init_allocator() and reiser4_set_free_blocks() with new
27828+ * data. What's the reason to call them above? */
27829+ result = reiser4_journal_recover_sb_data(super);
27830+ if (result != 0)
27831+ return result;
27832+ *stage = JOURNAL_RECOVER;
27833+
27834+ /*
27835+ * Set the number of used blocks. The number of used blocks is stored
27836+ * neither in the on-disk super block nor in the journal footer blocks. At
27837+ * this moment the actual values of the total block and free block counters
27838+ * are set in the reiser4 super block (in-memory structure) and we can
27839+ * calculate number of used blocks from them.
27840+ */
27841+ reiser4_set_data_blocks(super,
27842+ reiser4_block_count(super) -
27843+ reiser4_free_blocks(super));
27844+
27845+#if REISER4_DEBUG
27846+ sbinfo->min_blocks_used = 16 /* reserved area */ +
27847+ 2 /* super blocks */ +
27848+ 2 /* journal footer and header */ ;
27849+#endif
27850+
27851+ /* init disk space allocator */
27852+ result = sa_init_allocator(reiser4_get_space_allocator(super),
27853+ super, NULL);
27854+ if (result)
27855+ return result;
27856+ *stage = INIT_SA;
27857+
27858+ result = get_super_jnode(super);
27859+ if (result == 0)
27860+ *stage = ALL_DONE;
27861+ return result;
27862+}
27863+
27864+/* plugin->u.format.get_ready */
27865+int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
27866+{
27867+ int result;
27868+ format40_init_stage stage;
27869+
27870+ result = try_init_format40(s, &stage);
27871+ switch (stage) {
27872+ case ALL_DONE:
27873+ assert("nikita-3458", result == 0);
27874+ break;
27875+ case INIT_JNODE:
27876+ done_super_jnode(s);
27877+ case INIT_SA:
27878+ sa_destroy_allocator(reiser4_get_space_allocator(s), s);
27879+ case JOURNAL_RECOVER:
27880+ case INIT_TREE:
27881+ reiser4_done_tree(&get_super_private(s)->tree);
27882+ case INIT_OID:
27883+ case KEY_CHECK:
27884+ case READ_SUPER:
27885+ case JOURNAL_REPLAY:
27886+ case INIT_STATUS:
27887+ reiser4_status_finish();
27888+ case INIT_JOURNAL_INFO:
27889+ reiser4_done_journal_info(s);
27890+ case FIND_A_SUPER:
27891+ case CONSULT_DISKMAP:
27892+ case NONE_DONE:
27893+ break;
27894+ default:
27895+ impossible("nikita-3457", "init stage: %i", stage);
27896+ }
27897+
27898+ if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED)
27899+ return RETERR(-ENOSPC);
27900+
27901+ return result;
27902+}
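try_init_format40() records how far setup got in *stage, and init_format_format40() tears down by falling through a switch from the last completed stage toward the first, so each case undoes exactly one stage. The pattern in miniature (toy stages, deliberate fall-through):

#include <stdio.h>

enum toy_stage { TOY_NONE = 0, TOY_A_DONE, TOY_B_DONE, TOY_ALL_DONE };

static enum toy_stage toy_try_init(int fail_after)
{
        if (fail_after < 1) return TOY_NONE;
        if (fail_after < 2) return TOY_A_DONE;
        if (fail_after < 3) return TOY_B_DONE;
        return TOY_ALL_DONE;
}

int main(void)
{
        switch (toy_try_init(2)) {             /* B completed, C failed */
        case TOY_ALL_DONE:
                printf("init complete\n");
                break;
        case TOY_B_DONE:
                printf("undo B\n");
                /* fall through */
        case TOY_A_DONE:
                printf("undo A\n");
                /* fall through */
        case TOY_NONE:
                break;
        }
        return 0;
}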
27903+
27904+static void pack_format40_super(const struct super_block *s, char *data)
27905+{
27906+ format40_disk_super_block *super_data =
27907+ (format40_disk_super_block *) data;
27908+
27909+ reiser4_super_info_data *sbinfo = get_super_private(s);
27910+
27911+ assert("zam-591", data != NULL);
27912+
27913+ put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
27914+ &super_data->free_blocks);
27915+
27916+ put_unaligned(cpu_to_le64(sbinfo->tree.root_block),
27917+ &super_data->root_block);
27918+
27919+ put_unaligned(cpu_to_le64(oid_next(s)),
27920+ &super_data->oid);
27921+
27922+ put_unaligned(cpu_to_le64(oids_used(s)),
27923+ &super_data->file_count);
27924+
27925+ put_unaligned(cpu_to_le16(sbinfo->tree.height),
27926+ &super_data->tree_height);
27927+
27928+ if (update_disk_version(super_data)) {
27929+ __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP;
27930+
27931+ put_unaligned(cpu_to_le32(version), &super_data->version);
27932+ }
27933+}
27934+
27935+/* plugin->u.format.log_super
27936+ return a jnode which should be added to transaction when the super block
27937+ gets logged */
27938+jnode *log_super_format40(struct super_block *s)
27939+{
27940+ jnode *sb_jnode;
27941+
27942+ sb_jnode = get_super_private(s)->u.format40.sb_jnode;
27943+
27944+ jload(sb_jnode);
27945+
27946+ pack_format40_super(s, jdata(sb_jnode));
27947+
27948+ jrelse(sb_jnode);
27949+
27950+ return sb_jnode;
27951+}
27952+
27953+/* plugin->u.format.release */
27954+int release_format40(struct super_block *s)
27955+{
27956+ int ret;
27957+ reiser4_super_info_data *sbinfo;
27958+
27959+ sbinfo = get_super_private(s);
27960+ assert("zam-579", sbinfo != NULL);
27961+
27962+ if (!rofs_super(s)) {
27963+ ret = reiser4_capture_super_block(s);
27964+ if (ret != 0)
27965+ warning("vs-898",
27966+ "reiser4_capture_super_block failed: %d",
27967+ ret);
27968+
27969+ ret = txnmgr_force_commit_all(s, 1);
27970+ if (ret != 0)
27971+ warning("jmacd-74438", "txn_force failed: %d", ret);
27972+
27973+ all_grabbed2free();
27974+ }
27975+
27976+ sa_destroy_allocator(&sbinfo->space_allocator, s);
27977+ reiser4_done_journal_info(s);
27978+ done_super_jnode(s);
27979+
27980+ rcu_barrier();
27981+ reiser4_done_tree(&sbinfo->tree);
27982+ /* call rcu_barrier() again, because some znodes were "released" in
27983+ * reiser4_done_tree(). */
27984+ rcu_barrier();
27985+
27986+ return 0;
27987+}
27988+
27989+#define FORMAT40_ROOT_LOCALITY 41
27990+#define FORMAT40_ROOT_OBJECTID 42
27991+
27992+/* plugin->u.format.root_dir_key */
27993+const reiser4_key *root_dir_key_format40(const struct super_block *super
27994+ UNUSED_ARG)
27995+{
27996+ static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
27997+ .el = {
27998+ __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
27999+#if REISER4_LARGE_KEY
28000+ ON_LARGE_KEY(0ull,)
28001+#endif
28002+ __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
28003+ 0ull
28004+ }
28005+ };
28006+
28007+ return &FORMAT40_ROOT_DIR_KEY;
28008+}
28009+
28010+/* plugin->u.format.check_open.
28011+ Check the opened object for validity. For now it checks only that the oid &
28012+ locality are valid; this can be improved later, and its behavior may depend
28013+ on the mount options. */
28014+int check_open_format40(const struct inode *object)
28015+{
28016+ oid_t max, oid;
28017+
28018+ max = oid_next(object->i_sb) - 1;
28019+
28020+ /* Check the oid. */
28021+ oid = get_inode_oid(object);
28022+ if (oid > max) {
28023+ warning("vpf-1360", "The object with the oid %llu "
28024+ "greater then the max used oid %llu found.",
28025+ (unsigned long long)oid, (unsigned long long)max);
28026+
28027+ return RETERR(-EIO);
28028+ }
28029+
28030+ /* Check the locality. */
28031+ oid = reiser4_inode_data(object)->locality_id;
28032+ if (oid > max) {
28033+ warning("vpf-1361", "The object with the locality %llu "
28034+ "greater then the max used oid %llu found.",
28035+ (unsigned long long)oid, (unsigned long long)max);
28036+
28037+ return RETERR(-EIO);
28038+ }
28039+
28040+ return 0;
28041+}
28042+
28043+/* plugin->u.format.version_update.
28044+ Perform all version update operations to bring the on-disk
28045+ format40_disk_super_block.version up to FORMAT40_VERSION.
28046+ */
28047+int version_update_format40(struct super_block *super) {
28048+ txn_handle * trans;
28049+ lock_handle lh;
28050+ txn_atom *atom;
28051+ int ret;
28052+
28053+ /* Nothing to do on an RO mount or if the on-disk version is not older. */
28054+ if (super->s_flags & MS_RDONLY)
28055+ return 0;
28056+
28057+ if (get_super_private(super)->version >= FORMAT40_VERSION)
28058+ return 0;
28059+
28060+ printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata "
28061+ "backup is left unchanged. Please run 'fsck.reiser4 --fix' "
28062+ "on %s to update it too.\n", FORMAT40_VERSION, super->s_id);
28063+
28064+ /* Mark the uber znode dirty to call log_super on write_logs. */
28065+ init_lh(&lh);
28066+ ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK,
28067+ ZNODE_LOCK_HIPRI, &lh);
28068+ if (ret != 0)
28069+ return ret;
28070+
28071+ znode_make_dirty(lh.node);
28072+ done_lh(&lh);
28073+
28074+ /* Update the backup blocks. */
28075+
28076+ /* Force write_logs immediately. */
28077+ trans = get_current_context()->trans;
28078+ atom = get_current_atom_locked();
28079+ assert("vpf-1906", atom != NULL);
28080+
28081+ spin_lock_txnh(trans);
28082+ return force_commit_atom(trans);
28083+}
28084+
28085+/* Make Linus happy.
28086+ Local variables:
28087+ c-indentation-style: "K&R"
28088+ mode-name: "LC"
28089+ c-basic-offset: 8
28090+ tab-width: 8
28091+ fill-column: 120
28092+ scroll-step: 1
28093+ End:
28094+*/
28095diff -urN linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format40.h
28096--- linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format40.h 1970-01-01 03:00:00.000000000 +0300
28097+++ linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format40.h 2007-12-04 16:49:30.000000000 +0300
28098@@ -0,0 +1,109 @@
28099+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28100+
28101+/* this file contains:
28102+   - definition of the on-disk super block of the standard disk layout for
28103+ reiser 4.0 (layout 40)
28104+ - definition of layout 40 specific portion of in-core super block
28105+ - declarations of functions implementing methods of layout plugin
28106+ for layout 40
28107+ - declarations of functions used to get/set fields in layout 40 super block
28108+*/
28109+
28110+#ifndef __DISK_FORMAT40_H__
28111+#define __DISK_FORMAT40_H__
28112+
28113+/* magic for default reiser4 layout */
28114+#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
28115+#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
28116+
28117+#include "../../dformat.h"
28118+
28119+#include <linux/fs.h> /* for struct super_block */
28120+
28121+typedef enum {
28122+ FORMAT40_LARGE_KEYS
28123+} format40_flags;
28124+
28125+/* on-disk super block for format 40. It is 512 bytes long */
28126+typedef struct format40_disk_super_block {
28127+ /* 0 */ d64 block_count;
28128+	/* number of blocks in the filesystem */
28129+ /* 8 */ d64 free_blocks;
28130+ /* number of free blocks */
28131+ /* 16 */ d64 root_block;
28132+ /* filesystem tree root block */
28133+ /* 24 */ d64 oid;
28134+ /* smallest free objectid */
28135+ /* 32 */ d64 file_count;
28136+ /* number of files in a filesystem */
28137+ /* 40 */ d64 flushes;
28138+	/* number of times the super block was
28139+	   flushed. Needed if format 40
28140+	   ever has several super blocks */
28141+ /* 48 */ d32 mkfs_id;
28142+ /* unique identifier of fs */
28143+ /* 52 */ char magic[16];
28144+ /* magic string ReIsEr40FoRmAt */
28145+ /* 68 */ d16 tree_height;
28146+ /* height of filesystem tree */
28147+ /* 70 */ d16 formatting_policy;
28148+ /* not used anymore */
28149+ /* 72 */ d64 flags;
28150+ /* 80 */ d32 version;
28151+ /* on-disk format version number
28152+	   initially assigned by mkfs as the greatest format40
28153+	   version number supported by reiser4progs, and updated
28154+	   at mount time in accordance with the greatest format40
28155+	   version number supported by the kernel.
28156+	   It is used by fsck to catch possible corruption and
28157+	   for various compatibility issues */
28158+ /* 84 */ char not_used[428];
28159+} format40_disk_super_block;
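
The offset comments in the struct above can be verified mechanically. A self-contained sketch, assuming d64/d32/d16 are packed little-endian 64/32/16-bit on-disk integers:

/* Userspace mirror of the 512-byte format40 super block layout, for
 * checking the field offsets given in the comments above. d64/d32/d16
 * are assumed to be packed 64/32/16-bit on-disk integers. */
#include <stddef.h>
#include <stdint.h>

struct sb40 {
	uint64_t block_count;		/* 0 */
	uint64_t free_blocks;		/* 8 */
	uint64_t root_block;		/* 16 */
	uint64_t oid;			/* 24 */
	uint64_t file_count;		/* 32 */
	uint64_t flushes;		/* 40 */
	uint32_t mkfs_id;		/* 48 */
	char magic[16];			/* 52 */
	uint16_t tree_height;		/* 68 */
	uint16_t formatting_policy;	/* 70 */
	uint64_t flags;			/* 72 */
	uint32_t version;		/* 80 */
	char not_used[428];		/* 84 */
} __attribute__((packed));

_Static_assert(sizeof(struct sb40) == 512, "super block is 512 bytes");
_Static_assert(offsetof(struct sb40, magic) == 52, "magic at offset 52");
_Static_assert(offsetof(struct sb40, version) == 80, "version at offset 80");

int main(void) { return 0; }
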
28160+
28161+/* format 40 specific part of reiser4_super_info_data */
28162+typedef struct format40_super_info {
28163+/* format40_disk_super_block actual_sb; */
28164+ jnode *sb_jnode;
28165+ struct {
28166+ reiser4_block_nr super;
28167+ } loc;
28168+} format40_super_info;
28169+
28170+/* Defines for journal header and footer respectively. */
28171+#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
28172+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
28173+
28174+#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
28175+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
28176+
28177+#define FORMAT40_STATUS_BLOCKNR \
28178+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
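
For a concrete picture of the block numbers above: assuming REISER4_MASTER_OFFSET = 65536 and PAGE_CACHE_SIZE = 4096, the master area starts at block 16, so the journal header, journal footer and status block land at blocks 19, 20 and 21:

/* Worked example of the block number arithmetic above, assuming
 * REISER4_MASTER_OFFSET = 65536 and PAGE_CACHE_SIZE = 4096. */
#include <stdio.h>

int main(void)
{
	const unsigned long master_offset = 65536;	/* assumption */
	const unsigned long page_size = 4096;		/* assumption */
	unsigned long base = master_offset / page_size;	/* 16 */

	printf("journal header: %lu\n", base + 3);	/* 19 */
	printf("journal footer: %lu\n", base + 4);	/* 20 */
	printf("fs status:      %lu\n", base + 5);	/* 21 */
	return 0;
}
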
28179+
28180+/* Diskmap declarations */
28181+#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
28182+#define FORMAT40_SUPER 1
28183+#define FORMAT40_JH 2
28184+#define FORMAT40_JF 3
28185+
28186+/* declarations of functions implementing methods of layout plugin for
28187+   format 40. The functions themselves are in disk_format40.c */
28188+extern int init_format_format40(struct super_block *, void *data);
28189+extern const reiser4_key *root_dir_key_format40(const struct super_block *);
28190+extern int release_format40(struct super_block *s);
28191+extern jnode *log_super_format40(struct super_block *s);
28192+extern int check_open_format40(const struct inode *object);
28193+extern int version_update_format40(struct super_block *super);
28194+
28195+/* __DISK_FORMAT40_H__ */
28196+#endif
28197+
28198+/* Make Linus happy.
28199+ Local variables:
28200+ c-indentation-style: "K&R"
28201+ mode-name: "LC"
28202+ c-basic-offset: 8
28203+ tab-width: 8
28204+ fill-column: 120
28205+ scroll-step: 1
28206+ End:
28207+*/
28208diff -urN linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format.c
28209--- linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format.c 1970-01-01 03:00:00.000000000 +0300
28210+++ linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format.c 2007-12-04 16:49:30.000000000 +0300
28211@@ -0,0 +1,38 @@
28212+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28213+
28214+#include "../../debug.h"
28215+#include "../plugin_header.h"
28216+#include "disk_format40.h"
28217+#include "disk_format.h"
28218+#include "../plugin.h"
28219+
28220+/* initialization of disk layout plugins */
28221+disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
28222+ [FORMAT40_ID] = {
28223+ .h = {
28224+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
28225+ .id = FORMAT40_ID,
28226+ .pops = NULL,
28227+ .label = "reiser40",
28228+ .desc = "standard disk layout for reiser40",
28229+ .linkage = {NULL, NULL}
28230+ },
28231+ .init_format = init_format_format40,
28232+ .root_dir_key = root_dir_key_format40,
28233+ .release = release_format40,
28234+ .log_super = log_super_format40,
28235+ .check_open = check_open_format40,
28236+ .version_update = version_update_format40
28237+ }
28238+};
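
Callers reach these methods by indexing the table with a disk_format_id and calling through the function pointers. A self-contained userspace analog of this dispatch pattern (all names here are illustrative, not from the kernel sources):

/* A self-contained analog of the plugin dispatch pattern used above:
 * an array of method tables indexed by plugin id. Names illustrative. */
#include <stdio.h>

typedef enum { FMT40, LAST_FMT } fmt_id;

typedef struct {
	const char *label;
	int (*init_format)(void *sb);
} fmt_plugin;

static int init40(void *sb) { (void)sb; return 0; }

static fmt_plugin fmt_plugins[LAST_FMT] = {
	[FMT40] = { .label = "reiser40", .init_format = init40 },
};

int main(void)
{
	fmt_plugin *p = &fmt_plugins[FMT40];

	/* dispatch through the table, as reiser4 does with format_plugins */
	printf("%s -> %d\n", p->label, p->init_format(NULL));
	return 0;
}
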
28239+
28240+/* Make Linus happy.
28241+ Local variables:
28242+ c-indentation-style: "K&R"
28243+ mode-name: "LC"
28244+ c-basic-offset: 8
28245+ tab-width: 8
28246+ fill-column: 120
28247+ scroll-step: 1
28248+ End:
28249+*/
28250diff -urN linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format.h
28251--- linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 03:00:00.000000000 +0300
28252+++ linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format.h 2007-12-04 16:49:30.000000000 +0300
28253@@ -0,0 +1,27 @@
28254+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28255+
28256+/* identifiers for disk layouts; they are also used as indexes into the array
28257+   of disk format plugins */
28258+
28259+#if !defined( __REISER4_DISK_FORMAT_H__ )
28260+#define __REISER4_DISK_FORMAT_H__
28261+
28262+typedef enum {
28263+ /* standard reiser4 disk layout plugin id */
28264+ FORMAT40_ID,
28265+ LAST_FORMAT_ID
28266+} disk_format_id;
28267+
28268+/* __REISER4_DISK_FORMAT_H__ */
28269+#endif
28270+
28271+/* Make Linus happy.
28272+ Local variables:
28273+ c-indentation-style: "K&R"
28274+ mode-name: "LC"
28275+ c-basic-offset: 8
28276+ tab-width: 8
28277+ fill-column: 120
28278+ scroll-step: 1
28279+ End:
28280+*/
28281diff -urN linux-2.6.23.orig/fs/reiser4/plugin/disk_format/Makefile linux-2.6.23/fs/reiser4/plugin/disk_format/Makefile
28282--- linux-2.6.23.orig/fs/reiser4/plugin/disk_format/Makefile 1970-01-01 03:00:00.000000000 +0300
28283+++ linux-2.6.23/fs/reiser4/plugin/disk_format/Makefile 2007-12-04 16:49:30.000000000 +0300
28284@@ -0,0 +1,5 @@
28285+obj-$(CONFIG_REISER4_FS) += df_plugins.o
28286+
28287+df_plugins-objs := \
28288+ disk_format40.o \
28289+ disk_format.o
28290diff -urN linux-2.6.23.orig/fs/reiser4/plugin/fibration.c linux-2.6.23/fs/reiser4/plugin/fibration.c
28291--- linux-2.6.23.orig/fs/reiser4/plugin/fibration.c 1970-01-01 03:00:00.000000000 +0300
28292+++ linux-2.6.23/fs/reiser4/plugin/fibration.c 2007-12-04 16:49:30.000000000 +0300
28293@@ -0,0 +1,175 @@
28294+/* Copyright 2004 by Hans Reiser, licensing governed by
28295+ * reiser4/README */
28296+
28297+/* Directory fibrations */
28298+
28299+/*
28300+ * Suppose we have a directory tree with the sources of some project. During
28301+ * compilation .o files are created within this tree. This makes access
28302+ * to the original source files less efficient, because the source files are
28303+ * now "diluted" by object files: the default directory plugin uses a prefix
28304+ * of a file name as a part of the key for a directory entry (and this
28305+ * part is also inherited by the key of the file body). This means that
28306+ * foo.o will be located close to foo.c and foo.h in the tree.
28307+ *
28308+ * To avoid this effect the directory plugin fills the highest 7 (originally
28309+ * unused) bits of the second component of the directory entry key with a
28310+ * bit-pattern that depends on the file name (see
28311+ * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
28312+ * the "fibre". The fibre of the file name key is inherited by the key of the
28313+ * stat data and the keys of the file body (in the case of REISER4_LARGE_KEY).
28314+ *
28315+ * The fibre for a given file is chosen by the per-directory fibration
28316+ * plugin. Names within a given fibre are ordered lexicographically.
28317+ */
28318+
28319+#include "../debug.h"
28320+#include "plugin_header.h"
28321+#include "plugin.h"
28322+#include "../super.h"
28323+#include "../inode.h"
28324+
28325+#include <linux/types.h>
28326+
28327+static const int fibre_shift = 57;
28328+
28329+#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
28330+
28331+/*
28332+ * Trivial fibration: all files of directory are just ordered
28333+ * lexicographically.
28334+ */
28335+static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
28336+{
28337+ return FIBRE_NO(0);
28338+}
28339+
28340+/*
28341+ * dot-o fibration: place .o files after all others.
28342+ */
28343+static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
28344+{
28345+ /* special treatment for .*\.o */
28346+ if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
28347+ return FIBRE_NO(1);
28348+ else
28349+ return FIBRE_NO(0);
28350+}
28351+
28352+/*
28353+ * ext.1 fibration: subdivide the directory into 128 fibres, one for each
28354+ * 7-bit extension character (the file "foo.h" goes into fibre "h"), plus
28355+ * a default fibre for the rest.
28356+ */
28357+static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
28358+{
28359+ if (len > 2 && name[len - 2] == '.')
28360+ return FIBRE_NO(name[len - 1]);
28361+ else
28362+ return FIBRE_NO(0);
28363+}
28364+
28365+/*
28366+ * ext.3 fibration: try to separate files with different 3-character
28367+ * extensions from each other.
28368+ */
28369+static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
28370+{
28371+ if (len > 4 && name[len - 4] == '.')
28372+ return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
28373+ else
28374+ return FIBRE_NO(0);
28375+}
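
With fibre_shift = 57, FIBRE_NO(n) places the fibre code in the top 7 bits of a 64-bit key component, leaving lexicographic ordering intact within each fibre. A self-contained sketch of the ext.1 policy above:

/* Sketch of the fibre computation: the fibre code lands in the top
 * 7 bits of a 64-bit key component (fibre_shift = 57). Userspace
 * illustration only. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define FIBRE_SHIFT 57
#define FIBRE_NO(n) (((uint64_t)(n)) << FIBRE_SHIFT)

/* ext.1 policy: one fibre per single-character extension */
static uint64_t ext_1_fibre(const char *name)
{
	size_t len = strlen(name);

	if (len > 2 && name[len - 2] == '.')
		return FIBRE_NO(name[len - 1]);
	return FIBRE_NO(0);
}

int main(void)
{
	/* "foo.h" -> fibre 'h' (0x68 << 57); "README" -> default fibre 0 */
	printf("foo.h  -> %#llx\n", (unsigned long long)ext_1_fibre("foo.h"));
	printf("README -> %#llx\n", (unsigned long long)ext_1_fibre("README"));
	return 0;
}
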
28376+
28377+static int change_fibration(struct inode *inode,
28378+ reiser4_plugin * plugin,
28379+ pset_member memb)
28380+{
28381+ int result;
28382+
28383+ assert("nikita-3503", inode != NULL);
28384+ assert("nikita-3504", plugin != NULL);
28385+
28386+ assert("nikita-3505", is_reiser4_inode(inode));
28387+ assert("nikita-3506", inode_dir_plugin(inode) != NULL);
28388+ assert("nikita-3507",
28389+ plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
28390+
28391+ result = 0;
28392+ if (inode_fibration_plugin(inode) == NULL ||
28393+ inode_fibration_plugin(inode)->h.id != plugin->h.id) {
28394+ if (is_dir_empty(inode) == 0)
28395+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
28396+ PSET_FIBRATION, plugin);
28397+ else
28398+ result = RETERR(-ENOTEMPTY);
28399+
28400+ }
28401+ return result;
28402+}
28403+
28404+static reiser4_plugin_ops fibration_plugin_ops = {
28405+ .init = NULL,
28406+ .load = NULL,
28407+ .save_len = NULL,
28408+ .save = NULL,
28409+ .change = change_fibration
28410+};
28411+
28412+/* fibration plugins */
28413+fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
28414+ [FIBRATION_LEXICOGRAPHIC] = {
28415+ .h = {
28416+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28417+ .id = FIBRATION_LEXICOGRAPHIC,
28418+ .pops = &fibration_plugin_ops,
28419+ .label = "lexicographic",
28420+ .desc = "no fibration",
28421+ .linkage = {NULL, NULL}
28422+ },
28423+ .fibre = fibre_trivial
28424+ },
28425+ [FIBRATION_DOT_O] = {
28426+ .h = {
28427+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28428+ .id = FIBRATION_DOT_O,
28429+ .pops = &fibration_plugin_ops,
28430+ .label = "dot-o",
28431+ .desc = "fibrate .o files separately",
28432+ .linkage = {NULL, NULL}
28433+ },
28434+ .fibre = fibre_dot_o
28435+ },
28436+ [FIBRATION_EXT_1] = {
28437+ .h = {
28438+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28439+ .id = FIBRATION_EXT_1,
28440+ .pops = &fibration_plugin_ops,
28441+ .label = "ext-1",
28442+ .desc = "fibrate file by single character extension",
28443+ .linkage = {NULL, NULL}
28444+ },
28445+ .fibre = fibre_ext_1
28446+ },
28447+ [FIBRATION_EXT_3] = {
28448+ .h = {
28449+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28450+ .id = FIBRATION_EXT_3,
28451+ .pops = &fibration_plugin_ops,
28452+ .label = "ext-3",
28453+ .desc = "fibrate file by three character extension",
28454+ .linkage = {NULL, NULL}
28455+ },
28456+ .fibre = fibre_ext_3
28457+ }
28458+};
28459+
28460+/*
28461+ * Local variables:
28462+ * c-indentation-style: "K&R"
28463+ * mode-name: "LC"
28464+ * c-basic-offset: 8
28465+ * tab-width: 8
28466+ * fill-column: 79
28467+ * End:
28468+ */
28469diff -urN linux-2.6.23.orig/fs/reiser4/plugin/fibration.h linux-2.6.23/fs/reiser4/plugin/fibration.h
28470--- linux-2.6.23.orig/fs/reiser4/plugin/fibration.h 1970-01-01 03:00:00.000000000 +0300
28471+++ linux-2.6.23/fs/reiser4/plugin/fibration.h 2007-12-04 16:49:30.000000000 +0300
28472@@ -0,0 +1,37 @@
28473+/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
28474+
28475+/* Fibration plugin used by hashed directory plugin to segment content
28476+ * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
28477+
28478+#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ )
28479+#define __FS_REISER4_PLUGIN_FIBRATION_H__
28480+
28481+#include "plugin_header.h"
28482+
28483+typedef struct fibration_plugin {
28484+ /* generic fields */
28485+ plugin_header h;
28486+
28487+ __u64(*fibre) (const struct inode * dir, const char *name, int len);
28488+} fibration_plugin;
28489+
28490+typedef enum {
28491+ FIBRATION_LEXICOGRAPHIC,
28492+ FIBRATION_DOT_O,
28493+ FIBRATION_EXT_1,
28494+ FIBRATION_EXT_3,
28495+ LAST_FIBRATION_ID
28496+} reiser4_fibration_id;
28497+
28498+/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
28499+#endif
28500+
28501+/* Make Linus happy.
28502+ Local variables:
28503+ c-indentation-style: "K&R"
28504+ mode-name: "LC"
28505+ c-basic-offset: 8
28506+ tab-width: 8
28507+ fill-column: 120
28508+ End:
28509+*/
28510diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/cryptcompress.c linux-2.6.23/fs/reiser4/plugin/file/cryptcompress.c
28511--- linux-2.6.23.orig/fs/reiser4/plugin/file/cryptcompress.c 1970-01-01 03:00:00.000000000 +0300
28512+++ linux-2.6.23/fs/reiser4/plugin/file/cryptcompress.c 2007-12-04 23:04:00.722303973 +0300
28513@@ -0,0 +1,3778 @@
28514+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
28515+ reiser4/README */
28516+/*
28517+ * Written by Edward Shishkin.
28518+ *
28519+ * Implementations of inode/file/address_space operations
28520+ * specific for cryptcompress file plugin which manages
28521+ * regular files built of compressed and/or encrypted bodies.
28522+ * See http://dev.namesys.com/CryptcompressPlugin for details.
28523+ */
28524+
28525+#include "../../inode.h"
28526+#include "../cluster.h"
28527+#include "../object.h"
28528+#include "../../tree_walk.h"
28529+#include "cryptcompress.h"
28530+
28531+#include <linux/pagevec.h>
28532+#include <asm/uaccess.h>
28533+#include <linux/swap.h>
28534+#include <linux/writeback.h>
28535+#include <linux/random.h>
28536+#include <linux/scatterlist.h>
28537+
28538+/*
28539+ Managing primary and secondary caches by Reiser4
28540+ cryptcompress file plugin. Synchronization scheme.
28541+
28542+
28543+ +------------------+
28544+ +------------------->| tfm stream |
28545+ | | (compressed data)|
28546+ flush | +------------------+
28547+ +-----------------+ |
28548+ |(->)longterm lock| V
28549+--+ writepages() | | +-***-+ reiser4 +---+
28550+ | | +--+ | *** | storage tree | |
28551+ | | | +-***-+ (primary cache)| |
28552+u | write() (secondary| cache) V / | \ | |
28553+s | ----> +----+ +----+ +----+ +----+ +-***** ******* **----+ ----> | d |
28554+e | | | |page cluster | | | **disk cluster** | | i |
28555+r | <---- +----+ +----+ +----+ +----+ +-***** **********----+ <---- | s |
28556+ | read() ^ ^ | | k |
28557+ | | (->)longterm lock| | page_io()| |
28558+ | | +------+ | |
28559+--+ readpages() | | +---+
28560+ | V
28561+ | +------------------+
28562+ +--------------------| tfm stream |
28563+ | (plain text) |
28564+ +------------------+
28565+*/
28566+
28567+/* get cryptcompress specific portion of inode */
28568+struct cryptcompress_info *cryptcompress_inode_data(const struct inode *inode)
28569+{
28570+ return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
28571+}
28572+
28573+/* plugin->u.file.init_inode_data */
28574+void init_inode_data_cryptcompress(struct inode *inode,
28575+ reiser4_object_create_data * crd,
28576+ int create)
28577+{
28578+ struct cryptcompress_info *data;
28579+
28580+ data = cryptcompress_inode_data(inode);
28581+ assert("edward-685", data != NULL);
28582+
28583+ memset(data, 0, sizeof(*data));
28584+
28585+ mutex_init(&data->checkin_mutex);
28586+ data->trunc_index = ULONG_MAX;
28587+ turn_on_compression(data);
28588+ set_lattice_factor(data, MIN_LATTICE_FACTOR);
28589+ init_inode_ordering(inode, crd, create);
28590+}
28591+
28592+/* The following is a part of the reiser4 cipher key manager,
28593+   which is called when opening/creating a cryptcompress file */
28594+
28595+/* get/set cipher key info */
28596+struct reiser4_crypto_info * inode_crypto_info (struct inode * inode)
28597+{
28598+ assert("edward-90", inode != NULL);
28599+ assert("edward-91", reiser4_inode_data(inode) != NULL);
28600+ return cryptcompress_inode_data(inode)->crypt;
28601+}
28602+
28603+static void set_inode_crypto_info (struct inode * inode,
28604+ struct reiser4_crypto_info * info)
28605+{
28606+ cryptcompress_inode_data(inode)->crypt = info;
28607+}
28608+
28609+/* allocate a cipher key info */
28610+struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode)
28611+{
28612+ struct reiser4_crypto_info *info;
28613+ int fipsize;
28614+
28615+ info = kzalloc(sizeof(*info), reiser4_ctx_gfp_mask_get());
28616+ if (!info)
28617+ return ERR_PTR(-ENOMEM);
28618+
28619+ fipsize = inode_digest_plugin(inode)->fipsize;
28620+ info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get());
28621+ if (!info->keyid) {
28622+ kfree(info);
28623+ return ERR_PTR(-ENOMEM);
28624+ }
28625+ info->host = inode;
28626+ return info;
28627+}
28628+
28629+#if 0
28630+/* allocate/free low-level info for cipher and digest
28631+ transforms */
28632+static int alloc_crypto_tfms(struct reiser4_crypto_info * info)
28633+{
28634+ struct crypto_blkcipher * ctfm = NULL;
28635+ struct crypto_hash * dtfm = NULL;
28636+ cipher_plugin * cplug = inode_cipher_plugin(info->host);
28637+ digest_plugin * dplug = inode_digest_plugin(info->host);
28638+
28639+ if (cplug->alloc) {
28640+ ctfm = cplug->alloc();
28641+ if (IS_ERR(ctfm)) {
28642+ warning("edward-1364",
28643+ "Can not allocate info for %s\n",
28644+ cplug->h.desc);
28645+ return RETERR(PTR_ERR(ctfm));
28646+ }
28647+ }
28648+ info_set_cipher(info, ctfm);
28649+ if (dplug->alloc) {
28650+ dtfm = dplug->alloc();
28651+ if (IS_ERR(dtfm)) {
28652+ warning("edward-1365",
28653+ "Can not allocate info for %s\n",
28654+ dplug->h.desc);
28655+ goto unhappy_with_digest;
28656+ }
28657+ }
28658+ info_set_digest(info, dtfm);
28659+ return 0;
28660+ unhappy_with_digest:
28661+ if (cplug->free) {
28662+ cplug->free(ctfm);
28663+ info_set_cipher(info, NULL);
28664+ }
28665+ return RETERR(PTR_ERR(dtfm));
28666+}
28667+#endif
28668+
28669+static void
28670+free_crypto_tfms(struct reiser4_crypto_info * info)
28671+{
28672+ assert("edward-1366", info != NULL);
28673+ if (!info_get_cipher(info)) {
28674+ assert("edward-1601", !info_get_digest(info));
28675+ return;
28676+ }
28677+ inode_cipher_plugin(info->host)->free(info_get_cipher(info));
28678+ info_set_cipher(info, NULL);
28679+ inode_digest_plugin(info->host)->free(info_get_digest(info));
28680+ info_set_digest(info, NULL);
28681+ return;
28682+}
28683+
28684+#if 0
28685+/* create a key fingerprint for disk stat-data */
28686+static int create_keyid (struct reiser4_crypto_info * info,
28687+ struct reiser4_crypto_data * data)
28688+{
28689+ int ret = -ENOMEM;
28690+ size_t blk, pad;
28691+ __u8 * dmem;
28692+ __u8 * cmem;
28693+ struct hash_desc ddesc;
28694+ struct blkcipher_desc cdesc;
28695+ struct scatterlist sg;
28696+
28697+ assert("edward-1367", info != NULL);
28698+ assert("edward-1368", info->keyid != NULL);
28699+
28700+ ddesc.tfm = info_get_digest(info);
28701+ ddesc.flags = 0;
28702+ cdesc.tfm = info_get_cipher(info);
28703+ cdesc.flags = 0;
28704+
28705+ dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm),
28706+ reiser4_ctx_gfp_mask_get());
28707+ if (!dmem)
28708+ goto exit1;
28709+
28710+ blk = crypto_blkcipher_blocksize(cdesc.tfm);
28711+
28712+ pad = data->keyid_size % blk;
28713+ pad = (pad ? blk - pad : 0);
28714+
28715+ cmem = kmalloc((size_t)data->keyid_size + pad,
28716+ reiser4_ctx_gfp_mask_get());
28717+ if (!cmem)
28718+ goto exit2;
28719+ memcpy(cmem, data->keyid, data->keyid_size);
28720+ memset(cmem + data->keyid_size, 0, pad);
28721+
28722+ sg_init_one(&sg, cmem, data->keyid_size + pad);
28723+
28724+ ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg,
28725+ data->keyid_size + pad);
28726+ if (ret) {
28727+ warning("edward-1369",
28728+ "encryption failed flags=%x\n", cdesc.flags);
28729+ goto exit3;
28730+ }
28731+ ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem);
28732+ if (ret) {
28733+ warning("edward-1602",
28734+ "digest failed flags=%x\n", ddesc.flags);
28735+ goto exit3;
28736+ }
28737+ memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize);
28738+ exit3:
28739+ kfree(cmem);
28740+ exit2:
28741+ kfree(dmem);
28742+ exit1:
28743+ return ret;
28744+}
28745+#endif
28746+
28747+static void destroy_keyid(struct reiser4_crypto_info * info)
28748+{
28749+ assert("edward-1370", info != NULL);
28750+ assert("edward-1371", info->keyid != NULL);
28751+ kfree(info->keyid);
28752+ return;
28753+}
28754+
28755+static void __free_crypto_info (struct inode * inode)
28756+{
28757+ struct reiser4_crypto_info * info = inode_crypto_info(inode);
28758+ assert("edward-1372", info != NULL);
28759+
28760+ free_crypto_tfms(info);
28761+ destroy_keyid(info);
28762+ kfree(info);
28763+}
28764+
28765+#if 0
28766+static void instantiate_crypto_info(struct reiser4_crypto_info * info)
28767+{
28768+ assert("edward-1373", info != NULL);
28769+ assert("edward-1374", info->inst == 0);
28770+ info->inst = 1;
28771+}
28772+#endif
28773+
28774+static void uninstantiate_crypto_info(struct reiser4_crypto_info * info)
28775+{
28776+ assert("edward-1375", info != NULL);
28777+ info->inst = 0;
28778+}
28779+
28780+#if 0
28781+static int is_crypto_info_instantiated(struct reiser4_crypto_info * info)
28782+{
28783+ return info->inst;
28784+}
28785+
28786+static int inode_has_cipher_key(struct inode * inode)
28787+{
28788+ assert("edward-1376", inode != NULL);
28789+ return inode_crypto_info(inode) &&
28790+ is_crypto_info_instantiated(inode_crypto_info(inode));
28791+}
28792+#endif
28793+
28794+static void free_crypto_info (struct inode * inode)
28795+{
28796+ uninstantiate_crypto_info(inode_crypto_info(inode));
28797+ __free_crypto_info(inode);
28798+}
28799+
28800+static int need_cipher(struct inode * inode)
28801+{
28802+ return inode_cipher_plugin(inode) !=
28803+ cipher_plugin_by_id(NONE_CIPHER_ID);
28804+}
28805+
28806+/* Parse @data, which contains an (uninstantiated) cipher key imported
28807+   from user space, create a low-level cipher info and attach it to
28808+   the @object. On success the info contains an instantiated key */
28809+#if 0
28810+struct reiser4_crypto_info * create_crypto_info(struct inode * object,
28811+ struct reiser4_crypto_data * data)
28812+{
28813+ int ret;
28814+ struct reiser4_crypto_info * info;
28815+
28816+ assert("edward-1377", data != NULL);
28817+ assert("edward-1378", need_cipher(object));
28818+
28819+ if (inode_file_plugin(object) !=
28820+ file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
28821+ return ERR_PTR(-EINVAL);
28822+
28823+ info = reiser4_alloc_crypto_info(object);
28824+ if (IS_ERR(info))
28825+ return info;
28826+ ret = alloc_crypto_tfms(info);
28827+ if (ret)
28828+ goto err;
28829+ /* instantiating a key */
28830+ ret = crypto_blkcipher_setkey(info_get_cipher(info),
28831+ data->key,
28832+ data->keysize);
28833+ if (ret) {
28834+ warning("edward-1379",
28835+ "setkey failed flags=%x",
28836+ crypto_blkcipher_get_flags(info_get_cipher(info)));
28837+ goto err;
28838+ }
28839+ info->keysize = data->keysize;
28840+ ret = create_keyid(info, data);
28841+ if (ret)
28842+ goto err;
28843+ instantiate_crypto_info(info);
28844+ return info;
28845+ err:
28846+ __free_crypto_info(object);
28847+ return ERR_PTR(ret);
28848+}
28849+#endif
28850+
28851+/* increment/decrement a load counter when
28852+   attaching/detaching the crypto-stat to/from any object */
28853+static void load_crypto_info(struct reiser4_crypto_info * info)
28854+{
28855+ assert("edward-1380", info != NULL);
28856+ inc_keyload_count(info);
28857+}
28858+
28859+static void unload_crypto_info(struct inode * inode)
28860+{
28861+ struct reiser4_crypto_info * info = inode_crypto_info(inode);
28862+ assert("edward-1381", info->keyload_count > 0);
28863+
28864+ dec_keyload_count(inode_crypto_info(inode));
28865+ if (info->keyload_count == 0)
28866+ /* final release */
28867+ free_crypto_info(inode);
28868+}
28869+
28870+/* attach/detach an existing crypto-stat */
28871+void reiser4_attach_crypto_info(struct inode * inode,
28872+ struct reiser4_crypto_info * info)
28873+{
28874+ assert("edward-1382", inode != NULL);
28875+ assert("edward-1383", info != NULL);
28876+ assert("edward-1384", inode_crypto_info(inode) == NULL);
28877+
28878+ set_inode_crypto_info(inode, info);
28879+ load_crypto_info(info);
28880+}
28881+
28882+/* returns true if a crypto-stat can be attached to the @host */
28883+#if REISER4_DEBUG
28884+static int host_allows_crypto_info(struct inode * host)
28885+{
28886+ int ret;
28887+ file_plugin * fplug = inode_file_plugin(host);
28888+
28889+ switch (fplug->h.id) {
28890+ case CRYPTCOMPRESS_FILE_PLUGIN_ID:
28891+ ret = 1;
28892+ break;
28893+ default:
28894+ ret = 0;
28895+ }
28896+ return ret;
28897+}
28898+#endif /* REISER4_DEBUG */
28899+
28900+static void reiser4_detach_crypto_info(struct inode * inode)
28901+{
28902+ assert("edward-1385", inode != NULL);
28903+ assert("edward-1386", host_allows_crypto_info(inode));
28904+
28905+ if (inode_crypto_info(inode))
28906+ unload_crypto_info(inode);
28907+ set_inode_crypto_info(inode, NULL);
28908+}
28909+
28910+#if 0
28911+
28912+/* compare fingerprints of @child and @parent */
28913+static int keyid_eq(struct reiser4_crypto_info * child,
28914+ struct reiser4_crypto_info * parent)
28915+{
28916+ return !memcmp(child->keyid,
28917+ parent->keyid,
28918+ info_digest_plugin(parent)->fipsize);
28919+}
28920+
28921+/* check if a crypto-stat (which is bound to @parent) can be inherited */
28922+int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent)
28923+{
28924+ if (!need_cipher(child))
28925+ return 0;
28926+ /* the child is created */
28927+ if (!inode_crypto_info(child))
28928+ return 1;
28929+ /* the child is looked up */
28930+ if (!inode_crypto_info(parent))
28931+ return 0;
28932+ return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
28933+ inode_digest_plugin(child) == inode_digest_plugin(parent) &&
28934+ inode_crypto_info(child)->keysize ==
28935+ inode_crypto_info(parent)->keysize &&
28936+ keyid_eq(inode_crypto_info(child), inode_crypto_info(parent)));
28937+}
28938+#endif
28939+
28940+/* helper functions for ->create() method of the cryptcompress plugin */
28941+static int inode_set_crypto(struct inode * object)
28942+{
28943+ reiser4_inode * info;
28944+ if (!inode_crypto_info(object)) {
28945+ if (need_cipher(object))
28946+ return RETERR(-EINVAL);
28947+ /* the file is not to be encrypted */
28948+ return 0;
28949+ }
28950+ info = reiser4_inode_data(object);
28951+ info->extmask |= (1 << CRYPTO_STAT);
28952+ return 0;
28953+}
28954+
28955+static int inode_init_compression(struct inode * object)
28956+{
28957+ int result = 0;
28958+ assert("edward-1461", object != NULL);
28959+ if (inode_compression_plugin(object)->init)
28960+ result = inode_compression_plugin(object)->init();
28961+ return result;
28962+}
28963+
28964+static int inode_check_cluster(struct inode * object)
28965+{
28966+ assert("edward-696", object != NULL);
28967+
28968+ if (unlikely(inode_cluster_size(object) < PAGE_CACHE_SIZE)) {
28969+ warning("edward-1320", "Can not support '%s' "
28970+			"logical clusters (less than page size)",
28971+ inode_cluster_plugin(object)->h.label);
28972+ return RETERR(-EINVAL);
28973+ }
28974+	if (unlikely(inode_cluster_shift(object) >= BITS_PER_BYTE*sizeof(int))) {
28975+ warning("edward-1463", "Can not support '%s' "
28976+ "logical clusters (too big for transform)",
28977+ inode_cluster_plugin(object)->h.label);
28978+ return RETERR(-EINVAL);
28979+ }
28980+ return 0;
28981+}
28982+
28983+/* plugin->destroy_inode() */
28984+void destroy_inode_cryptcompress(struct inode * inode)
28985+{
28986+ assert("edward-1464", INODE_PGCOUNT(inode) == 0);
28987+ reiser4_detach_crypto_info(inode);
28988+ return;
28989+}
28990+
28991+/* plugin->create_object():
28992+. install plugins
28993+. attach crypto info if specified
28994+. attach compression info if specified
28995+. attach cluster info
28996+*/
28997+int create_object_cryptcompress(struct inode *object, struct inode *parent,
28998+ reiser4_object_create_data * data)
28999+{
29000+ int result;
29001+ reiser4_inode *info;
29002+
29003+ assert("edward-23", object != NULL);
29004+ assert("edward-24", parent != NULL);
29005+ assert("edward-30", data != NULL);
29006+ assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD));
29007+ assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID);
29008+
29009+ info = reiser4_inode_data(object);
29010+
29011+ assert("edward-29", info != NULL);
29012+
29013+ /* set file bit */
29014+ info->plugin_mask |= (1 << PSET_FILE);
29015+
29016+ /* set crypto */
29017+ result = inode_set_crypto(object);
29018+ if (result)
29019+ goto error;
29020+ /* set compression */
29021+ result = inode_init_compression(object);
29022+ if (result)
29023+ goto error;
29024+ /* set cluster */
29025+ result = inode_check_cluster(object);
29026+ if (result)
29027+ goto error;
29028+
29029+ /* save everything in disk stat-data */
29030+ result = write_sd_by_inode_common(object);
29031+ if (!result)
29032+ return 0;
29033+ error:
29034+ reiser4_detach_crypto_info(object);
29035+ return result;
29036+}
29037+
29038+/* plugin->open() */
29039+int open_cryptcompress(struct inode * inode, struct file * file)
29040+{
29041+ return 0;
29042+}
29043+
29044+/* returns a blocksize, the attribute of a cipher algorithm */
29045+static unsigned int
29046+cipher_blocksize(struct inode * inode)
29047+{
29048+ assert("edward-758", need_cipher(inode));
29049+ assert("edward-1400", inode_crypto_info(inode) != NULL);
29050+ return crypto_blkcipher_blocksize
29051+ (info_get_cipher(inode_crypto_info(inode)));
29052+}
29053+
29054+/* returns offset translated by scale factor of the crypto-algorithm */
29055+static loff_t inode_scaled_offset (struct inode * inode,
29056+ const loff_t src_off /* input offset */)
29057+{
29058+ assert("edward-97", inode != NULL);
29059+
29060+ if (!need_cipher(inode) ||
29061+ src_off == get_key_offset(reiser4_min_key()) ||
29062+ src_off == get_key_offset(reiser4_max_key()))
29063+ return src_off;
29064+
29065+ return inode_cipher_plugin(inode)->scale(inode,
29066+ cipher_blocksize(inode),
29067+ src_off);
29068+}
29069+
29070+/* returns disk cluster size */
29071+size_t inode_scaled_cluster_size(struct inode * inode)
29072+{
29073+ assert("edward-110", inode != NULL);
29074+
29075+ return inode_scaled_offset(inode, inode_cluster_size(inode));
29076+}
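
The actual scaling is delegated to the cipher plugin's ->scale() method. A plausible policy, shown here only as an assumption, rounds the plaintext length up to a whole number of cipher blocks, which is consistent with the alignment overhead added on write:

/* A minimal sketch of a scale function of the kind delegated to above:
 * round a plaintext length up to a whole number of cipher blocks. This
 * is an illustrative policy, not the exact plugin implementation. */
#include <stdio.h>
#include <stdint.h>

static uint64_t scale_round_up(uint64_t src_off, unsigned blocksize)
{
	return (src_off + blocksize - 1) / blocksize * blocksize;
}

int main(void)
{
	/* with 16-byte cipher blocks, 1000 plaintext bytes occupy 1008 */
	printf("%llu\n", (unsigned long long)scale_round_up(1000, 16));
	return 0;
}
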
29077+
29078+/* set number of cluster pages */
29079+static void set_cluster_nrpages(struct cluster_handle * clust,
29080+ struct inode *inode)
29081+{
29082+ struct reiser4_slide * win;
29083+
29084+ assert("edward-180", clust != NULL);
29085+ assert("edward-1040", inode != NULL);
29086+
29087+ clust->old_nrpages = size_in_pages(lbytes(clust->index, inode));
29088+ win = clust->win;
29089+ if (!win) {
29090+ clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
29091+ return;
29092+ }
29093+ assert("edward-1176", clust->op != LC_INVAL);
29094+ assert("edward-1064", win->off + win->count + win->delta != 0);
29095+
29096+ if (win->stat == HOLE_WINDOW &&
29097+ win->off == 0 && win->count == inode_cluster_size(inode)) {
29098+ /* special case: writing a "fake" logical cluster */
29099+ clust->nr_pages = 0;
29100+ return;
29101+ }
29102+ clust->nr_pages = size_in_pages(max(win->off + win->count + win->delta,
29103+ lbytes(clust->index, inode)));
29104+ return;
29105+}
29106+
29107+/* plugin->key_by_inode()
29108+ build key of a disk cluster */
29109+int key_by_inode_cryptcompress(struct inode *inode, loff_t off,
29110+ reiser4_key * key)
29111+{
29112+ assert("edward-64", inode != 0);
29113+
29114+ if (likely(off != get_key_offset(reiser4_max_key())))
29115+ off = off_to_clust_to_off(off, inode);
29116+ if (inode_crypto_info(inode))
29117+ off = inode_scaled_offset(inode, off);
29118+
29119+ key_by_inode_and_offset_common(inode, 0, key);
29120+ set_key_offset(key, (__u64)off);
29121+ return 0;
29122+}
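
off_to_clust_to_off() rounds a byte offset down to the start of its logical cluster, so every page of a cluster maps to the same disk cluster key. A sketch with illustrative names and an assumed 64 KiB cluster size:

/* Sketch of the offset -> disk cluster key offset mapping above:
 * round down to the cluster start, then (if ciphered) scale.
 * The helper name and the 64 KiB cluster size are illustrative. */
#include <stdio.h>
#include <stdint.h>

static uint64_t off_to_clust_to_off(uint64_t off, unsigned cluster_shift)
{
	return (off >> cluster_shift) << cluster_shift;
}

int main(void)
{
	unsigned shift = 16;	/* 64 KiB logical clusters (assumption) */

	/* 70000 lies in cluster 1, whose key offset is 65536 */
	printf("%llu\n",
	       (unsigned long long)off_to_clust_to_off(70000, shift));
	return 0;
}
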
29123+
29124+/* plugin->flow_by_inode() */
29125+/* flow is used to read/write disk clusters */
29126+int flow_by_inode_cryptcompress(struct inode *inode, const char __user * buf,
29127+ int user, /* 1: @buf is of user space,
29128+ 0: kernel space */
29129+ loff_t size, /* @buf size */
29130+ loff_t off, /* offset to start io from */
29131+ rw_op op, /* READ or WRITE */
29132+ flow_t * f /* resulting flow */)
29133+{
29134+ assert("edward-436", f != NULL);
29135+ assert("edward-149", inode != NULL);
29136+ assert("edward-150", inode_file_plugin(inode) != NULL);
29137+ assert("edward-1465", user == 0); /* we use flow to read/write
29138+ disk clusters located in
29139+ kernel space */
29140+ f->length = size;
29141+ memcpy(&f->data, &buf, sizeof(buf));
29142+ f->user = user;
29143+ f->op = op;
29144+
29145+ return key_by_inode_cryptcompress(inode, off, &f->key);
29146+}
29147+
29148+static int
29149+cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key,
29150+ znode_lock_mode lock_mode)
29151+{
29152+ coord_t *coord;
29153+
29154+ assert("edward-704", hint != NULL);
29155+ assert("edward-1089", !hint_is_valid(hint));
29156+ assert("edward-706", hint->lh.owner == NULL);
29157+
29158+ coord = &hint->ext_coord.coord;
29159+
29160+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
29161+ /* hint either not set or set by different operation */
29162+ return RETERR(-E_REPEAT);
29163+
29164+ if (get_key_offset(key) != hint->offset)
29165+ /* hint is set for different key */
29166+ return RETERR(-E_REPEAT);
29167+
29168+ assert("edward-707", reiser4_schedulable());
29169+
29170+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord,
29171+ key, &hint->lh, lock_mode,
29172+ ZNODE_LOCK_LOPRI);
29173+}
29174+
29175+/* reserve disk space when writing a logical cluster */
29176+static int reserve4cluster(struct inode *inode, struct cluster_handle *clust)
29177+{
29178+ int result = 0;
29179+
29180+ assert("edward-965", reiser4_schedulable());
29181+ assert("edward-439", inode != NULL);
29182+ assert("edward-440", clust != NULL);
29183+ assert("edward-441", clust->pages != NULL);
29184+
29185+ if (clust->nr_pages == 0) {
29186+ assert("edward-1152", clust->win != NULL);
29187+ assert("edward-1153", clust->win->stat == HOLE_WINDOW);
29188+ /* don't reserve disk space for fake logical cluster */
29189+ return 0;
29190+ }
29191+ assert("edward-442", jprivate(clust->pages[0]) != NULL);
29192+
29193+ result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
29194+ estimate_update_cluster(inode),
29195+ BA_CAN_COMMIT);
29196+ if (result)
29197+ return result;
29198+ clust->reserved = 1;
29199+ grabbed2cluster_reserved(estimate_insert_cluster(inode) +
29200+ estimate_update_cluster(inode));
29201+#if REISER4_DEBUG
29202+ clust->reserved_prepped = estimate_update_cluster(inode);
29203+ clust->reserved_unprepped = estimate_insert_cluster(inode);
29204+#endif
29205+ /* there can be space grabbed by txnmgr_force_commit_all */
29206+ return 0;
29207+}
29208+
29209+/* free reserved disk space if writing a logical cluster fails */
29210+static void free_reserved4cluster(struct inode *inode,
29211+ struct cluster_handle *ch, int count)
29212+{
29213+ assert("edward-967", ch->reserved == 1);
29214+
29215+ cluster_reserved2free(count);
29216+ ch->reserved = 0;
29217+}
29218+
29219+/* The core search procedure of the cryptcompress plugin.
29220+   If the returned value is not cbk_errored, then the current znode is locked */
29221+static int find_cluster_item(hint_t * hint,
29222+ const reiser4_key * key, /* key of the item we are
29223+ looking for */
29224+ znode_lock_mode lock_mode /* which lock */ ,
29225+ ra_info_t * ra_info, lookup_bias bias, __u32 flags)
29226+{
29227+ int result;
29228+ reiser4_key ikey;
29229+ int went_right = 0;
29230+ coord_t *coord = &hint->ext_coord.coord;
29231+ coord_t orig = *coord;
29232+
29233+ assert("edward-152", hint != NULL);
29234+
29235+ if (!hint_is_valid(hint)) {
29236+ result = cryptcompress_hint_validate(hint, key, lock_mode);
29237+ if (result == -E_REPEAT)
29238+ goto traverse_tree;
29239+ else if (result) {
29240+ assert("edward-1216", 0);
29241+ return result;
29242+ }
29243+ hint_set_valid(hint);
29244+ }
29245+ assert("edward-709", znode_is_any_locked(coord->node));
29246+
29247+	/* An in-place lookup is going on here; it means we just need to
29248+	   check whether the next item of the @coord matches the @key hint */
29249+
29250+ if (equal_to_rdk(coord->node, key)) {
29251+ result = goto_right_neighbor(coord, &hint->lh);
29252+ if (result == -E_NO_NEIGHBOR) {
29253+ assert("edward-1217", 0);
29254+ return RETERR(-EIO);
29255+ }
29256+ if (result)
29257+ return result;
29258+ assert("edward-1218", equal_to_ldk(coord->node, key));
29259+ went_right = 1;
29260+ } else {
29261+ coord->item_pos++;
29262+ coord->unit_pos = 0;
29263+ coord->between = AT_UNIT;
29264+ }
29265+ result = zload(coord->node);
29266+ if (result)
29267+ return result;
29268+ assert("edward-1219", !node_is_empty(coord->node));
29269+
29270+ if (!coord_is_existing_item(coord)) {
29271+ zrelse(coord->node);
29272+ goto not_found;
29273+ }
29274+ item_key_by_coord(coord, &ikey);
29275+ zrelse(coord->node);
29276+ if (!keyeq(key, &ikey))
29277+ goto not_found;
29278+ /* Ok, item is found, update node counts */
29279+ if (went_right)
29280+ dclust_inc_extension_ncount(hint);
29281+ return CBK_COORD_FOUND;
29282+
29283+ not_found:
29284+ assert("edward-1220", coord->item_pos > 0);
29285+ //coord->item_pos--;
29286+ /* roll back */
29287+ *coord = orig;
29288+ ON_DEBUG(coord_update_v(coord));
29289+ return CBK_COORD_NOTFOUND;
29290+
29291+ traverse_tree:
29292+ assert("edward-713", hint->lh.owner == NULL);
29293+ assert("edward-714", reiser4_schedulable());
29294+
29295+ reiser4_unset_hint(hint);
29296+ dclust_init_extension(hint);
29297+ coord_init_zero(coord);
29298+ result = coord_by_key(current_tree, key, coord, &hint->lh,
29299+ lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
29300+ CBK_UNIQUE | flags, ra_info);
29301+ if (cbk_errored(result))
29302+ return result;
29303+ if(result == CBK_COORD_FOUND)
29304+ dclust_inc_extension_ncount(hint);
29305+ hint_set_valid(hint);
29306+ return result;
29307+}
29308+
29309+/* This function is called by the deflate[inflate] manager when
29310+   creating a transformed/plain stream to check whether we should
29311+ create/cut some overhead. If this returns true, then @oh
29312+ contains the size of this overhead.
29313+ */
29314+static int need_cut_or_align(struct inode * inode,
29315+ struct cluster_handle * ch, rw_op rw, int * oh)
29316+{
29317+ struct tfm_cluster * tc = &ch->tc;
29318+ switch (rw) {
29319+ case WRITE_OP: /* estimate align */
29320+ *oh = tc->len % cipher_blocksize(inode);
29321+ if (*oh != 0)
29322+ return 1;
29323+ break;
29324+ case READ_OP: /* estimate cut */
29325+ *oh = *(tfm_output_data(ch) + tc->len - 1);
29326+ break;
29327+ default:
29328+ impossible("edward-1401", "bad option");
29329+ }
29330+ return (tc->len != tc->lsize);
29331+}
29332+
29333+/* create/cut an overhead of transformed/plain stream */
29334+static void align_or_cut_overhead(struct inode * inode,
29335+ struct cluster_handle * ch, rw_op rw)
29336+{
29337+ int oh;
29338+ cipher_plugin * cplug = inode_cipher_plugin(inode);
29339+
29340+ assert("edward-1402", need_cipher(inode));
29341+
29342+ if (!need_cut_or_align(inode, ch, rw, &oh))
29343+ return;
29344+ switch (rw) {
29345+ case WRITE_OP: /* do align */
29346+ ch->tc.len +=
29347+ cplug->align_stream(tfm_input_data(ch) +
29348+ ch->tc.len, ch->tc.len,
29349+ cipher_blocksize(inode));
29350+ *(tfm_input_data(ch) + ch->tc.len - 1) =
29351+ cipher_blocksize(inode) - oh;
29352+ break;
29353+ case READ_OP: /* do cut */
29354+ assert("edward-1403", oh <= cipher_blocksize(inode));
29355+ ch->tc.len -= oh;
29356+ break;
29357+ default:
29358+ impossible("edward-1404", "bad option");
29359+ }
29360+ return;
29361+}
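
The convention here is that padding is added only when the stream length is not a multiple of the cipher blocksize, and the pad size is recorded in the last byte; the read side knows the logical size (tc->lsize) and cuts only when the lengths differ. A self-contained round trip under those assumptions:

/* Round-trip sketch of the align/cut overhead convention above: pad the
 * stream to the cipher blocksize when misaligned, recording the pad
 * size in the last byte, and strip it on the way back. Illustrative
 * userspace code; BLK = 16 is an assumption. */
#include <assert.h>
#include <stdio.h>
#include <string.h>

#define BLK 16	/* cipher blocksize (assumption) */

static size_t do_align(unsigned char *buf, size_t len)
{
	size_t oh = len % BLK;
	size_t pad;

	if (oh == 0)
		return len;		/* already aligned: no overhead */
	pad = BLK - oh;			/* 1..BLK-1 */
	memset(buf + len, 0, pad);
	buf[len + pad - 1] = (unsigned char)pad;
	return len + pad;
}

/* @lsize is the known plaintext length (kept in cluster metadata) */
static size_t do_cut(const unsigned char *buf, size_t len, size_t lsize)
{
	if (len == lsize)
		return len;		/* nothing was appended */
	return len - buf[len - 1];
}

int main(void)
{
	unsigned char buf[64] = "hello";
	size_t alen = do_align(buf, 5);			/* 16 */

	assert(do_cut(buf, alen, 5) == 5);		/* length restored */
	printf("aligned=%zu cut=%zu\n", alen, do_cut(buf, alen, 5));
	return 0;
}
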
29362+
29363+static unsigned max_cipher_overhead(struct inode * inode)
29364+{
29365+ if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
29366+ return 0;
29367+ return cipher_blocksize(inode);
29368+}
29369+
29370+static int deflate_overhead(struct inode *inode)
29371+{
29372+ return (inode_compression_plugin(inode)->
29373+ checksum ? DC_CHECKSUM_SIZE : 0);
29374+}
29375+
29376+static unsigned deflate_overrun(struct inode * inode, int ilen)
29377+{
29378+ return coa_overrun(inode_compression_plugin(inode), ilen);
29379+}
29380+
29381+/* Estimating compressibility of a logical cluster by various
29382+   policies represented by the compression mode plugin.
29383+   If this returns false, then the compressor won't be called for
29384+   the cluster of index @index.
29385+*/
29386+static int should_compress(struct tfm_cluster * tc, cloff_t index,
29387+ struct inode *inode)
29388+{
29389+ compression_plugin *cplug = inode_compression_plugin(inode);
29390+ compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
29391+
29392+ assert("edward-1321", tc->len != 0);
29393+ assert("edward-1322", cplug != NULL);
29394+ assert("edward-1323", mplug != NULL);
29395+
29396+ return /* estimate by size */
29397+ (cplug->min_size_deflate ?
29398+ tc->len >= cplug->min_size_deflate() :
29399+ 1) &&
29400+ /* estimate by compression mode plugin */
29401+ (mplug->should_deflate ?
29402+ mplug->should_deflate(inode, index) :
29403+ 1);
29404+}
29405+
29406+/* Evaluating results of compression transform.
29407+   Returns true if we need to accept these results */
29408+static int save_compressed(int size_before, int size_after, struct inode *inode)
29409+{
29410+ return (size_after + deflate_overhead(inode) +
29411+ max_cipher_overhead(inode) < size_before);
29412+}
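
In other words, compression is accepted only if the compressed size plus the checksum and worst-case cipher alignment overheads is still strictly smaller than the input. A worked example with assumed overhead sizes (4-byte checksum, 16-byte cipher blocks):

/* Worked example of the acceptance test above; the overhead sizes are
 * assumptions (DC_CHECKSUM_SIZE = 4, cipher blocksize = 16). */
#include <stdio.h>

static int save_compressed(int before, int after, int oh)
{
	return after + oh < before;
}

int main(void)
{
	int oh = 4 + 16;	/* checksum + max cipher overhead */

	printf("%d\n", save_compressed(4096, 4000, oh));	/* 1: accept */
	printf("%d\n", save_compressed(4096, 4080, oh));	/* 0: discard */
	return 0;
}
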
29413+
29414+/* Guess result of the evaluation above */
29415+static int need_inflate(struct cluster_handle * ch, struct inode * inode,
29416+ int encrypted /* is cluster encrypted */ )
29417+{
29418+ struct tfm_cluster * tc = &ch->tc;
29419+
29420+ assert("edward-142", tc != 0);
29421+ assert("edward-143", inode != NULL);
29422+
29423+ return tc->len <
29424+ (encrypted ?
29425+ inode_scaled_offset(inode, tc->lsize) :
29426+ tc->lsize);
29427+}
29428+
29429+/* If results of compression were accepted, then we add
29430+ a checksum to catch possible disk cluster corruption.
29431+ The following is a format of the data stored in disk clusters:
29432+
29433+ data This is (transformed) logical cluster.
29434+ cipher_overhead This is created by ->align() method
29435+ of cipher plugin. May be absent.
29436+ checksum (4) This is created by ->checksum method
29437+ of compression plugin to check
29438+ integrity. May be absent.
29439+
29440+ Crypto overhead format:
29441+
29442+ data
29443+ control_byte (1) contains aligned overhead size:
29444+ 1 <= overhead <= cipher_blksize
29445+*/
29446+/* Append a checksum at the end of a transformed stream */
29447+static void dc_set_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
29448+{
29449+ __u32 checksum;
29450+
29451+ assert("edward-1309", tc != NULL);
29452+ assert("edward-1310", tc->len > 0);
29453+ assert("edward-1311", cplug->checksum != NULL);
29454+
29455+ checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
29456+ put_unaligned(cpu_to_le32(checksum),
29457+ (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
29458+ tc->len += (int)DC_CHECKSUM_SIZE;
29459+}
29460+
29461+/* Check a disk cluster checksum.
29462+ Returns 0 if checksum is correct, otherwise returns 1 */
29463+static int dc_check_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
29464+{
29465+ assert("edward-1312", tc != NULL);
29466+ assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
29467+ assert("edward-1314", cplug->checksum != NULL);
29468+
29469+ if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
29470+ tc->len - (int)DC_CHECKSUM_SIZE) !=
29471+ le32_to_cpu(get_unaligned((d32 *)
29472+ (tfm_stream_data(tc, INPUT_STREAM)
29473+ + tc->len - (int)DC_CHECKSUM_SIZE)))) {
29474+ warning("edward-156",
29475+			"Bad disk cluster checksum %d (should be %d). Fsck?\n",
29476+ (int)le32_to_cpu
29477+ (get_unaligned((d32 *)
29478+ (tfm_stream_data(tc, INPUT_STREAM) +
29479+ tc->len - (int)DC_CHECKSUM_SIZE))),
29480+ (int)cplug->checksum
29481+ (tfm_stream_data(tc, INPUT_STREAM),
29482+ tc->len - (int)DC_CHECKSUM_SIZE));
29483+ return 1;
29484+ }
29485+ tc->len -= (int)DC_CHECKSUM_SIZE;
29486+ return 0;
29487+}
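
A userspace round trip of this trailer format, with a toy sum32() standing in for the compression plugin's ->checksum method (the real algorithm is plugin-specific):

/* Round-trip sketch of the checksum trailer handled above: append a
 * 4-byte little-endian checksum after the transformed stream, then
 * verify and strip it on read. The toy sum32() is a stand-in. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static uint32_t sum32(const unsigned char *p, size_t n)
{
	uint32_t s = 0;

	while (n--)
		s = s * 31 + *p++;
	return s;
}

static size_t set_checksum(unsigned char *buf, size_t len)
{
	uint32_t c = sum32(buf, len);
	unsigned char le[4] = { c, c >> 8, c >> 16, c >> 24 };

	memcpy(buf + len, le, 4);
	return len + 4;
}

static int check_checksum(const unsigned char *buf, size_t len)
{
	uint32_t c = buf[len - 4] | buf[len - 3] << 8 |
		     buf[len - 2] << 16 | (uint32_t)buf[len - 1] << 24;

	return sum32(buf, len - 4) == c;	/* 1 if intact */
}

int main(void)
{
	unsigned char buf[64] = "compressed bytes";
	size_t len = set_checksum(buf, 16);

	assert(check_checksum(buf, len));
	return 0;
}
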
29488+
29489+/* get input/output stream for some transform action */
29490+int grab_tfm_stream(struct inode * inode, struct tfm_cluster * tc,
29491+ tfm_stream_id id)
29492+{
29493+ size_t size = inode_scaled_cluster_size(inode);
29494+
29495+ assert("edward-901", tc != NULL);
29496+ assert("edward-1027", inode_compression_plugin(inode) != NULL);
29497+
29498+ if (cluster_get_tfm_act(tc) == TFMA_WRITE)
29499+ size += deflate_overrun(inode, inode_cluster_size(inode));
29500+
29501+ if (!get_tfm_stream(tc, id) && id == INPUT_STREAM)
29502+ alternate_streams(tc);
29503+ if (!get_tfm_stream(tc, id))
29504+ return alloc_tfm_stream(tc, size, id);
29505+
29506+ assert("edward-902", tfm_stream_is_set(tc, id));
29507+
29508+ if (tfm_stream_size(tc, id) < size)
29509+ return realloc_tfm_stream(tc, size, id);
29510+ return 0;
29511+}
29512+
29513+/* Common deflate manager */
29514+int reiser4_deflate_cluster(struct cluster_handle * clust, struct inode * inode)
29515+{
29516+ int result = 0;
29517+ int compressed = 0;
29518+ int encrypted = 0;
29519+ struct tfm_cluster * tc = &clust->tc;
29520+ compression_plugin * coplug;
29521+
29522+ assert("edward-401", inode != NULL);
29523+ assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
29524+ assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE);
29525+ assert("edward-498", !tfm_cluster_is_uptodate(tc));
29526+
29527+ coplug = inode_compression_plugin(inode);
29528+ if (should_compress(tc, clust->index, inode)) {
29529+ /* try to compress, discard bad results */
29530+ __u32 dst_len;
29531+ compression_mode_plugin * mplug =
29532+ inode_compression_mode_plugin(inode);
29533+ assert("edward-602", coplug != NULL);
29534+ assert("edward-1423", coplug->compress != NULL);
29535+
29536+ result = grab_coa(tc, coplug);
29537+ if (result) {
29538+ warning("edward-1424",
29539+ "alloc_coa failed with ret=%d, skipped compression",
29540+ result);
29541+ goto cipher;
29542+ }
29543+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29544+ if (result) {
29545+ warning("edward-1425",
29546+ "alloc stream failed with ret=%d, skipped compression",
29547+ result);
29548+ goto cipher;
29549+ }
29550+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
29551+ coplug->compress(get_coa(tc, coplug->h.id, tc->act),
29552+ tfm_input_data(clust), tc->len,
29553+ tfm_output_data(clust), &dst_len);
29554+ /* make sure we didn't overwrite extra bytes */
29555+ assert("edward-603",
29556+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
29557+
29558+ /* evaluate results of compression transform */
29559+ if (save_compressed(tc->len, dst_len, inode)) {
29560+ /* good result, accept */
29561+ tc->len = dst_len;
29562+ if (mplug->accept_hook != NULL) {
29563+ result = mplug->accept_hook(inode, clust->index);
29564+ if (result)
29565+ warning("edward-1426",
29566+ "accept_hook failed with ret=%d",
29567+ result);
29568+ }
29569+ compressed = 1;
29570+ }
29571+ else {
29572+ /* bad result, discard */
29573+#if 0
29574+ if (cluster_is_complete(clust, inode))
29575+ warning("edward-1496",
29576+ "incompressible cluster %lu (inode %llu)",
29577+ clust->index,
29578+ (unsigned long long)get_inode_oid(inode));
29579+#endif
29580+ if (mplug->discard_hook != NULL &&
29581+ cluster_is_complete(clust, inode)) {
29582+ result = mplug->discard_hook(inode,
29583+ clust->index);
29584+ if (result)
29585+ warning("edward-1427",
29586+ "discard_hook failed with ret=%d",
29587+ result);
29588+ }
29589+ }
29590+ }
29591+ cipher:
29592+ if (need_cipher(inode)) {
29593+ cipher_plugin * ciplug;
29594+ struct blkcipher_desc desc;
29595+ struct scatterlist src;
29596+ struct scatterlist dst;
29597+
29598+ ciplug = inode_cipher_plugin(inode);
29599+ desc.tfm = info_get_cipher(inode_crypto_info(inode));
29600+ desc.flags = 0;
29601+ if (compressed)
29602+ alternate_streams(tc);
29603+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29604+ if (result)
29605+ return result;
29606+
29607+ align_or_cut_overhead(inode, clust, WRITE_OP);
29608+ sg_init_one(&src, tfm_input_data(clust), tc->len);
29609+ sg_init_one(&dst, tfm_output_data(clust), tc->len);
29610+
29611+ result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len);
29612+ if (result) {
29613+ warning("edward-1405",
29614+ "encryption failed flags=%x\n", desc.flags);
29615+ return result;
29616+ }
29617+ encrypted = 1;
29618+ }
29619+ if (compressed && coplug->checksum != NULL)
29620+ dc_set_checksum(coplug, tc);
29621+ if (!compressed && !encrypted)
29622+ alternate_streams(tc);
29623+ return result;
29624+}
29625+
29626+/* Common inflate manager. */
29627+int reiser4_inflate_cluster(struct cluster_handle * clust, struct inode * inode)
29628+{
29629+ int result = 0;
29630+ int transformed = 0;
29631+ struct tfm_cluster * tc = &clust->tc;
29632+ compression_plugin * coplug;
29633+
29634+ assert("edward-905", inode != NULL);
29635+ assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
29636+ assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
29637+ assert("edward-1349", tc->act == TFMA_READ);
29638+ assert("edward-907", !tfm_cluster_is_uptodate(tc));
29639+
29640+ /* Handle a checksum (if any) */
29641+ coplug = inode_compression_plugin(inode);
29642+ if (need_inflate(clust, inode, need_cipher(inode)) &&
29643+ coplug->checksum != NULL) {
29644+ result = dc_check_checksum(coplug, tc);
29645+ if (unlikely(result)) {
29646+ warning("edward-1460",
29647+ "Inode %llu: disk cluster %lu looks corrupted",
29648+ (unsigned long long)get_inode_oid(inode),
29649+ clust->index);
29650+ return RETERR(-EIO);
29651+ }
29652+ }
29653+ if (need_cipher(inode)) {
29654+ cipher_plugin * ciplug;
29655+ struct blkcipher_desc desc;
29656+ struct scatterlist src;
29657+ struct scatterlist dst;
29658+
29659+ ciplug = inode_cipher_plugin(inode);
29660+ desc.tfm = info_get_cipher(inode_crypto_info(inode));
29661+ desc.flags = 0;
29662+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29663+ if (result)
29664+ return result;
29665+ assert("edward-909", tfm_cluster_is_set(tc));
29666+
29667+ sg_init_one(&src, tfm_input_data(clust), tc->len);
29668+ sg_init_one(&dst, tfm_output_data(clust), tc->len);
29669+
29670+ result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len);
29671+ if (result) {
29672+ warning("edward-1600", "decrypt failed flags=%x\n",
29673+ desc.flags);
29674+ return result;
29675+ }
29676+ align_or_cut_overhead(inode, clust, READ_OP);
29677+ transformed = 1;
29678+ }
29679+ if (need_inflate(clust, inode, 0)) {
29680+ unsigned dst_len = inode_cluster_size(inode);
29681+ if(transformed)
29682+ alternate_streams(tc);
29683+
29684+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29685+ if (result)
29686+ return result;
29687+ assert("edward-1305", coplug->decompress != NULL);
29688+ assert("edward-910", tfm_cluster_is_set(tc));
29689+
29690+ coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
29691+ tfm_input_data(clust), tc->len,
29692+ tfm_output_data(clust), &dst_len);
29693+ /* check length */
29694+ tc->len = dst_len;
29695+ assert("edward-157", dst_len == tc->lsize);
29696+ transformed = 1;
29697+ }
29698+ if (!transformed)
29699+ alternate_streams(tc);
29700+ return result;
29701+}
29702+
29703+/* This is the implementation of the readpage method of struct
29704+   address_space_operations for the cryptcompress plugin. */
29705+int readpage_cryptcompress(struct file *file, struct page *page)
29706+{
29707+ reiser4_context *ctx;
29708+ struct cluster_handle clust;
29709+ item_plugin *iplug;
29710+ int result;
29711+
29712+ assert("edward-88", PageLocked(page));
29713+ assert("vs-976", !PageUptodate(page));
29714+ assert("edward-89", page->mapping && page->mapping->host);
29715+
29716+ ctx = reiser4_init_context(page->mapping->host->i_sb);
29717+ if (IS_ERR(ctx)) {
29718+ unlock_page(page);
29719+ return PTR_ERR(ctx);
29720+ }
29721+ assert("edward-113",
29722+ ergo(file != NULL,
29723+ page->mapping == file->f_dentry->d_inode->i_mapping));
29724+
29725+ if (PageUptodate(page)) {
29726+ warning("edward-1338", "page is already uptodate\n");
29727+ unlock_page(page);
29728+ reiser4_exit_context(ctx);
29729+ return 0;
29730+ }
29731+ cluster_init_read(&clust, NULL);
29732+ clust.file = file;
29733+ iplug = item_plugin_by_id(CTAIL_ID);
29734+ if (!iplug->s.file.readpage) {
29735+ unlock_page(page);
29736+ put_cluster_handle(&clust);
29737+ reiser4_exit_context(ctx);
29738+ return -EINVAL;
29739+ }
29740+ result = iplug->s.file.readpage(&clust, page);
29741+
29742+ put_cluster_handle(&clust);
29743+ reiser4_txn_restart(ctx);
29744+ reiser4_exit_context(ctx);
29745+ return result;
29746+}
29747+
29748+/* number of pages to check in */
29749+static int get_new_nrpages(struct cluster_handle * clust)
29750+{
29751+ switch (clust->op) {
29752+ case LC_APPOV:
29753+ return clust->nr_pages;
29754+ case LC_TRUNC:
29755+ assert("edward-1179", clust->win != NULL);
29756+ return size_in_pages(clust->win->off + clust->win->count);
29757+ default:
29758+ impossible("edward-1180", "bad page cluster option");
29759+ return 0;
29760+ }
29761+}
29762+
29763+static void set_cluster_pages_dirty(struct cluster_handle * clust,
29764+ struct inode * inode)
29765+{
29766+ int i;
29767+ struct page *pg;
29768+ int nrpages = get_new_nrpages(clust);
29769+
29770+ for (i = 0; i < nrpages; i++) {
29771+
29772+ pg = clust->pages[i];
29773+ assert("edward-968", pg != NULL);
29774+ lock_page(pg);
29775+ assert("edward-1065", PageUptodate(pg));
29776+ reiser4_set_page_dirty_internal(pg);
29777+ unlock_page(pg);
29778+ mark_page_accessed(pg);
29779+ }
29780+}
29781+
29782+/* Grab a page cluster for read/write operations.
29783+ Attach a jnode for write operations (when preparing for modifications, which
29784+ are supposed to be committed).
29785+
29786+   We allocate only one jnode per page cluster; this jnode is bound to the
29787+   first page of this cluster, so we have an extra reference that will be put
29788+   as soon as the jnode is evicted from memory; other references will be cleaned
29789+   up at flush time (assuming that checkin of the page cluster was successful).
29790+*/
29791+int grab_page_cluster(struct inode * inode,
29792+ struct cluster_handle * clust, rw_op rw)
29793+{
29794+ int i;
29795+ int result = 0;
29796+ jnode *node = NULL;
29797+
29798+ assert("edward-182", clust != NULL);
29799+ assert("edward-183", clust->pages != NULL);
29800+ assert("edward-1466", clust->node == NULL);
29801+ assert("edward-1428", inode != NULL);
29802+ assert("edward-1429", inode->i_mapping != NULL);
29803+ assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
29804+
29805+ if (clust->nr_pages == 0)
29806+ return 0;
29807+
29808+ for (i = 0; i < clust->nr_pages; i++) {
29809+
29810+ assert("edward-1044", clust->pages[i] == NULL);
29811+
29812+ clust->pages[i] =
29813+ find_or_create_page(inode->i_mapping,
29814+ clust_to_pg(clust->index, inode) + i,
29815+ reiser4_ctx_gfp_mask_get());
29816+ if (!clust->pages[i]) {
29817+ result = RETERR(-ENOMEM);
29818+ break;
29819+ }
29820+ if (i == 0 && rw == WRITE_OP) {
29821+ node = jnode_of_page(clust->pages[i]);
29822+ if (IS_ERR(node)) {
29823+ result = PTR_ERR(node);
29824+ unlock_page(clust->pages[i]);
29825+ break;
29826+ }
29827+ JF_SET(node, JNODE_CLUSTER_PAGE);
29828+ assert("edward-920", jprivate(clust->pages[0]));
29829+ }
29830+ INODE_PGCOUNT_INC(inode);
29831+ unlock_page(clust->pages[i]);
29832+ }
29833+ if (unlikely(result)) {
29834+ while (i) {
29835+ put_cluster_page(clust->pages[--i]);
29836+ INODE_PGCOUNT_DEC(inode);
29837+ }
29838+ if (node && !IS_ERR(node))
29839+ jput(node);
29840+ return result;
29841+ }
29842+ clust->node = node;
29843+ return 0;
29844+}
29845+
29846+static void truncate_page_cluster_range(struct inode * inode,
29847+ struct page ** pages,
29848+ cloff_t index,
29849+ int from, int count,
29850+ int even_cows)
29851+{
29852+ assert("edward-1467", count > 0);
29853+ reiser4_invalidate_pages(inode->i_mapping,
29854+ clust_to_pg(index, inode) + from,
29855+ count, even_cows);
29856+}
29857+
29858+/* Put @count pages starting at offset @from */
29859+static void __put_page_cluster(int from, int count,
29860+ struct page ** pages, struct inode * inode)
29861+{
29862+ int i;
29863+ assert("edward-1468", pages != NULL);
29864+ assert("edward-1469", inode != NULL);
29865+ assert("edward-1470", from >= 0 && count >= 0);
29866+
29867+ for (i = 0; i < count; i++) {
29868+ assert("edward-1471", pages[from + i] != NULL);
29869+ assert("edward-1472",
29870+ pages[from + i]->index == pages[from]->index + i);
29871+
29872+ put_cluster_page(pages[from + i]);
29873+ INODE_PGCOUNT_DEC(inode);
29874+ }
29875+}
29876+
29877+/*
29878+ * This is the dual of grab_page_cluster();
29879+ * however, if @rw == WRITE_OP, this function is called
29880+ * only if something failed before the page cluster was checked in.
29881+ */
29882+void put_page_cluster(struct cluster_handle * clust,
29883+ struct inode * inode, rw_op rw)
29884+{
29885+ assert("edward-445", clust != NULL);
29886+ assert("edward-922", clust->pages != NULL);
29887+ assert("edward-446",
29888+ ergo(clust->nr_pages != 0, clust->pages[0] != NULL));
29889+
29890+ __put_page_cluster(0, clust->nr_pages, clust->pages, inode);
29891+ if (rw == WRITE_OP) {
29892+ if (unlikely(clust->node)) {
29893+ assert("edward-447",
29894+ clust->node == jprivate(clust->pages[0]));
29895+ jput(clust->node);
29896+ clust->node = NULL;
29897+ }
29898+ }
29899+}
29900+
29901+#if REISER4_DEBUG
29902+int cryptcompress_inode_ok(struct inode *inode)
29903+{
29904+ if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE)))
29905+ return 0;
29906+ if (!cluster_shift_ok(inode_cluster_shift(inode)))
29907+ return 0;
29908+ return 1;
29909+}
29910+
29911+static int window_ok(struct reiser4_slide * win, struct inode *inode)
29912+{
29913+ assert("edward-1115", win != NULL);
29914+ assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
29915+
29916+ return (win->off != inode_cluster_size(inode)) &&
29917+ (win->off + win->count + win->delta <= inode_cluster_size(inode));
29918+}
29919+
29920+static int cluster_ok(struct cluster_handle * clust, struct inode *inode)
29921+{
29922+ assert("edward-279", clust != NULL);
29923+
29924+ if (!clust->pages)
29925+ return 0;
29926+ return (clust->win ? window_ok(clust->win, inode) : 1);
29927+}
29928+#if 0
29929+static int pages_truncate_ok(struct inode *inode, pgoff_t start)
29930+{
29931+ int found;
29932+ struct page * page;
29933+
29934+ found = find_get_pages(inode->i_mapping, start, 1, &page);
29935+ if (found)
29936+ put_cluster_page(page);
29937+ return !found;
29938+}
29939+#else
29940+#define pages_truncate_ok(inode, start) 1
29941+#endif
29942+
29943+static int jnode_truncate_ok(struct inode *inode, cloff_t index)
29944+{
29945+ jnode *node;
29946+ node = jlookup(current_tree, get_inode_oid(inode),
29947+ clust_to_pg(index, inode));
29948+ if (likely(!node))
29949+ return 1;
29950+ jput(node);
29951+ return 0;
29952+}
29953+
29954+static int find_fake_appended(struct inode *inode, cloff_t * index);
29955+
29956+static int body_truncate_ok(struct inode *inode, cloff_t aidx)
29957+{
29958+ int result;
29959+ cloff_t raidx;
29960+
29961+ result = find_fake_appended(inode, &raidx);
29962+ return !result && (aidx == raidx);
29963+}
29964+#endif
29965+
29966+/* guess next window stat */
29967+static inline window_stat next_window_stat(struct reiser4_slide * win)
29968+{
29969+ assert("edward-1130", win != NULL);
29970+ return ((win->stat == HOLE_WINDOW && win->delta == 0) ?
29971+ HOLE_WINDOW : DATA_WINDOW);
29972+}
29973+
29974+/* guess and set next cluster index and window params */
29975+static void move_update_window(struct inode * inode,
29976+ struct cluster_handle * clust,
29977+ loff_t file_off, loff_t to_file)
29978+{
29979+ struct reiser4_slide * win;
29980+
29981+ assert("edward-185", clust != NULL);
29982+ assert("edward-438", clust->pages != NULL);
29983+ assert("edward-281", cluster_ok(clust, inode));
29984+
29985+ win = clust->win;
29986+ if (!win)
29987+ return;
29988+
29989+ switch (win->stat) {
29990+ case DATA_WINDOW:
29991+ /* increment */
29992+ clust->index++;
29993+ win->stat = DATA_WINDOW;
29994+ win->off = 0;
29995+ win->count = min((loff_t)inode_cluster_size(inode), to_file);
29996+ break;
29997+ case HOLE_WINDOW:
29998+ switch (next_window_stat(win)) {
29999+ case HOLE_WINDOW:
30000+ /* skip */
30001+ clust->index = off_to_clust(file_off, inode);
30002+ win->stat = HOLE_WINDOW;
30003+ win->off = 0;
30004+ win->count = off_to_cloff(file_off, inode);
30005+ win->delta = min((loff_t)(inode_cluster_size(inode) -
30006+ win->count), to_file);
30007+ break;
30008+ case DATA_WINDOW:
30009+ /* stay */
30010+ win->stat = DATA_WINDOW;
30011+			/* off + count + delta is invariant */
30012+ win->off = win->off + win->count;
30013+ win->count = win->delta;
30014+ win->delta = 0;
30015+ break;
30016+ default:
30017+ impossible("edward-282", "wrong next window state");
30018+ }
30019+ break;
30020+ default:
30021+ impossible("edward-283", "wrong current window state");
30022+ }
30023+ assert("edward-1068", cluster_ok(clust, inode));
30024+}
30025+
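+/*
+ * Example of window advancement (illustrative numbers only, assuming a
+ * 64K logical cluster): after a DATA_WINDOW write with to_file == 100000
+ * bytes still to be written, the handle moves to the next logical cluster
+ * (clust->index++) and the next window becomes off == 0,
+ * count == min(65536, 100000) == 65536. A HOLE_WINDOW with delta != 0
+ * instead "stays": the next window covers the data tail of the same
+ * cluster, with off == old off + count, count == old delta, delta == 0.
+ */
+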
30026+static int update_sd_cryptcompress(struct inode *inode)
30027+{
30028+ int result = 0;
30029+
30030+ assert("edward-978", reiser4_schedulable());
30031+
30032+ result = reiser4_grab_space_force(/* one for stat data update */
30033+ estimate_update_common(inode),
30034+ BA_CAN_COMMIT);
30035+ if (result)
30036+ return result;
30037+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
30038+ result = reiser4_update_sd(inode);
30039+
30040+ return result;
30041+}
30042+
30043+static void uncapture_cluster_jnode(jnode * node)
30044+{
30045+ txn_atom *atom;
30046+
30047+ assert_spin_locked(&(node->guard));
30048+
30049+ atom = jnode_get_atom(node);
30050+ if (atom == NULL) {
30051+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
30052+ spin_unlock_jnode(node);
30053+ return;
30054+ }
30055+ reiser4_uncapture_block(node);
30056+ spin_unlock_atom(atom);
30057+ jput(node);
30058+}
30059+
30060+static void put_found_pages(struct page **pages, int nr)
30061+{
30062+ int i;
30063+ for (i = 0; i < nr; i++) {
30064+ assert("edward-1045", pages[i] != NULL);
30065+ put_cluster_page(pages[i]);
30066+ }
30067+}
30068+
30069+/* Lifecycle of a logical cluster in the system.
30070+ *
30071+ *
30072+ * A logical cluster of a cryptcompress file is represented in the system by
30073+ * . a page cluster (in memory, primary cache, contains plain text);
30074+ * . a disk cluster (in memory, secondary cache, contains transformed text).
30075+ * The primary cache exists to reduce the number of transform operations
30076+ * (compression, encryption), i.e. to implement a transform-caching strategy.
30077+ * The secondary cache exists to reduce the number of I/O operations, i.e.
30078+ * the usual write-caching strategy. A page cluster is a set of pages, i.e.
30079+ * the mapping of a logical cluster to the primary cache. A disk cluster is
30080+ * a set of items of the same type, defined by some reiser4 item plugin id.
30081+ *
30082+ * 1. Performing modifications
30083+ *
30084+ * Every modification of a cryptcompress file is considered a set of
30085+ * operations performed on the file's logical clusters. Each such "atomic"
30086+ * modification truncates, appends and/or overwrites some bytes of a
30087+ * logical cluster in the primary cache, followed by synchronization
30088+ * with the secondary cache (at flush time). Disk clusters, which live
30089+ * in the secondary cache, are in turn supposed to be synchronized with
30090+ * disk. The mechanism of synchronizing the primary and secondary caches
30091+ * includes the so-called checkin/checkout technique described below.
30092+ *
30093+ * 2. Submitting modifications
30094+ *
30095+ * Each page cluster has an associated jnode (a special in-memory header to
30096+ * keep track of transactions in reiser4), which is attached to its first
30097+ * page when grabbing the page cluster for modifications (see
30098+ * grab_page_cluster). Submitting modifications (see
30099+ * checkin_logical_cluster) proceeds per logical cluster and includes:
30100+ * . checkin_cluster_size;
30101+ * . checkin_page_cluster.
30102+ * checkin_cluster_size() resolves to a file size update, which completely
30103+ * defines the new size of the logical cluster (the number of the file's
30104+ * bytes in that logical cluster).
30105+ * checkin_page_cluster() captures the jnode of a page cluster and sets the
30106+ * jnode's dirty flag (if needed) to indicate that modifications were
30107+ * successfully checked in.
30108+ *
30109+ * 3. Checking out modifications
30110+ *
30111+ * Proceeds per logical cluster at flush time (see checkout_logical_cluster).
30112+ * This is when the primary and secondary caches are synchronized.
30113+ * checkout_logical_cluster() includes:
30114+ * . checkout_page_cluster (retrieving checked-in pages);
30115+ * . uncapturing the jnode (including clearing its dirty flag and unlocking).
30116+ *
30117+ * 4. Committing modifications
30118+ *
30119+ * This completes the synchronization of the primary and secondary caches.
30120+ * When checking out a page cluster (the phase above), pages are
30121+ * locked/flushed/unlocked one by one in ascending index order into a
30122+ * contiguous stream, which is then transformed (compressed, encrypted),
30123+ * chopped up into items and committed to disk as a disk cluster.
30124+ *
30125+ * 5. Managing page references
30126+ *
30127+ * Every checked-in page has a special additional "control" reference,
30128+ * which is dropped at checkout. We need this to prevent pages from being
30129+ * unexpectedly evicted from memory before checkout. Control references
30130+ * are managed so that they do not accumulate with every checkin:
30131+ *
30132+ * 0
30133+ * checkin -> 1
30134+ * 0 -> checkout
30135+ * checkin -> 1
30136+ * checkin -> 1
30137+ * checkin -> 1
30138+ * 0 -> checkout
30139+ * ...
30140+ *
30141+ * Every page cluster has its own unique "cluster lock". Updating/dropping
30142+ * references is serialized via this lock. The number of checked-in cluster
30143+ * pages is calculated from i_size under the cluster lock. The file size is
30144+ * also updated under the cluster lock at every checkin action (except when
30145+ * appending/truncating fake logical clusters).
30146+ *
30147+ * Proof of correctness:
30148+ *
30149+ * Since we update the file size under the cluster lock, for a non-fake
30150+ * logical cluster with its lock held we do have the expected number of
30151+ * checked-in pages. On the other hand, appending/truncating fake logical
30152+ * clusters doesn't change the number of checked-in pages of any cluster.
30153+ *
30154+ * NOTE-EDWARD: As the cluster lock we use the guard (spinlock_t) of the
30155+ * cluster's jnode. Currently, I don't see any reason to create a special
30156+ * lock for these needs.
30157+ */
30158+
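+/*
+ * Illustration only (not part of reiser4): a minimal user-space sketch of
+ * the control-reference discipline described above, kept under "#if 0" so
+ * it is never compiled. All names here (toy_cluster, toy_checkin,
+ * toy_checkout) are hypothetical. It shows why control references do not
+ * accumulate: only the first checkin after a checkout takes the reference,
+ * and checkout drops it exactly once.
+ */
+#if 0
+#include <assert.h>
+
+struct toy_cluster {
+	int refs;		/* control references currently held */
+	int checked_in;		/* 1 if modifications are checked in */
+};
+
+static void toy_checkin(struct toy_cluster *c)
+{
+	if (!c->checked_in) {
+		c->refs++;	/* 0 -> 1 on the first checkin only */
+		c->checked_in = 1;
+	}
+	/* repeated checkins leave refs == 1, matching the diagram above */
+}
+
+static void toy_checkout(struct toy_cluster *c)
+{
+	assert(c->checked_in);
+	c->refs--;		/* 1 -> 0: dropped exactly once */
+	c->checked_in = 0;
+}
+#endif
+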
30159+static inline void lock_cluster(jnode * node)
30160+{
30161+ spin_lock_jnode(node);
30162+}
30163+
30164+static inline void unlock_cluster(jnode * node)
30165+{
30166+ spin_unlock_jnode(node);
30167+}
30168+
30169+static inline void unlock_cluster_uncapture(jnode * node)
30170+{
30171+ uncapture_cluster_jnode(node);
30172+}
30173+
30174+/* Set new file size by window. Cluster lock is required. */
30175+static void checkin_file_size(struct cluster_handle * clust,
30176+ struct inode * inode)
30177+{
30178+ loff_t new_size;
30179+ struct reiser4_slide * win;
30180+
30181+ assert("edward-1181", clust != NULL);
30182+ assert("edward-1182", inode != NULL);
30183+ assert("edward-1473", clust->pages != NULL);
30184+ assert("edward-1474", clust->pages[0] != NULL);
30185+ assert("edward-1475", jprivate(clust->pages[0]) != NULL);
30186+ assert_spin_locked(&(jprivate(clust->pages[0])->guard));
30187+
30189+ win = clust->win;
30190+ assert("edward-1183", win != NULL);
30191+
30192+ new_size = clust_to_off(clust->index, inode) + win->off;
30193+
30194+ switch (clust->op) {
30195+ case LC_APPOV:
30196+ if (new_size + win->count <= i_size_read(inode))
30197+ /* overwrite only */
30198+ return;
30199+ new_size += win->count;
30200+ break;
30201+ case LC_TRUNC:
30202+ break;
30203+ default:
30204+ impossible("edward-1184", "bad page cluster option");
30205+ break;
30206+ }
30207+ inode_check_scale_nolock(inode, i_size_read(inode), new_size);
30208+ i_size_write(inode, new_size);
30209+ return;
30210+}
30211+
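+/*
+ * Worked example for checkin_file_size() above (illustrative numbers only,
+ * assuming a 64K logical cluster): for clust->index == 2, win->off == 100
+ * and win->count == 50, new_size starts at 2 * 65536 + 100 == 131172.
+ * Under LC_APPOV, if 131172 + 50 == 131222 does not exceed the current
+ * i_size, this is a pure overwrite and the size is left alone; otherwise
+ * the file grows to 131222. Under LC_TRUNC the size becomes 131172 exactly.
+ */
+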
30212+static inline void checkin_cluster_size(struct cluster_handle * clust,
30213+ struct inode * inode)
30214+{
30215+ if (clust->win)
30216+ checkin_file_size(clust, inode);
30217+}
30218+
30219+static int checkin_page_cluster(struct cluster_handle * clust,
30220+ struct inode * inode)
30221+{
30222+ int result;
30223+ jnode * node;
30224+ int old_nrpages = clust->old_nrpages;
30225+ int new_nrpages = get_new_nrpages(clust);
30226+
30227+ node = clust->node;
30228+
30229+ assert("edward-221", node != NULL);
30230+ assert("edward-971", clust->reserved == 1);
30231+ assert("edward-1263",
30232+ clust->reserved_prepped == estimate_update_cluster(inode));
30233+ assert("edward-1264", clust->reserved_unprepped == 0);
30234+
30235+ if (JF_ISSET(node, JNODE_DIRTY)) {
30236+ /*
30237+ * page cluster was checked in, but not yet
30238+ * checked out, so release related resources
30239+ */
30240+ free_reserved4cluster(inode, clust,
30241+ estimate_update_cluster(inode));
30242+ __put_page_cluster(0, clust->old_nrpages,
30243+ clust->pages, inode);
30244+ } else {
30245+ result = capture_cluster_jnode(node);
30246+ if (unlikely(result)) {
30247+ unlock_cluster(node);
30248+ return result;
30249+ }
30250+ jnode_make_dirty_locked(node);
30251+ clust->reserved = 0;
30252+ }
30253+ unlock_cluster(node);
30254+
30255+ if (new_nrpages < old_nrpages) {
30256+ /* truncate >= 1 complete pages */
30257+ __put_page_cluster(new_nrpages,
30258+ old_nrpages - new_nrpages,
30259+ clust->pages, inode);
30260+ truncate_page_cluster_range(inode,
30261+ clust->pages, clust->index,
30262+ new_nrpages,
30263+ old_nrpages - new_nrpages,
30264+ 0);
30265+ }
30266+#if REISER4_DEBUG
30267+ clust->reserved_prepped -= estimate_update_cluster(inode);
30268+#endif
30269+ return 0;
30270+}
30271+
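+/*
+ * Example for the truncation branch above (illustrative numbers only):
+ * if a truncate shrinks a 16-page cluster so that get_new_nrpages()
+ * returns 3, the 13 complete pages with in-cluster indices 3..15 are
+ * released and then truncated from the page cache.
+ */
+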
30272+/* Submit modifications of a logical cluster */
30273+static int checkin_logical_cluster(struct cluster_handle * clust,
30274+ struct inode *inode)
30275+{
30276+ int result = 0;
30277+ jnode * node;
30278+
30279+ node = clust->node;
30280+
30281+ assert("edward-1035", node != NULL);
30282+ assert("edward-1029", clust != NULL);
30283+ assert("edward-1030", clust->reserved == 1);
30284+ assert("edward-1031", clust->nr_pages != 0);
30285+ assert("edward-1032", clust->pages != NULL);
30286+ assert("edward-1033", clust->pages[0] != NULL);
30287+ assert("edward-1446", jnode_is_cluster_page(node));
30288+ assert("edward-1476", node == jprivate(clust->pages[0]));
30289+
30290+ lock_cluster(node);
30291+ checkin_cluster_size(clust, inode);
30292+ /* this will unlock cluster */
30293+ result = checkin_page_cluster(clust, inode);
30294+ jput(node);
30295+ clust->node = NULL;
30296+ return result;
30297+}
30298+
30299+/*
30300+ * Retrieve the size of the logical cluster that was checked in
30301+ * during the latest modifying session (cluster lock is required)
30302+ */
30303+static inline void checkout_cluster_size(struct cluster_handle * clust,
30304+ struct inode * inode)
30305+{
30306+ struct tfm_cluster *tc = &clust->tc;
30307+
30308+ tc->len = lbytes(clust->index, inode);
30309+ assert("edward-1478", tc->len != 0);
30310+}
30311+
30312+/*
30313+ * Retrieve a page cluster with the latest submitted modifications
30314+ * and flush its pages to previously allocated contiguous stream.
30315+ */
30316+static void checkout_page_cluster(struct cluster_handle * clust,
30317+ jnode * node, struct inode * inode)
30318+{
30319+ int i;
30320+ int found;
30321+ int to_put;
30322+ struct tfm_cluster *tc = &clust->tc;
30323+
30324+	/* find and put checked-in pages: the cluster is locked,
30325+	 * so we must find the expected number (to_put) of pages
30326+ */
30327+ to_put = size_in_pages(lbytes(clust->index, inode));
30328+ found = find_get_pages(inode->i_mapping,
30329+ clust_to_pg(clust->index, inode),
30330+ to_put, clust->pages);
30331+ BUG_ON(found != to_put);
30332+
30333+ __put_page_cluster(0, to_put, clust->pages, inode);
30334+ unlock_cluster_uncapture(node);
30335+
30336+ /* Flush found pages.
30337+ *
30338+	 * Note that we don't disable modifications while flushing;
30339+	 * moreover, some of the found pages can be truncated, as we
30340+	 * have released the cluster lock.
30341+ */
30342+ for (i = 0; i < found; i++) {
30343+ int in_page;
30344+ char * data;
30345+ assert("edward-1479",
30346+ clust->pages[i]->index == clust->pages[0]->index + i);
30347+
30348+ lock_page(clust->pages[i]);
30349+ if (!PageUptodate(clust->pages[i])) {
30350+ /* page was truncated */
30351+ assert("edward-1480",
30352+ i_size_read(inode) <= page_offset(clust->pages[i]));
30353+ assert("edward-1481",
30354+ clust->pages[i]->mapping != inode->i_mapping);
30355+ unlock_page(clust->pages[i]);
30356+ break;
30357+ }
30358+		/* Update the number of bytes in the logical cluster,
30359+		 * as it could have been partially truncated. Note that
30360+		 * only a partial truncate is possible here (a complete
30361+		 * truncate cannot reach this point: it is performed via
30362+		 * ->kill_hook() called by cut_file_items(), and the latter
30363+		 * must wait for the znode locked with the parent coord).
30364+ */
30365+ checkout_cluster_size(clust, inode);
30366+
30367+ /* this can be zero, as new file size is
30368+ checked in before truncating pages */
30369+ in_page = __mbp(tc->len, i);
30370+
30371+ data = kmap(clust->pages[i]);
30372+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
30373+ data, in_page);
30374+ kunmap(clust->pages[i]);
30375+
30376+ if (PageDirty(clust->pages[i]))
30377+ cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE);
30378+
30379+ unlock_page(clust->pages[i]);
30380+
30381+ if (in_page < PAGE_CACHE_SIZE)
30382+ /* end of the file */
30383+ break;
30384+ }
30385+ put_found_pages(clust->pages, found); /* find_get_pages */
30386+ tc->lsize = tc->len;
30387+ return;
30388+}
30389+
30390+/* Check out modifications of a logical cluster */
30391+int checkout_logical_cluster(struct cluster_handle * clust,
30392+ jnode * node, struct inode *inode)
30393+{
30394+ int result;
30395+ struct tfm_cluster *tc = &clust->tc;
30396+
30397+ assert("edward-980", node != NULL);
30398+ assert("edward-236", inode != NULL);
30399+ assert("edward-237", clust != NULL);
30400+ assert("edward-240", !clust->win);
30401+ assert("edward-241", reiser4_schedulable());
30402+ assert("edward-718", cryptcompress_inode_ok(inode));
30403+
30404+ result = grab_tfm_stream(inode, tc, INPUT_STREAM);
30405+ if (result) {
30406+ warning("edward-1430", "alloc stream failed with ret=%d",
30407+ result);
30408+ return RETERR(-E_REPEAT);
30409+ }
30410+ lock_cluster(node);
30411+
30412+ if (unlikely(!JF_ISSET(node, JNODE_DIRTY))) {
30413+ /* race with another flush */
30414+ warning("edward-982",
30415+ "checking out logical cluster %lu of inode %llu: "
30416+ "jnode is not dirty", clust->index,
30417+ (unsigned long long)get_inode_oid(inode));
30418+ unlock_cluster(node);
30419+ return RETERR(-E_REPEAT);
30420+ }
30421+ cluster_reserved2grabbed(estimate_update_cluster(inode));
30422+
30423+ /* this will unlock cluster */
30424+ checkout_page_cluster(clust, node, inode);
30425+ return 0;
30426+}
30427+
30428+/* set a hint for the cluster of index @index */
30429+static void set_hint_cluster(struct inode *inode, hint_t * hint,
30430+ cloff_t index, znode_lock_mode mode)
30431+{
30432+ reiser4_key key;
30433+ assert("edward-722", cryptcompress_inode_ok(inode));
30434+ assert("edward-723",
30435+ inode_file_plugin(inode) ==
30436+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
30437+
30438+ inode_file_plugin(inode)->key_by_inode(inode,
30439+ clust_to_off(index, inode),
30440+ &key);
30441+
30442+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
30443+ hint->offset = get_key_offset(&key);
30444+ hint->mode = mode;
30445+}
30446+
30447+void invalidate_hint_cluster(struct cluster_handle * clust)
30448+{
30449+ assert("edward-1291", clust != NULL);
30450+ assert("edward-1292", clust->hint != NULL);
30451+
30452+ done_lh(&clust->hint->lh);
30453+ hint_clr_valid(clust->hint);
30454+}
30455+
30456+static void put_hint_cluster(struct cluster_handle * clust,
30457+ struct inode *inode, znode_lock_mode mode)
30458+{
30459+ assert("edward-1286", clust != NULL);
30460+ assert("edward-1287", clust->hint != NULL);
30461+
30462+ set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
30463+ invalidate_hint_cluster(clust);
30464+}
30465+
30466+static int balance_dirty_page_cluster(struct cluster_handle * clust,
30467+ struct inode *inode, loff_t off,
30468+ loff_t to_file)
30469+{
30470+ int result;
30471+ struct cryptcompress_info * info;
30472+
30473+ assert("edward-724", inode != NULL);
30474+ assert("edward-725", cryptcompress_inode_ok(inode));
30475+
30476+ /* set next window params */
30477+ move_update_window(inode, clust, off, to_file);
30478+
30479+ result = update_sd_cryptcompress(inode);
30480+ if (result)
30481+ return result;
30482+ assert("edward-726", clust->hint->lh.owner == NULL);
30483+ info = cryptcompress_inode_data(inode);
30484+
30485+ mutex_unlock(&info->checkin_mutex);
30486+ reiser4_throttle_write(inode);
30487+ mutex_lock(&info->checkin_mutex);
30488+ return 0;
30489+}
30490+
30491+/* zero out the page cluster, process it, and maybe try to capture
30492+   its pages */
30493+static int write_hole(struct inode *inode, struct cluster_handle * clust,
30494+ loff_t file_off, loff_t to_file)
30495+{
30496+ int result = 0;
30497+ unsigned cl_off, cl_count = 0;
30498+ unsigned to_pg, pg_off;
30499+ struct reiser4_slide * win;
30500+
30501+ assert("edward-190", clust != NULL);
30502+ assert("edward-1069", clust->win != NULL);
30503+ assert("edward-191", inode != NULL);
30504+ assert("edward-727", cryptcompress_inode_ok(inode));
30505+ assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
30506+ assert("edward-1154",
30507+ ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
30508+
30509+ win = clust->win;
30510+
30511+ assert("edward-1070", win != NULL);
30512+ assert("edward-201", win->stat == HOLE_WINDOW);
30513+ assert("edward-192", cluster_ok(clust, inode));
30514+
30515+ if (win->off == 0 && win->count == inode_cluster_size(inode)) {
30516+		/* This part of the hole will be represented by a "fake"
30517+		 * logical cluster, i.e. one which doesn't have a matching
30518+		 * disk cluster until someone modifies this logical cluster
30519+		 * and makes it dirty.
30520+		 * So just move forward here..
30521+ */
30522+ move_update_window(inode, clust, file_off, to_file);
30523+ return 0;
30524+ }
30525+ cl_count = win->count; /* number of zeroes to write */
30526+ cl_off = win->off;
30527+ pg_off = off_to_pgoff(win->off);
30528+
30529+ while (cl_count) {
30530+ struct page *page;
30531+ page = clust->pages[off_to_pg(cl_off)];
30532+
30533+ assert("edward-284", page != NULL);
30534+
30535+ to_pg = min((typeof(pg_off))PAGE_CACHE_SIZE - pg_off, cl_count);
30536+ lock_page(page);
30537+ zero_user_page(page, pg_off, to_pg, KM_USER0);
30538+ SetPageUptodate(page);
30539+ reiser4_set_page_dirty_internal(page);
30540+ mark_page_accessed(page);
30541+ unlock_page(page);
30542+
30543+ cl_off += to_pg;
30544+ cl_count -= to_pg;
30545+ pg_off = 0;
30546+ }
30547+ if (!win->delta) {
30548+		/* only zeroes in this window,
30549+		   try to capture them */
30550+ result = checkin_logical_cluster(clust, inode);
30551+ if (result)
30552+ return result;
30553+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
30554+ result =
30555+ balance_dirty_page_cluster(clust, inode, file_off, to_file);
30556+ } else
30557+ move_update_window(inode, clust, file_off, to_file);
30558+ return result;
30559+}
30560+
30561+/*
30562+  The main disk search procedure for the cryptcompress plugin, which
30563+  . scans all items of a disk cluster with the lock mode @mode;
30564+  . maybe reads each one (if @read);
30565+  . maybe makes its znode dirty (if the write lock mode was specified).
30566+
30567+  NOTE-EDWARD: Callers should handle the case when the disk cluster
30568+  is incomplete (-EIO)
30569+*/
30570+int find_disk_cluster(struct cluster_handle * clust,
30571+ struct inode *inode, int read, znode_lock_mode mode)
30572+{
30573+ flow_t f;
30574+ hint_t *hint;
30575+ int result = 0;
30576+ int was_grabbed;
30577+ ra_info_t ra_info;
30578+ file_plugin *fplug;
30579+ item_plugin *iplug;
30580+ struct tfm_cluster *tc;
30581+ struct cryptcompress_info * info;
30582+
30583+ assert("edward-138", clust != NULL);
30584+ assert("edward-728", clust->hint != NULL);
30585+ assert("edward-226", reiser4_schedulable());
30586+ assert("edward-137", inode != NULL);
30587+ assert("edward-729", cryptcompress_inode_ok(inode));
30588+
30589+ hint = clust->hint;
30590+ fplug = inode_file_plugin(inode);
30591+ was_grabbed = get_current_context()->grabbed_blocks;
30592+ info = cryptcompress_inode_data(inode);
30593+ tc = &clust->tc;
30594+
30595+ assert("edward-462", !tfm_cluster_is_uptodate(tc));
30596+ assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
30597+
30598+ dclust_init_extension(hint);
30599+
30600+ /* set key of the first disk cluster item */
30601+ fplug->flow_by_inode(inode,
30602+ (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
30603+ 0 /* kernel space */ ,
30604+ inode_scaled_cluster_size(inode),
30605+ clust_to_off(clust->index, inode), READ_OP, &f);
30606+ if (mode == ZNODE_WRITE_LOCK) {
30607+		/* reserve space so that flush can make dirty all the
30608+		   leaf nodes which contain the disk cluster */
30609+ result =
30610+ reiser4_grab_space_force(estimate_dirty_cluster(inode),
30611+ BA_CAN_COMMIT);
30612+ if (result)
30613+ goto out;
30614+ }
30615+
30616+ ra_info.key_to_stop = f.key;
30617+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
30618+
30619+ while (f.length) {
30620+ result = find_cluster_item(hint, &f.key, mode,
30621+ NULL, FIND_EXACT,
30622+ (mode == ZNODE_WRITE_LOCK ?
30623+ CBK_FOR_INSERT : 0));
30624+ switch (result) {
30625+ case CBK_COORD_NOTFOUND:
30626+ result = 0;
30627+ if (inode_scaled_offset
30628+ (inode, clust_to_off(clust->index, inode)) ==
30629+ get_key_offset(&f.key)) {
30630+				/* first item not found; this is treated
30631+				   as the disk cluster being absent */
30632+ clust->dstat = FAKE_DISK_CLUSTER;
30633+ goto out;
30634+ }
30635+ /* we are outside the cluster, stop search here */
30636+ assert("edward-146",
30637+ f.length != inode_scaled_cluster_size(inode));
30638+ goto ok;
30639+ case CBK_COORD_FOUND:
30640+ assert("edward-148",
30641+ hint->ext_coord.coord.between == AT_UNIT);
30642+ assert("edward-460",
30643+ hint->ext_coord.coord.unit_pos == 0);
30644+
30645+ coord_clear_iplug(&hint->ext_coord.coord);
30646+ result = zload_ra(hint->ext_coord.coord.node, &ra_info);
30647+ if (unlikely(result))
30648+ goto out;
30649+ iplug = item_plugin_by_coord(&hint->ext_coord.coord);
30650+ assert("edward-147",
30651+ item_id_by_coord(&hint->ext_coord.coord) ==
30652+ CTAIL_ID);
30653+
30654+ result = iplug->s.file.read(NULL, &f, hint);
30655+ if (result) {
30656+ zrelse(hint->ext_coord.coord.node);
30657+ goto out;
30658+ }
30659+ if (mode == ZNODE_WRITE_LOCK) {
30660+				/* Don't dirty more nodes than was
30661+				   estimated (see comments before
30662+				   estimate_dirty_cluster). Missed nodes will be
30663+				   read in at flush time if they have been
30664+				   evicted from memory */
30665+ if (dclust_get_extension_ncount(hint) <=
30666+ estimate_dirty_cluster(inode))
30667+ znode_make_dirty(hint->ext_coord.coord.node);
30668+
30669+ znode_set_convertible(hint->ext_coord.coord.
30670+ node);
30671+ }
30672+ zrelse(hint->ext_coord.coord.node);
30673+ break;
30674+ default:
30675+ goto out;
30676+ }
30677+ }
30678+ ok:
30679+ /* at least one item was found */
30680+ /* NOTE-EDWARD: Callers should handle the case
30681+ when disk cluster is incomplete (-EIO) */
30682+ tc->len = inode_scaled_cluster_size(inode) - f.length;
30683+ tc->lsize = lbytes(clust->index, inode);
30684+ assert("edward-1196", tc->len > 0);
30685+ assert("edward-1406", tc->lsize > 0);
30686+
30687+ if (hint_is_unprepped_dclust(clust->hint)) {
30688+ clust->dstat = UNPR_DISK_CLUSTER;
30689+ } else if (clust->index == info->trunc_index) {
30690+ clust->dstat = TRNC_DISK_CLUSTER;
30691+ } else {
30692+ clust->dstat = PREP_DISK_CLUSTER;
30693+ dclust_set_extension_dsize(clust->hint, tc->len);
30694+ }
30695+ out:
30696+ assert("edward-1339",
30697+ get_current_context()->grabbed_blocks >= was_grabbed);
30698+ grabbed2free(get_current_context(),
30699+ get_current_super_private(),
30700+ get_current_context()->grabbed_blocks - was_grabbed);
30701+ return result;
30702+}
30703+
30704+int get_disk_cluster_locked(struct cluster_handle * clust, struct inode *inode,
30705+ znode_lock_mode lock_mode)
30706+{
30707+ reiser4_key key;
30708+ ra_info_t ra_info;
30709+
30710+ assert("edward-730", reiser4_schedulable());
30711+ assert("edward-731", clust != NULL);
30712+ assert("edward-732", inode != NULL);
30713+
30714+ if (hint_is_valid(clust->hint)) {
30715+ assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
30716+ assert("edward-1294",
30717+ znode_is_write_locked(clust->hint->lh.node));
30718+ /* already have a valid locked position */
30719+ return (clust->dstat ==
30720+ FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
30721+ CBK_COORD_FOUND);
30722+ }
30723+ key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
30724+ &key);
30725+ ra_info.key_to_stop = key;
30726+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
30727+
30728+ return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
30729+ CBK_FOR_INSERT);
30730+}
30731+
30732+/* Read the needed cluster pages before modifying.
30733+   On success, @clust->hint contains a locked position in the tree.
30734+   Also:
30735+   . find and set the disk cluster state;
30736+   . make the disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
30737+*/
30738+static int read_some_cluster_pages(struct inode * inode,
30739+ struct cluster_handle * clust)
30740+{
30741+ int i;
30742+ int result = 0;
30743+ item_plugin *iplug;
30744+ struct reiser4_slide * win = clust->win;
30745+ znode_lock_mode mode = ZNODE_WRITE_LOCK;
30746+
30747+ iplug = item_plugin_by_id(CTAIL_ID);
30748+
30749+ assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
30750+
30751+#if REISER4_DEBUG
30752+ if (clust->nr_pages == 0) {
30753+ /* start write hole from fake disk cluster */
30754+ assert("edward-1117", win != NULL);
30755+ assert("edward-1118", win->stat == HOLE_WINDOW);
30756+ assert("edward-1119", new_logical_cluster(clust, inode));
30757+ }
30758+#endif
30759+ if (new_logical_cluster(clust, inode)) {
30760+ /*
30761+		   a new page cluster is about to be written, nothing to read
30762+ */
30763+ assert("edward-734", reiser4_schedulable());
30764+ assert("edward-735", clust->hint->lh.owner == NULL);
30765+
30766+ if (clust->nr_pages) {
30767+ int off;
30768+ struct page * pg;
30769+ assert("edward-1419", clust->pages != NULL);
30770+ pg = clust->pages[clust->nr_pages - 1];
30771+ assert("edward-1420", pg != NULL);
30772+ off = off_to_pgoff(win->off+win->count+win->delta);
30773+ if (off) {
30774+ lock_page(pg);
30775+ zero_user_page(pg, off, PAGE_CACHE_SIZE - off,
30776+ KM_USER0);
30777+ unlock_page(pg);
30778+ }
30779+ }
30780+ clust->dstat = FAKE_DISK_CLUSTER;
30781+ return 0;
30782+ }
30783+ /*
30784+	   Here we should search for the disk cluster to figure out its real
30785+	   state. There is also one more important reason to do the disk
30786+	   search: we need to make the disk cluster _dirty_ if it exists
30787+ */
30788+
30789+	/* if a window is specified, read only the pages
30790+	   that will be modified partially */
30791+
30792+ for (i = 0; i < clust->nr_pages; i++) {
30793+ struct page *pg = clust->pages[i];
30794+
30795+ lock_page(pg);
30796+ if (PageUptodate(pg)) {
30797+ unlock_page(pg);
30798+ continue;
30799+ }
30800+ unlock_page(pg);
30801+
30802+ if (win &&
30803+ i >= size_in_pages(win->off) &&
30804+ i < off_to_pg(win->off + win->count + win->delta))
30805+ /* page will be completely overwritten */
30806+ continue;
30807+
30808+ if (win && (i == clust->nr_pages - 1) &&
30809+ /* the last page is
30810+ partially modified,
30811+ not uptodate .. */
30812+ (size_in_pages(i_size_read(inode)) <= pg->index)) {
30813+			/* .. and appended,
30814+			   so zero out the rest */
30815+ int offset;
30816+ lock_page(pg);
30817+ assert("edward-1260",
30818+ size_in_pages(win->off + win->count +
30819+ win->delta) - 1 == i);
30820+
30821+ offset =
30822+ off_to_pgoff(win->off + win->count + win->delta);
30823+ zero_user_page(pg, offset, PAGE_CACHE_SIZE - offset,
30824+ KM_USER0);
30825+ unlock_page(pg);
30826+ /* still not uptodate */
30827+ break;
30828+ }
30829+ lock_page(pg);
30830+ result = do_readpage_ctail(inode, clust, pg, mode);
30831+
30832+ assert("edward-1526", ergo(!result, PageUptodate(pg)));
30833+ unlock_page(pg);
30834+ if (result) {
30835+ warning("edward-219", "do_readpage_ctail failed");
30836+ goto out;
30837+ }
30838+ }
30839+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
30840+		/* the disk cluster is unclaimed, but we need to make its
30841+		 * znodes dirty so that the flush update will convert its content
30842+ */
30843+ result = find_disk_cluster(clust, inode,
30844+ 0 /* do not read items */,
30845+ mode);
30846+ }
30847+ out:
30848+ tfm_cluster_clr_uptodate(&clust->tc);
30849+ return result;
30850+}
30851+
30852+static int should_create_unprepped_cluster(struct cluster_handle * clust,
30853+ struct inode * inode)
30854+{
30855+ assert("edward-737", clust != NULL);
30856+
30857+ switch (clust->dstat) {
30858+ case PREP_DISK_CLUSTER:
30859+ case UNPR_DISK_CLUSTER:
30860+ return 0;
30861+ case FAKE_DISK_CLUSTER:
30862+ if (clust->win &&
30863+ clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
30864+ assert("edward-1172",
30865+ new_logical_cluster(clust, inode));
30866+ return 0;
30867+ }
30868+ return 1;
30869+ default:
30870+ impossible("edward-1173", "bad disk cluster state");
30871+ return 0;
30872+ }
30873+}
30874+
30875+static int cryptcompress_make_unprepped_cluster(struct cluster_handle * clust,
30876+ struct inode *inode)
30877+{
30878+ int result;
30879+
30880+ assert("edward-1123", reiser4_schedulable());
30881+ assert("edward-737", clust != NULL);
30882+ assert("edward-738", inode != NULL);
30883+ assert("edward-739", cryptcompress_inode_ok(inode));
30884+ assert("edward-1053", clust->hint != NULL);
30885+
30886+ if (!should_create_unprepped_cluster(clust, inode)) {
30887+ if (clust->reserved) {
30888+ cluster_reserved2free(estimate_insert_cluster(inode));
30889+#if REISER4_DEBUG
30890+ assert("edward-1267",
30891+ clust->reserved_unprepped ==
30892+ estimate_insert_cluster(inode));
30893+ clust->reserved_unprepped -=
30894+ estimate_insert_cluster(inode);
30895+#endif
30896+ }
30897+ return 0;
30898+ }
30899+ assert("edward-1268", clust->reserved);
30900+ cluster_reserved2grabbed(estimate_insert_cluster(inode));
30901+#if REISER4_DEBUG
30902+ assert("edward-1441",
30903+ clust->reserved_unprepped == estimate_insert_cluster(inode));
30904+ clust->reserved_unprepped -= estimate_insert_cluster(inode);
30905+#endif
30906+ result = ctail_insert_unprepped_cluster(clust, inode);
30907+ if (result)
30908+ return result;
30909+
30910+ inode_add_bytes(inode, inode_cluster_size(inode));
30911+
30912+ assert("edward-743", cryptcompress_inode_ok(inode));
30913+ assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
30914+
30915+ clust->dstat = UNPR_DISK_CLUSTER;
30916+ return 0;
30917+}
30918+
30919+/* . Grab page cluster for read, write, setattr, etc. operations;
30920+ * . Truncate its complete pages, if needed;
30921+ */
30922+int prepare_page_cluster(struct inode * inode, struct cluster_handle * clust,
30923+ rw_op rw)
30924+{
30925+ assert("edward-177", inode != NULL);
30926+ assert("edward-741", cryptcompress_inode_ok(inode));
30927+ assert("edward-740", clust->pages != NULL);
30928+
30929+ set_cluster_nrpages(clust, inode);
30930+ reset_cluster_pgset(clust, cluster_nrpages(inode));
30931+ return grab_page_cluster(inode, clust, rw);
30932+}
30933+
30934+/* Truncate the complete page cluster of index @index.
30935+ * This is called by the ->kill_hook() method of an item
30936+ * plugin when deleting a disk cluster of that index.
30937+ */
30938+void truncate_complete_page_cluster(struct inode *inode, cloff_t index,
30939+ int even_cows)
30940+{
30941+ int found;
30942+ int nr_pages;
30943+ jnode *node;
30944+ struct page *pages[MAX_CLUSTER_NRPAGES];
30945+
30946+ node = jlookup(current_tree, get_inode_oid(inode),
30947+ clust_to_pg(index, inode));
30948+ nr_pages = size_in_pages(lbytes(index, inode));
30949+ assert("edward-1483", nr_pages != 0);
30950+ if (!node)
30951+ goto truncate;
30952+ found = find_get_pages(inode->i_mapping,
30953+ clust_to_pg(index, inode),
30954+ cluster_nrpages(inode), pages);
30955+ if (!found) {
30956+ assert("edward-1484", jnode_truncate_ok(inode, index));
30957+ return;
30958+ }
30959+ lock_cluster(node);
30960+
30961+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS)
30962+ && index == 0)
30963+ /* converting to unix_file is in progress */
30964+ JF_CLR(node, JNODE_CLUSTER_PAGE);
30965+ if (JF_ISSET(node, JNODE_DIRTY)) {
30966+ /*
30967+		 * @nr_pages were checked in, but not yet checked out, so
30968+		 * we need to release them (there can also be pages
30969+		 * attached to the page cache by read(), etc.; don't take
30970+		 * them into account).
30971+ */
30972+ assert("edward-1198", found >= nr_pages);
30973+
30974+ /* free disk space grabbed for disk cluster converting */
30975+ cluster_reserved2grabbed(estimate_update_cluster(inode));
30976+ grabbed2free(get_current_context(),
30977+ get_current_super_private(),
30978+ estimate_update_cluster(inode));
30979+ __put_page_cluster(0, nr_pages, pages, inode);
30980+
30981+ /* This will clear dirty bit, uncapture and unlock jnode */
30982+ unlock_cluster_uncapture(node);
30983+ } else
30984+ unlock_cluster(node);
30985+ jput(node); /* jlookup */
30986+ put_found_pages(pages, found); /* find_get_pages */
30987+ truncate:
30988+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) &&
30989+ index == 0)
30990+ return;
30991+ truncate_page_cluster_range(inode, pages, index, 0,
30992+ cluster_nrpages(inode),
30993+ even_cows);
30994+ assert("edward-1201",
30995+ ergo(!reiser4_inode_get_flag(inode,
30996+ REISER4_FILE_CONV_IN_PROGRESS),
30997+ jnode_truncate_ok(inode, index)));
30998+ return;
30999+}
31000+
31001+/*
31002+ * Set up the cluster handle @clust of a logical cluster before
31003+ * modifications that are supposed to be committed:
31004+ *
31005+ * . grab cluster pages;
31006+ * . reserve disk space;
31007+ * . maybe read pages from disk and set the disk cluster dirty;
31008+ * . maybe write hole and check in (partially zeroed) logical cluster;
31009+ * . create 'unprepped' disk cluster for new or fake logical one.
31010+ */
31011+static int prepare_logical_cluster(struct inode *inode,
31012+ loff_t file_off, /* write position
31013+ in the file */
31014+ loff_t to_file, /* bytes of users data
31015+ to write to the file */
31016+ struct cluster_handle * clust,
31017+ logical_cluster_op op)
31018+{
31019+ int result = 0;
31020+ struct reiser4_slide * win = clust->win;
31021+
31022+ reset_cluster_params(clust);
31023+ cluster_set_tfm_act(&clust->tc, TFMA_READ);
31024+#if REISER4_DEBUG
31025+ clust->ctx = get_current_context();
31026+#endif
31027+ assert("edward-1190", op != LC_INVAL);
31028+
31029+ clust->op = op;
31030+
31031+ result = prepare_page_cluster(inode, clust, WRITE_OP);
31032+ if (result)
31033+ return result;
31034+ assert("edward-1447",
31035+ ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
31036+ assert("edward-1448",
31037+ ergo(clust->nr_pages != 0,
31038+ jnode_is_cluster_page(jprivate(clust->pages[0]))));
31039+
31040+ result = reserve4cluster(inode, clust);
31041+ if (result)
31042+ goto err1;
31043+ result = read_some_cluster_pages(inode, clust);
31044+ if (result) {
31045+ free_reserved4cluster(inode,
31046+ clust,
31047+ estimate_update_cluster(inode) +
31048+ estimate_insert_cluster(inode));
31049+ goto err1;
31050+ }
31051+ assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
31052+
31053+ result = cryptcompress_make_unprepped_cluster(clust, inode);
31054+ if (result)
31055+ goto err2;
31056+ if (win && win->stat == HOLE_WINDOW) {
31057+ result = write_hole(inode, clust, file_off, to_file);
31058+ if (result)
31059+ goto err2;
31060+ }
31061+ return 0;
31062+ err2:
31063+ free_reserved4cluster(inode, clust,
31064+ estimate_update_cluster(inode));
31065+ err1:
31066+ put_page_cluster(clust, inode, WRITE_OP);
31067+ assert("edward-1125", result == -ENOSPC);
31068+ return result;
31069+}
31070+
31071+/* set window by two offsets */
31072+static void set_window(struct cluster_handle * clust,
31073+ struct reiser4_slide * win, struct inode *inode,
31074+ loff_t o1, loff_t o2)
31075+{
31076+ assert("edward-295", clust != NULL);
31077+ assert("edward-296", inode != NULL);
31078+ assert("edward-1071", win != NULL);
31079+ assert("edward-297", o1 <= o2);
31080+
31081+ clust->index = off_to_clust(o1, inode);
31082+
31083+ win->off = off_to_cloff(o1, inode);
31084+ win->count = min((loff_t)(inode_cluster_size(inode) - win->off),
31085+ o2 - o1);
31086+ win->delta = 0;
31087+
31088+ clust->win = win;
31089+}
31090+
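+/*
+ * Worked example for set_window() above (illustrative numbers only,
+ * assuming a 64K logical cluster): for o1 == 70000 and o2 == 100000 we
+ * get clust->index == 1 (70000 / 65536), win->off == 4464 (70000 % 65536)
+ * and win->count == min(65536 - 4464, 30000) == 30000, i.e. this window
+ * fits entirely in logical cluster 1.
+ */
+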
31091+static int set_cluster_by_window(struct inode *inode,
31092+ struct cluster_handle * clust,
31093+ struct reiser4_slide * win, size_t length,
31094+ loff_t file_off)
31095+{
31096+ int result;
31097+
31098+ assert("edward-197", clust != NULL);
31099+ assert("edward-1072", win != NULL);
31100+ assert("edward-198", inode != NULL);
31101+
31102+ result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
31103+ if (result)
31104+ return result;
31105+
31106+ if (file_off > i_size_read(inode)) {
31107+ /* Uhmm, hole in cryptcompress file... */
31108+ loff_t hole_size;
31109+ hole_size = file_off - inode->i_size;
31110+
31111+ set_window(clust, win, inode, inode->i_size, file_off);
31112+ win->stat = HOLE_WINDOW;
31113+ if (win->off + hole_size < inode_cluster_size(inode))
31114+ /* there is also user's data to append to the hole */
31115+ win->delta = min(inode_cluster_size(inode) -
31116+ (win->off + win->count), length);
31117+ return 0;
31118+ }
31119+ set_window(clust, win, inode, file_off, file_off + length);
31120+ win->stat = DATA_WINDOW;
31121+ return 0;
31122+}
31123+
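+/*
+ * Worked example for the hole case above (illustrative numbers only,
+ * assuming a 64K logical cluster): with i_size == 10000 and a write of
+ * length == 5000 at file_off == 200000, the hole spans [10000, 200000).
+ * The first window is set over the hole: clust->index == 0, win->off ==
+ * 10000, win->count == min(65536 - 10000, 190000) == 55536 and win->stat
+ * == HOLE_WINDOW. Since win->off + hole_size == 200000 is not less than
+ * the cluster size, the user's data lands in a later cluster and
+ * win->delta stays 0.
+ */
+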
31124+int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
31125+ int count)
31126+{
31127+ int result = 0;
31128+ int (*setting_actor)(struct cluster_handle * clust, int count);
31129+
31130+ assert("edward-1358", clust != NULL);
31131+ assert("edward-1359", page != NULL);
31132+ assert("edward-1360", page->mapping != NULL);
31133+ assert("edward-1361", page->mapping->host != NULL);
31134+
31135+ setting_actor =
31136+ (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
31137+ result = setting_actor(clust, count);
31138+ clust->index = pg_to_clust(page->index, page->mapping->host);
31139+ return result;
31140+}
31141+
31142+/* reset all the params that don't get updated */
31143+void reset_cluster_params(struct cluster_handle * clust)
31144+{
31145+ assert("edward-197", clust != NULL);
31146+
31147+ clust->dstat = INVAL_DISK_CLUSTER;
31148+ clust->tc.uptodate = 0;
31149+ clust->tc.len = 0;
31150+}
31151+
31152+/* the heart of write_cryptcompress */
31153+static loff_t do_write_cryptcompress(struct file *file, struct inode *inode,
31154+ const char __user *buf, size_t to_write,
31155+ loff_t pos, int *conv_occured)
31156+{
31157+ int i;
31158+ hint_t *hint;
31159+ int result = 0;
31160+ size_t count;
31161+ struct reiser4_slide win;
31162+ struct cluster_handle clust;
31163+ struct cryptcompress_info * info;
31164+
31165+ assert("edward-161", reiser4_schedulable());
31166+ assert("edward-748", cryptcompress_inode_ok(inode));
31167+ assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
31168+ assert("edward-1274", get_current_context()->grabbed_blocks == 0);
31169+
31170+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31171+ if (hint == NULL)
31172+ return RETERR(-ENOMEM);
31173+
31174+ result = load_file_hint(file, hint);
31175+ if (result) {
31176+ kfree(hint);
31177+ return result;
31178+ }
31179+ count = to_write;
31180+
31181+ reiser4_slide_init(&win);
31182+ cluster_init_read(&clust, &win);
31183+ clust.hint = hint;
31184+ info = cryptcompress_inode_data(inode);
31185+
31186+ mutex_lock(&info->checkin_mutex);
31187+
31188+ result = set_cluster_by_window(inode, &clust, &win, to_write, pos);
31189+ if (result)
31190+ goto out;
31191+
31192+ if (next_window_stat(&win) == HOLE_WINDOW) {
31193+		/* write the hole in this iteration,
31194+		   separately from the loop below */
31195+ result = write_conversion_hook(file, inode,
31196+ pos,
31197+ &clust,
31198+ NULL);
31199+ if (result)
31200+ goto out;
31201+ result = prepare_logical_cluster(inode, pos, count, &clust,
31202+ LC_APPOV);
31203+ if (result)
31204+ goto out;
31205+ }
31206+ do {
31207+ const char __user * src;
31208+ unsigned page_off, to_page;
31209+
31210+ assert("edward-750", reiser4_schedulable());
31211+
31212+ result = write_conversion_hook(file, inode,
31213+ pos + to_write - count,
31214+ &clust,
31215+ conv_occured);
31216+ if (result || *conv_occured)
31217+ goto out;
31218+ result = prepare_logical_cluster(inode, pos, count, &clust,
31219+ LC_APPOV);
31220+ if (result)
31221+ goto out;
31222+
31223+ assert("edward-751", cryptcompress_inode_ok(inode));
31224+ assert("edward-204", win.stat == DATA_WINDOW);
31225+ assert("edward-1288", hint_is_valid(clust.hint));
31226+ assert("edward-752",
31227+ znode_is_write_locked(hint->ext_coord.coord.node));
31228+ put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
31229+
31230+ /* set write position in page */
31231+ page_off = off_to_pgoff(win.off);
31232+
31233+ /* copy user's data to cluster pages */
31234+ for (i = off_to_pg(win.off), src = buf;
31235+ i < size_in_pages(win.off + win.count);
31236+ i++, src += to_page) {
31237+ to_page = __mbp(win.off + win.count, i) - page_off;
31238+ assert("edward-1039",
31239+ page_off + to_page <= PAGE_CACHE_SIZE);
31240+ assert("edward-287", clust.pages[i] != NULL);
31241+
31242+ fault_in_pages_readable(src, to_page);
31243+
31244+ lock_page(clust.pages[i]);
31245+ result =
31246+ __copy_from_user((char *)kmap(clust.pages[i]) +
31247+ page_off, src, to_page);
31248+ kunmap(clust.pages[i]);
31249+ if (unlikely(result)) {
31250+ unlock_page(clust.pages[i]);
31251+ result = -EFAULT;
31252+ goto err2;
31253+ }
31254+ SetPageUptodate(clust.pages[i]);
31255+ reiser4_set_page_dirty_internal(clust.pages[i]);
31256+ flush_dcache_page(clust.pages[i]);
31257+ mark_page_accessed(clust.pages[i]);
31258+ unlock_page(clust.pages[i]);
31259+ page_off = 0;
31260+ }
31261+ assert("edward-753", cryptcompress_inode_ok(inode));
31262+
31263+ result = checkin_logical_cluster(&clust, inode);
31264+ if (result)
31265+ goto err2;
31266+
31267+ buf += win.count;
31268+ count -= win.count;
31269+
31270+ result = balance_dirty_page_cluster(&clust, inode, 0, count);
31271+ if (result)
31272+ goto err1;
31273+ assert("edward-755", hint->lh.owner == NULL);
31274+ reset_cluster_params(&clust);
31275+ continue;
31276+ err2:
31277+ put_page_cluster(&clust, inode, WRITE_OP);
31278+ err1:
31279+ if (clust.reserved)
31280+ free_reserved4cluster(inode,
31281+ &clust,
31282+ estimate_update_cluster(inode));
31283+ break;
31284+ } while (count);
31285+ out:
31286+ /*
31287+	 * NOTE: at this point the file may have
31288+ * another (unix-file) plugin installed
31289+ */
31290+ done_lh(&hint->lh);
31291+ if (result == -EEXIST)
31292+ warning("edward-1407", "write returns EEXIST!\n");
31293+
31294+ put_cluster_handle(&clust);
31295+ save_file_hint(file, hint);
31296+ kfree(hint);
31297+ /*
31298+ * don't release cryptcompress-specific
31299+	 * checkin_mutex if conversion occurred
31300+ */
31301+ if (*conv_occured == 0)
31302+ mutex_unlock(&info->checkin_mutex);
31303+ if (buf) {
31304+		/* if nothing was written, there must be an error */
31305+ assert("edward-195", ergo((to_write == count),
31306+ (result < 0 || *conv_occured)));
31307+ return (to_write - count) ? (to_write - count) : result;
31308+ }
31309+ return result;
31310+}
31311+
31312+/**
31313+ * plugin->write()
31314+ * @file: file to write to
31315+ * @buf: address of user-space buffer
31316+ * @count: number of bytes to write
31317+ * @off: position in file to write to
31318+ */
31319+ssize_t write_cryptcompress(struct file *file, const char __user *buf,
31320+ size_t count, loff_t *off, int *conv)
31321+{
31322+ ssize_t result;
31323+ struct inode *inode;
31324+ reiser4_context *ctx;
31325+ loff_t pos = *off;
31326+ struct cryptcompress_info *info;
31327+
31328+ assert("edward-1449", *conv == 0);
31329+
31330+ inode = file->f_dentry->d_inode;
31331+ assert("edward-196", cryptcompress_inode_ok(inode));
31332+
31333+ info = cryptcompress_inode_data(inode);
31334+
31335+ ctx = reiser4_init_context(inode->i_sb);
31336+ if (IS_ERR(ctx))
31337+ return PTR_ERR(ctx);
31338+
31339+ mutex_lock(&inode->i_mutex);
31340+
31341+ result = generic_write_checks(file, &pos, &count, 0);
31342+ if (unlikely(result != 0))
31343+ goto out;
31344+ if (unlikely(count == 0))
31345+ goto out;
31346+ result = remove_suid(file->f_dentry);
31347+ if (unlikely(result != 0))
31348+ goto out;
31349+ /* remove_suid might create a transaction */
31350+ reiser4_txn_restart(ctx);
31351+
31352+ result = do_write_cryptcompress(file, inode, buf, count, pos, conv);
31353+
31354+ if (result < 0)
31355+ goto out;
31356+ /* update position in a file */
31357+ *off = pos + result;
31358+ out:
31359+ mutex_unlock(&inode->i_mutex);
31360+
31361+ context_set_commit_async(ctx);
31362+ reiser4_exit_context(ctx);
31363+ return result;
31364+}
31365+
31366+/* plugin->readpages */
31367+int readpages_cryptcompress(struct file *file, struct address_space *mapping,
31368+ struct list_head *pages, unsigned nr_pages)
31369+{
31370+ reiser4_context * ctx;
31371+ int ret;
31372+
31373+ ctx = reiser4_init_context(mapping->host->i_sb);
31374+ if (IS_ERR(ctx)) {
31375+ ret = PTR_ERR(ctx);
31376+ goto err;
31377+ }
31378+ /* cryptcompress file can be built of ctail items only */
31379+ ret = readpages_ctail(file, mapping, pages);
31380+ reiser4_txn_restart(ctx);
31381+ reiser4_exit_context(ctx);
31382+ if (ret) {
31383+err:
31384+ put_pages_list(pages);
31385+ }
31386+ return ret;
31387+}
31388+
31389+static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
31390+{
31391+ /* reserve one block to update stat data item */
31392+ assert("edward-1193",
31393+ inode_file_plugin(inode)->estimate.update ==
31394+ estimate_update_common);
31395+ return estimate_update_common(inode);
31396+}
31397+
31398+/**
31399+ * plugin->read()
31400+ * @file: file to read from
31401+ * @buf: address of user-space buffer
31402+ * @size: number of bytes to read
31403+ * @off: position in file to read from
31404+ */
31405+ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
31406+ loff_t * off)
31407+{
31408+ ssize_t result;
31409+ struct inode *inode;
31410+ reiser4_context *ctx;
31411+ struct cryptcompress_info *info;
31412+ reiser4_block_nr needed;
31413+
31414+ inode = file->f_dentry->d_inode;
31415+ assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
31416+
31417+ ctx = reiser4_init_context(inode->i_sb);
31418+ if (IS_ERR(ctx))
31419+ return PTR_ERR(ctx);
31420+
31421+ info = cryptcompress_inode_data(inode);
31422+ needed = cryptcompress_estimate_read(inode);
31423+
31424+ result = reiser4_grab_space(needed, BA_CAN_COMMIT);
31425+ if (result != 0) {
31426+ reiser4_exit_context(ctx);
31427+ return result;
31428+ }
31429+ result = do_sync_read(file, buf, size, off);
31430+
31431+ context_set_commit_async(ctx);
31432+ reiser4_exit_context(ctx);
31433+
31434+ return result;
31435+}
31436+
31437+/* Look for a disk cluster and keep lookup result in @found.
31438+ * If @index > 0, then find the disk cluster of index (@index - 1);
31439+ * if @index == 0, then find the rightmost disk cluster.
31440+ * Keep the incremented index of the found disk cluster in @found.
31441+ * @found == 0 means that a disk cluster was not found (in the latter
31442+ * case (@index == 0) it means that the file has no disk clusters).
31443+ */
31444+static int lookup_disk_cluster(struct inode *inode, cloff_t * found,
31445+ cloff_t index)
31446+{
31447+ int result;
31448+ reiser4_key key;
31449+ loff_t offset;
31450+ hint_t *hint;
31451+ lock_handle *lh;
31452+ lookup_bias bias;
31453+ coord_t *coord;
31454+ item_plugin *iplug;
31455+
31456+ assert("edward-1131", inode != NULL);
31457+ assert("edward-95", cryptcompress_inode_ok(inode));
31458+
31459+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31460+ if (hint == NULL)
31461+ return RETERR(-ENOMEM);
31462+ hint_init_zero(hint);
31463+ lh = &hint->lh;
31464+
31465+ bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
31466+ offset =
31467+ (index ? clust_to_off(index, inode) -
31468+ 1 : get_key_offset(reiser4_max_key()));
31469+
31470+ key_by_inode_cryptcompress(inode, offset, &key);
31471+
31472+ /* find the last item of this object */
31473+ result =
31474+ find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
31475+ bias, 0);
31476+ if (cbk_errored(result)) {
31477+ done_lh(lh);
31478+ kfree(hint);
31479+ return result;
31480+ }
31481+ if (result == CBK_COORD_NOTFOUND) {
31482+ /* no real disk clusters */
31483+ done_lh(lh);
31484+ kfree(hint);
31485+ *found = 0;
31486+ return 0;
31487+ }
31488+ /* disk cluster is found */
31489+ coord = &hint->ext_coord.coord;
31490+ coord_clear_iplug(coord);
31491+ result = zload(coord->node);
31492+ if (unlikely(result)) {
31493+ done_lh(lh);
31494+ kfree(hint);
31495+ return result;
31496+ }
31497+ iplug = item_plugin_by_coord(coord);
31498+ assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
31499+ assert("edward-1202", ctail_ok(coord));
31500+
31501+ item_key_by_coord(coord, &key);
31502+ *found = off_to_clust(get_key_offset(&key), inode) + 1;
31503+
31504+ assert("edward-1132", ergo(index, index == *found));
31505+
31506+ zrelse(coord->node);
31507+ done_lh(lh);
31508+ kfree(hint);
31509+ return 0;
31510+}
31511+
31512+static int find_fake_appended(struct inode *inode, cloff_t * index)
31513+{
31514+ return lookup_disk_cluster(inode, index,
31515+ 0 /* find last real one */ );
31516+}
31517+
31518+/* Set the left coord when a unit is not found after node_lookup().
31519+   This takes into account that there can be holes in a sequence
31520+ of disk clusters */
31521+
31522+static void adjust_left_coord(coord_t * left_coord)
31523+{
31524+ switch (left_coord->between) {
31525+ case AFTER_UNIT:
31526+ left_coord->between = AFTER_ITEM;
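+		/* fall through */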
31527+ case AFTER_ITEM:
31528+ case BEFORE_UNIT:
31529+ break;
31530+ default:
31531+ impossible("edward-1204", "bad left coord to cut");
31532+ }
31533+ return;
31534+}
31535+
31536+#define CRC_CUT_TREE_MIN_ITERATIONS 64
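+/* After at least this many node cuts, the worker below checks whether the
+ * current atom wants to commit and, if so, returns -E_REPEAT so that the
+ * caller (see cut_file_items()) can restart the transaction. */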
31537+
31538+/* plugin->cut_tree_worker */
31539+int cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
31540+ const reiser4_key * to_key,
31541+ reiser4_key * smallest_removed,
31542+ struct inode *object, int truncate,
31543+ int *progress)
31544+{
31545+ lock_handle next_node_lock;
31546+ coord_t left_coord;
31547+ int result;
31548+
31549+ assert("edward-1158", tap->coord->node != NULL);
31550+ assert("edward-1159", znode_is_write_locked(tap->coord->node));
31551+ assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
31552+
31553+ *progress = 0;
31554+ init_lh(&next_node_lock);
31555+
31556+ while (1) {
31557+ znode *node; /* node from which items are cut */
31558+ node_plugin *nplug; /* node plugin for @node */
31559+
31560+ node = tap->coord->node;
31561+
31562+ /* Move next_node_lock to the next node on the left. */
31563+ result =
31564+ reiser4_get_left_neighbor(&next_node_lock, node,
31565+ ZNODE_WRITE_LOCK,
31566+ GN_CAN_USE_UPPER_LEVELS);
31567+ if (result != 0 && result != -E_NO_NEIGHBOR)
31568+ break;
31569+		/* FIXME-EDWARD: Check whether we can delete the node as a whole. */
31570+ result = reiser4_tap_load(tap);
31571+ if (result)
31572+ return result;
31573+
31574+ /* Prepare the second (right) point for cut_node() */
31575+ if (*progress)
31576+ coord_init_last_unit(tap->coord, node);
31577+
31578+ else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
31579+ /* set rightmost unit for the items without lookup method */
31580+ tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
31581+
31582+ nplug = node->nplug;
31583+
31584+ assert("edward-1161", nplug);
31585+ assert("edward-1162", nplug->lookup);
31586+
31587+ /* left_coord is leftmost unit cut from @node */
31588+ result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
31589+
31590+ if (IS_CBKERR(result))
31591+ break;
31592+
31593+ if (result == CBK_COORD_NOTFOUND)
31594+ adjust_left_coord(&left_coord);
31595+
31596+ /* adjust coordinates so that they are set to existing units */
31597+ if (coord_set_to_right(&left_coord)
31598+ || coord_set_to_left(tap->coord)) {
31599+ result = 0;
31600+ break;
31601+ }
31602+
31603+ if (coord_compare(&left_coord, tap->coord) ==
31604+ COORD_CMP_ON_RIGHT) {
31605+ /* keys from @from_key to @to_key are not in the tree */
31606+ result = 0;
31607+ break;
31608+ }
31609+
31610+ /* cut data from one node */
31611+ *smallest_removed = *reiser4_min_key();
31612+ result = kill_node_content(&left_coord,
31613+ tap->coord,
31614+ from_key,
31615+ to_key,
31616+ smallest_removed,
31617+ next_node_lock.node,
31618+ object, truncate);
31619+ reiser4_tap_relse(tap);
31620+
31621+ if (result)
31622+ break;
31623+
31624+ ++(*progress);
31625+
31626+ /* Check whether all items with keys >= from_key were removed
31627+ * from the tree. */
31628+ if (keyle(smallest_removed, from_key))
31629+ /* result = 0; */
31630+ break;
31631+
31632+ if (next_node_lock.node == NULL)
31633+ break;
31634+
31635+ result = reiser4_tap_move(tap, &next_node_lock);
31636+ done_lh(&next_node_lock);
31637+ if (result)
31638+ break;
31639+
31640+ /* Break long cut_tree operation (deletion of a large file) if
31641+ * atom requires commit. */
31642+ if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
31643+ && current_atom_should_commit()) {
31644+ result = -E_REPEAT;
31645+ break;
31646+ }
31647+ }
31648+ done_lh(&next_node_lock);
31649+ return result;
31650+}
31651+
31652+/* Append or expand a hole in two steps:
31653+ * 1) write zeroes to the rightmost page of the rightmost non-fake
31654+ *    logical cluster;
31655+ * 2) expand the hole via fake logical clusters (just increase i_size)
31656+ */
31657+static int cryptcompress_append_hole(struct inode *inode /* with old size */,
31658+ loff_t new_size)
31659+{
31660+ int result = 0;
31661+ hint_t *hint;
31662+ lock_handle *lh;
31663+ loff_t hole_size;
31664+ int nr_zeroes;
31665+ struct reiser4_slide win;
31666+ struct cluster_handle clust;
31667+
31668+ assert("edward-1133", inode->i_size < new_size);
31669+ assert("edward-1134", reiser4_schedulable());
31670+ assert("edward-1135", cryptcompress_inode_ok(inode));
31671+ assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
31672+ assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
31673+
31674+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31675+ if (hint == NULL)
31676+ return RETERR(-ENOMEM);
31677+ hint_init_zero(hint);
31678+ lh = &hint->lh;
31679+
31680+ reiser4_slide_init(&win);
31681+ cluster_init_read(&clust, &win);
31682+ clust.hint = hint;
31683+
31684+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
31685+ if (result)
31686+ goto out;
31687+ if (off_to_cloff(inode->i_size, inode) == 0)
31688+ goto append_fake;
31689+ hole_size = new_size - inode->i_size;
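+	/* zero out at most up to the end of the logical cluster that
+	 * holds the old EOF; e.g. with a 64K cluster and i_size 1000
+	 * bytes into it, at most 65536 - 1000 = 64536 bytes are zeroed */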
31690+ nr_zeroes =
31691+ inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
31692+ if (hole_size < nr_zeroes)
31693+ nr_zeroes = hole_size;
31694+ set_window(&clust, &win, inode, inode->i_size,
31695+ inode->i_size + nr_zeroes);
31696+ win.stat = HOLE_WINDOW;
31697+
31698+ assert("edward-1137",
31699+ clust.index == off_to_clust(inode->i_size, inode));
31700+
31701+ result = prepare_logical_cluster(inode, 0, 0, &clust, LC_APPOV);
31702+
31703+ assert("edward-1271", !result || result == -ENOSPC);
31704+ if (result)
31705+ goto out;
31706+ assert("edward-1139",
31707+ clust.dstat == PREP_DISK_CLUSTER ||
31708+ clust.dstat == UNPR_DISK_CLUSTER);
31709+
31710+ assert("edward-1431", hole_size >= nr_zeroes);
31711+ if (hole_size == nr_zeroes)
31712+ /* nothing to append anymore */
31713+ goto out;
31714+ append_fake:
31715+ INODE_SET_SIZE(inode, new_size);
31716+ out:
31717+ done_lh(lh);
31718+ kfree(hint);
31719+ put_cluster_handle(&clust);
31720+ return result;
31721+}
31722+
31723+static int
31724+update_cryptcompress_size(struct inode *inode, reiser4_key * key, int update_sd)
31725+{
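+	/* update the file size only when the removed key offset is
+	 * aligned to a logical cluster boundary; a partially truncated
+	 * cluster is handled separately by prune_cryptcompress() */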
31726+ return (get_key_offset(key) & ((loff_t) (inode_cluster_size(inode)) - 1)
31727+ ? 0 : reiser4_update_file_size(inode, key, update_sd));
31728+}
31729+
31730+/* Prune a cryptcompress file in two steps:
31731+ * 1) cut all nominated logical clusters except the leftmost one, which
31732+ *    is to be partially truncated. Note that there can be "holes"
31733+ *    represented by fake logical clusters.
31734+ * 2) write zeroes to, and capture, the leftmost partially truncated
31735+ *    logical cluster, if it is not fake; otherwise prune the fake
31736+ *    logical cluster (just decrease i_size).
31737+ */
31738+static int prune_cryptcompress(struct inode *inode, loff_t new_size,
31739+ int update_sd, cloff_t aidx)
31740+{
31741+ int result = 0;
31742+ unsigned nr_zeroes;
31743+ loff_t to_prune;
31744+ loff_t old_size;
31745+ cloff_t ridx;
31746+
31747+ hint_t *hint;
31748+ lock_handle *lh;
31749+ struct reiser4_slide win;
31750+ struct cluster_handle clust;
31751+
31752+ assert("edward-1140", inode->i_size >= new_size);
31753+ assert("edward-1141", reiser4_schedulable());
31754+ assert("edward-1142", cryptcompress_inode_ok(inode));
31755+ assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
31756+
31757+ old_size = inode->i_size;
31758+
31759+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31760+ if (hint == NULL)
31761+ return RETERR(-ENOMEM);
31762+ hint_init_zero(hint);
31763+ lh = &hint->lh;
31764+
31765+ reiser4_slide_init(&win);
31766+ cluster_init_read(&clust, &win);
31767+ clust.hint = hint;
31768+
31769+ /* calculate index of the rightmost logical cluster
31770+ that will be completely truncated */
31771+ ridx = size_in_lc(new_size, inode);
31772+
31773+ /* truncate all disk clusters starting from @ridx */
31774+ assert("edward-1174", ridx <= aidx);
31776+ if (ridx != aidx) {
31777+ struct cryptcompress_info * info;
31778+ info = cryptcompress_inode_data(inode);
31779+ result = cut_file_items(inode,
31780+ clust_to_off(ridx, inode),
31781+ update_sd,
31782+ clust_to_off(aidx, inode),
31783+ update_cryptcompress_size);
31784+ info->trunc_index = ULONG_MAX;
31785+ if (result)
31786+ goto out;
31787+ }
31788+ /*
31789+ * there can be pages of fake logical clusters, truncate them
31790+ */
31791+ truncate_inode_pages(inode->i_mapping, clust_to_off(ridx, inode));
31792+ assert("edward-1524",
31793+ pages_truncate_ok(inode, clust_to_pg(ridx, inode)));
31794+ /*
31795+ * now perform partial truncate of last logical cluster
31796+ */
31797+ if (!off_to_cloff(new_size, inode)) {
31798+ /* no partial truncate is needed */
31799+ assert("edward-1145", inode->i_size == new_size);
31800+ goto truncate_fake;
31801+ }
31802+ assert("edward-1146", new_size < inode->i_size);
31803+
31804+ to_prune = inode->i_size - new_size;
31805+
31806+ /* check if the last logical cluster is fake */
31807+ result = lookup_disk_cluster(inode, &aidx, ridx);
31808+ if (result)
31809+ goto out;
31810+ if (!aidx)
31811+		/* yup, this is a fake one */
31812+ goto truncate_fake;
31813+
31814+ assert("edward-1148", aidx == ridx);
31815+
31816+ /* do partial truncate of the last page cluster,
31817+ and try to capture this one */
31818+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
31819+ if (result)
31820+ goto out;
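+	/* zero the tail of the partially truncated page; e.g. truncating
+	 * to offset 100 within a 4K page leaves 4096 - 100 = 3996 bytes
+	 * to be zeroed */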
31821+ nr_zeroes = (off_to_pgoff(new_size) ?
31822+ PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
31823+ set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
31824+ win.stat = HOLE_WINDOW;
31825+
31826+ assert("edward-1149", clust.index == ridx - 1);
31827+
31828+ result = prepare_logical_cluster(inode, 0, 0, &clust, LC_TRUNC);
31829+ if (result)
31830+ goto out;
31831+ assert("edward-1151",
31832+ clust.dstat == PREP_DISK_CLUSTER ||
31833+ clust.dstat == UNPR_DISK_CLUSTER);
31834+
31835+ assert("edward-1191", inode->i_size == new_size);
31836+ assert("edward-1206", body_truncate_ok(inode, ridx));
31837+ truncate_fake:
31838+	/* drop all the pages that don't have jnodes (i.e. pages
31839+	   which cannot be truncated by cut_file_items() because
31840+	   of holes represented by fake disk clusters), including
31841+	   the pages of the partially truncated cluster which was
31842+	   released by prepare_logical_cluster() */
31843+ INODE_SET_SIZE(inode, new_size);
31844+ truncate_inode_pages(inode->i_mapping, new_size);
31845+ out:
31846+ assert("edward-1334", !result || result == -ENOSPC);
31847+ assert("edward-1497",
31848+ pages_truncate_ok(inode, size_in_pages(new_size)));
31849+
31850+ done_lh(lh);
31851+ kfree(hint);
31852+ put_cluster_handle(&clust);
31853+ return result;
31854+}
31855+
31856+/* Prepare cryptcompress file for truncate:
31857+ * prune or append rightmost fake logical clusters (if any)
31858+ */
31859+static int start_truncate_fake(struct inode *inode, cloff_t aidx,
31860+ loff_t new_size, int update_sd)
31861+{
31862+ int result = 0;
31863+ int bytes;
31864+
31865+ if (new_size > inode->i_size) {
31866+ /* append */
31867+ if (inode->i_size < clust_to_off(aidx, inode))
31868+ /* no fake bytes */
31869+ return 0;
31870+ bytes = new_size - inode->i_size;
31871+ INODE_SET_SIZE(inode, inode->i_size + bytes);
31872+ } else {
31873+ /* prune */
31874+ if (inode->i_size <= clust_to_off(aidx, inode))
31875+ /* no fake bytes */
31876+ return 0;
31877+ bytes = inode->i_size -
31878+ max(new_size, clust_to_off(aidx, inode));
31879+ if (!bytes)
31880+ return 0;
31881+ INODE_SET_SIZE(inode, inode->i_size - bytes);
31882+		/* In the case of a fake prune we need to drop the page
31883+		   cluster. There are only 2 cases for a partially
31884+		   truncated page:
31885+		   1. If it is dirty, then it is anonymous (it was dirtied
31886+		      via mmap) and will be captured later via ->capture().
31887+		   2. If it is clean, then it is filled with zeroes.
31888+		   In both cases we don't need to dirty and capture it
31889+		   here.
31890+		 */
31891+ truncate_inode_pages(inode->i_mapping, inode->i_size);
31892+ }
31893+ if (update_sd)
31894+ result = update_sd_cryptcompress(inode);
31895+ return result;
31896+}
31897+
31898+/**
31899+ * This is called in setattr_cryptcompress when it is used to truncate,
31900+ * and in delete_object_cryptcompress
31901+ */
31902+static int cryptcompress_truncate(struct inode *inode, /* old size */
31903+ loff_t new_size, /* new size */
31904+ int update_sd)
31905+{
31906+ int result;
31907+ cloff_t aidx;
31908+
31909+ result = find_fake_appended(inode, &aidx);
31910+ if (result)
31911+ return result;
31912+ assert("edward-1208",
31913+ ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
31914+
31915+ result = start_truncate_fake(inode, aidx, new_size, update_sd);
31916+ if (result)
31917+ return result;
31918+ if (inode->i_size == new_size)
31919+ /* nothing to truncate anymore */
31920+ return 0;
31921+ result = (inode->i_size < new_size ?
31922+ cryptcompress_append_hole(inode, new_size) :
31923+ prune_cryptcompress(inode, new_size, update_sd, aidx));
31924+ if (!result && update_sd)
31925+ result = update_sd_cryptcompress(inode);
31926+ return result;
31927+}
31928+
31929+/* Capture an anonymous page cluster. (A page cluster is
31930+ * anonymous if it contains at least one anonymous page.)
31931+ */
31932+static int capture_anon_page_cluster(struct cluster_handle * clust,
31933+ struct inode * inode)
31934+{
31935+ int result;
31936+
31937+ assert("edward-1073", clust != NULL);
31938+ assert("edward-1074", inode != NULL);
31939+ assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
31940+
31941+ result = prepare_logical_cluster(inode, 0, 0, clust, LC_APPOV);
31942+ if (result)
31943+ return result;
31944+ set_cluster_pages_dirty(clust, inode);
31945+ result = checkin_logical_cluster(clust, inode);
31946+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
31947+ if (unlikely(result))
31948+ put_page_cluster(clust, inode, WRITE_OP);
31949+ return result;
31950+}
31951+
31952+/* Starting from @index find tagged pages of the same page cluster.
31953+ * Clear the tag for each of them. Return number of found pages.
31954+ */
31955+static int find_anon_page_cluster(struct address_space * mapping,
31956+ pgoff_t * index, struct page ** pages)
31957+{
31958+ int i = 0;
31959+ int found;
31960+ write_lock_irq(&mapping->tree_lock);
31961+ do {
31962+ /* looking for one page */
31963+ found = radix_tree_gang_lookup_tag(&mapping->page_tree,
31964+ (void **)&pages[i],
31965+ *index, 1,
31966+ PAGECACHE_TAG_REISER4_MOVED);
31967+ if (!found)
31968+ break;
31969+ if (!same_page_cluster(pages[0], pages[i]))
31970+ break;
31971+
31972+ /* found */
31973+ page_cache_get(pages[i]);
31974+ *index = pages[i]->index + 1;
31975+
31976+ radix_tree_tag_clear(&mapping->page_tree,
31977+ pages[i]->index,
31978+ PAGECACHE_TAG_REISER4_MOVED);
31979+ if (last_page_in_cluster(pages[i++]))
31980+ break;
31981+ } while (1);
31982+ write_unlock_irq(&mapping->tree_lock);
31983+ return i;
31984+}
31985+
31986+#define MAX_PAGES_TO_CAPTURE (1024)
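+/* Cap on the number of anonymous pages captured per capture_anon_pages()
+ * call (see writepages_cryptcompress() below). */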
31987+
31988+/* Capture anonymous page clusters */
31989+static int capture_anon_pages(struct address_space * mapping, pgoff_t * index,
31990+ int to_capture)
31991+{
31992+ int count = 0;
31993+ int found = 0;
31994+ int result = 0;
31995+ hint_t *hint;
31996+ lock_handle *lh;
31997+ struct inode * inode;
31998+ struct cluster_handle clust;
31999+ struct page * pages[MAX_CLUSTER_NRPAGES];
32000+
32001+ assert("edward-1127", mapping != NULL);
32002+ assert("edward-1128", mapping->host != NULL);
32003+ assert("edward-1440", mapping->host->i_mapping == mapping);
32004+
32005+ inode = mapping->host;
32006+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
32007+ if (hint == NULL)
32008+ return RETERR(-ENOMEM);
32009+ hint_init_zero(hint);
32010+ lh = &hint->lh;
32011+
32012+ cluster_init_read(&clust, NULL);
32013+ clust.hint = hint;
32014+
32015+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
32016+ if (result)
32017+ goto out;
32018+
32019+ while (to_capture > 0) {
32020+ found = find_anon_page_cluster(mapping, index, pages);
32021+ if (!found) {
32022+			*index = (pgoff_t)-1;
32023+ break;
32024+ }
32025+ move_cluster_forward(&clust, inode, pages[0]->index);
32026+ result = capture_anon_page_cluster(&clust, inode);
32027+
32028+ put_found_pages(pages, found); /* find_anon_page_cluster */
32029+ if (result)
32030+ break;
32031+ to_capture -= clust.nr_pages;
32032+ count += clust.nr_pages;
32033+ }
32034+ if (result) {
32035+ warning("edward-1077",
32036+ "Capture failed (inode %llu, result=%i, captured=%d)\n",
32037+ (unsigned long long)get_inode_oid(inode), result, count);
32038+ } else {
32039+ assert("edward-1078", ergo(found > 0, count > 0));
32040+ if (to_capture <= 0)
32041+			/* there may be more pages left */
32042+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
32043+ result = count;
32044+ }
32045+ out:
32046+ done_lh(lh);
32047+ kfree(hint);
32048+ put_cluster_handle(&clust);
32049+ return result;
32050+}
32051+
32052+/* Returns true if inode's mapping has dirty pages
32053+ which do not belong to any atom */
32054+static int cryptcompress_inode_has_anon_pages(struct inode *inode)
32055+{
32056+ int result;
32057+ read_lock_irq(&inode->i_mapping->tree_lock);
32058+ result = radix_tree_tagged(&inode->i_mapping->page_tree,
32059+ PAGECACHE_TAG_REISER4_MOVED);
32060+ read_unlock_irq(&inode->i_mapping->tree_lock);
32061+ return result;
32062+}
32063+
32064+/* plugin->writepages */
32065+int writepages_cryptcompress(struct address_space *mapping,
32066+ struct writeback_control *wbc)
32067+{
32068+ int result = 0;
32069+ long to_capture;
32070+ pgoff_t nrpages;
32071+ pgoff_t index = 0;
32072+ struct inode *inode;
32073+ struct cryptcompress_info *info;
32074+
32075+ inode = mapping->host;
32076+ if (!cryptcompress_inode_has_anon_pages(inode))
32077+ goto end;
32078+ info = cryptcompress_inode_data(inode);
32079+ nrpages = size_in_pages(i_size_read(inode));
32080+
32081+ if (wbc->sync_mode != WB_SYNC_ALL)
32082+ to_capture = min(wbc->nr_to_write, (long)MAX_PAGES_TO_CAPTURE);
32083+ else
32084+ to_capture = MAX_PAGES_TO_CAPTURE;
32085+ do {
32086+ reiser4_context *ctx;
32087+
32088+ ctx = reiser4_init_context(inode->i_sb);
32089+ if (IS_ERR(ctx)) {
32090+ result = PTR_ERR(ctx);
32091+ break;
32092+ }
32093+ /* avoid recursive calls to ->sync_inodes */
32094+ ctx->nobalance = 1;
32095+
32096+ assert("edward-1079",
32097+ lock_stack_isclean(get_current_lock_stack()));
32098+
32099+ reiser4_txn_restart_current();
32100+
32101+ if (get_current_context()->entd) {
32102+ if (mutex_trylock(&info->checkin_mutex) == 0) {
32103+ /* the mutex might be occupied by
32104+ entd caller */
32105+ result = RETERR(-EBUSY);
32106+ reiser4_exit_context(ctx);
32107+ break;
32108+ }
32109+ } else
32110+ mutex_lock(&info->checkin_mutex);
32111+
32112+ result = capture_anon_pages(inode->i_mapping, &index,
32113+ to_capture);
32114+ mutex_unlock(&info->checkin_mutex);
32115+
32116+ if (result < 0) {
32117+ reiser4_exit_context(ctx);
32118+ break;
32119+ }
32120+ wbc->nr_to_write -= result;
32121+ if (wbc->sync_mode != WB_SYNC_ALL) {
32122+ reiser4_exit_context(ctx);
32123+ break;
32124+ }
32125+ result = txnmgr_force_commit_all(inode->i_sb, 0);
32126+ reiser4_exit_context(ctx);
32127+ } while (result >= 0 && index < nrpages);
32128+
32129+ end:
32130+ if (is_in_reiser4_context()) {
32131+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
32132+ /* there are already pages to flush, flush them out,
32133+ do not delay until end of reiser4_sync_inodes */
32134+ reiser4_writeout(inode->i_sb, wbc);
32135+ get_current_context()->nr_captured = 0;
32136+ }
32137+ }
32138+ return result;
32139+}
32140+
32141+/* plugin->ioctl */
32142+int ioctl_cryptcompress(struct inode *inode, struct file *filp,
32143+ unsigned int cmd, unsigned long arg)
32144+{
32145+ return 0;
32146+}
32147+
32148+/* plugin->mmap */
32149+int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
32150+{
32151+ int result;
32152+ struct inode *inode;
32153+ reiser4_context *ctx;
32154+
32155+ inode = file->f_dentry->d_inode;
32156+ ctx = reiser4_init_context(inode->i_sb);
32157+ if (IS_ERR(ctx))
32158+ return PTR_ERR(ctx);
32159+ /*
32160+ * generic_file_mmap will do update_atime. Grab space for stat data
32161+ * update.
32162+ */
32163+ result = reiser4_grab_space_force
32164+ (inode_file_plugin(inode)->estimate.update(inode),
32165+ BA_CAN_COMMIT);
32166+ if (result) {
32167+ reiser4_exit_context(ctx);
32168+ return result;
32169+ }
32170+ result = generic_file_mmap(file, vma);
32171+ reiser4_exit_context(ctx);
32172+ return result;
32173+}
32174+
32175+/* plugin->delete_object */
32176+int delete_object_cryptcompress(struct inode *inode)
32177+{
32178+ int result;
32179+ struct cryptcompress_info * info;
32180+
32181+ assert("edward-429", inode->i_nlink == 0);
32182+
32183+ reiser4_txn_restart_current();
32184+ info = cryptcompress_inode_data(inode);
32185+
32186+ mutex_lock(&info->checkin_mutex);
32187+ result = cryptcompress_truncate(inode, 0, 0);
32188+ mutex_unlock(&info->checkin_mutex);
32189+
32190+ if (result) {
32191+ warning("edward-430",
32192+ "cannot truncate cryptcompress file %lli: %i",
32193+ (unsigned long long)get_inode_oid(inode),
32194+ result);
32195+ }
32196+ truncate_inode_pages(inode->i_mapping, 0);
32197+ assert("edward-1487", pages_truncate_ok(inode, 0));
32198+ /* and remove stat data */
32199+ return reiser4_delete_object_common(inode);
32200+}
32201+
32202+/*
32203+ * plugin->setattr
32204+ * This implements actual truncate (see comments in reiser4/page_cache.c)
32205+ */
32206+int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
32207+{
32208+ int result;
32209+ struct inode *inode;
32210+ struct cryptcompress_info * info;
32211+
32212+ inode = dentry->d_inode;
32213+ info = cryptcompress_inode_data(inode);
32214+
32215+ if (attr->ia_valid & ATTR_SIZE) {
32216+ if (i_size_read(inode) != attr->ia_size) {
32217+ reiser4_context *ctx;
32218+ loff_t old_size;
32219+
32220+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
32221+ if (IS_ERR(ctx))
32222+ return PTR_ERR(ctx);
32223+
32224+ old_size = i_size_read(inode);
32225+ inode_check_scale(inode, old_size, attr->ia_size);
32226+
32227+ mutex_lock(&info->checkin_mutex);
32228+ result = cryptcompress_truncate(inode,
32229+ attr->ia_size,
32230+ 1/* update sd */);
32231+ mutex_unlock(&info->checkin_mutex);
32232+ if (result) {
32233+ warning("edward-1192",
32234+ "truncate_cryptcompress failed: oid %lli, "
32235+ "old size %lld, new size %lld, retval %d",
32236+ (unsigned long long)
32237+ get_inode_oid(inode), old_size,
32238+ attr->ia_size, result);
32239+ }
32240+ context_set_commit_async(ctx);
32241+ reiser4_exit_context(ctx);
32242+ } else
32243+ result = 0;
32244+ } else
32245+ result = reiser4_setattr_common(dentry, attr);
32246+ return result;
32247+}
32248+
32249+/* plugin->release */
32250+int release_cryptcompress(struct inode *inode, struct file *file)
32251+{
32252+ reiser4_context *ctx = reiser4_init_context(inode->i_sb);
32253+
32254+ if (IS_ERR(ctx))
32255+ return PTR_ERR(ctx);
32256+ reiser4_free_file_fsdata(file);
32257+ reiser4_exit_context(ctx);
32258+ return 0;
32259+}
32260+
32261+/* plugin->prepare_write */
32262+int prepare_write_cryptcompress(struct file *file, struct page *page,
32263+ unsigned from, unsigned to)
32264+{
32265+ return -EINVAL;
32266+}
32267+
32268+/* plugin->commit_write */
32269+int commit_write_cryptcompress(struct file *file, struct page *page,
32270+ unsigned from, unsigned to)
32271+{
32272+ BUG();
32273+ return 0;
32274+}
32275+
32276+/* plugin->bmap */
32277+sector_t bmap_cryptcompress(struct address_space *mapping, sector_t lblock)
32278+{
32279+ return -EINVAL;
32280+}
32281+
32282+/*
32283+ Local variables:
32284+ c-indentation-style: "K&R"
32285+ mode-name: "LC"
32286+ c-basic-offset: 8
32287+ tab-width: 8
32288+ fill-column: 80
32289+ scroll-step: 1
32290+ End:
32291+*/
32292diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/cryptcompress.h linux-2.6.23/fs/reiser4/plugin/file/cryptcompress.h
32293--- linux-2.6.23.orig/fs/reiser4/plugin/file/cryptcompress.h 1970-01-01 03:00:00.000000000 +0300
32294+++ linux-2.6.23/fs/reiser4/plugin/file/cryptcompress.h 2007-12-04 16:49:30.000000000 +0300
32295@@ -0,0 +1,604 @@
32296+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
32297+/* See http://www.namesys.com/cryptcompress_design.html */
32298+
32299+#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
32300+#define __FS_REISER4_CRYPTCOMPRESS_H__
32301+
32302+#include "../../page_cache.h"
32303+#include "../compress/compress.h"
32304+#include "../crypto/cipher.h"
32305+
32306+#include <linux/pagemap.h>
32307+
32308+#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT
32309+#define MAX_CLUSTER_SHIFT 16
32310+#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT)
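+/* e.g. with 4K pages (PAGE_CACHE_SHIFT == 12): 1U << 16 >> 12 == 16 pages */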
32311+#define DC_CHECKSUM_SIZE 4
32312+
32313+#define MIN_LATTICE_FACTOR 1
32314+#define MAX_LATTICE_FACTOR 32
32315+
32316+/* this mask contains all non-standard plugins that might
32317+   be present in the reiser4-specific part of an inode
32318+   managed by the cryptcompress file plugin */
32319+#define cryptcompress_mask \
32320+ ((1 << PSET_FILE) | \
32321+ (1 << PSET_CLUSTER) | \
32322+ (1 << PSET_CIPHER) | \
32323+ (1 << PSET_DIGEST) | \
32324+ (1 << PSET_COMPRESSION) | \
32325+ (1 << PSET_COMPRESSION_MODE))
32326+
32327+#if REISER4_DEBUG
32328+static inline int cluster_shift_ok(int shift)
32329+{
32330+ return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
32331+}
32332+#endif
32333+
32334+#if REISER4_DEBUG
32335+#define INODE_PGCOUNT(inode) \
32336+({ \
32337+ assert("edward-1530", inode_file_plugin(inode) == \
32338+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \
32339+ atomic_read(&cryptcompress_inode_data(inode)->pgcount); \
32340+ })
32341+#define INODE_PGCOUNT_INC(inode) \
32342+do { \
32343+ assert("edward-1531", inode_file_plugin(inode) == \
32344+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \
32345+ atomic_inc(&cryptcompress_inode_data(inode)->pgcount); \
32346+} while (0)
32347+#define INODE_PGCOUNT_DEC(inode) \
32348+do { \
32349+ if (inode_file_plugin(inode) == \
32350+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) \
32351+ atomic_dec(&cryptcompress_inode_data(inode)->pgcount); \
32352+} while (0)
32353+#else
32354+#define INODE_PGCOUNT(inode) (0)
32355+#define INODE_PGCOUNT_INC(inode)
32356+#define INODE_PGCOUNT_DEC(inode)
32357+#endif /* REISER4_DEBUG */
32358+
32359+struct tfm_stream {
32360+ __u8 *data;
32361+ size_t size;
32362+};
32363+
32364+typedef enum {
32365+ INPUT_STREAM,
32366+ OUTPUT_STREAM,
32367+ LAST_STREAM
32368+} tfm_stream_id;
32369+
32370+typedef struct tfm_stream * tfm_unit[LAST_STREAM];
32371+
32372+static inline __u8 *ts_data(struct tfm_stream * stm)
32373+{
32374+ assert("edward-928", stm != NULL);
32375+ return stm->data;
32376+}
32377+
32378+static inline size_t ts_size(struct tfm_stream * stm)
32379+{
32380+ assert("edward-929", stm != NULL);
32381+ return stm->size;
32382+}
32383+
32384+static inline void set_ts_size(struct tfm_stream * stm, size_t size)
32385+{
32386+ assert("edward-930", stm != NULL);
32387+
32388+ stm->size = size;
32389+}
32390+
32391+static inline int alloc_ts(struct tfm_stream ** stm)
32392+{
32393+ assert("edward-931", stm);
32394+ assert("edward-932", *stm == NULL);
32395+
32396+ *stm = kzalloc(sizeof(**stm), reiser4_ctx_gfp_mask_get());
32397+ if (!*stm)
32398+ return -ENOMEM;
32399+ return 0;
32400+}
32401+
32402+static inline void free_ts(struct tfm_stream * stm)
32403+{
32404+ assert("edward-933", !ts_data(stm));
32405+ assert("edward-934", !ts_size(stm));
32406+
32407+ kfree(stm);
32408+}
32409+
32410+static inline int alloc_ts_data(struct tfm_stream * stm, size_t size)
32411+{
32412+ assert("edward-935", !ts_data(stm));
32413+ assert("edward-936", !ts_size(stm));
32414+ assert("edward-937", size != 0);
32415+
32416+ stm->data = reiser4_vmalloc(size);
32417+ if (!stm->data)
32418+ return -ENOMEM;
32419+ set_ts_size(stm, size);
32420+ return 0;
32421+}
32422+
32423+static inline void free_ts_data(struct tfm_stream * stm)
32424+{
32425+ assert("edward-938", equi(ts_data(stm), ts_size(stm)));
32426+
32427+ if (ts_data(stm))
32428+ vfree(ts_data(stm));
32429+ memset(stm, 0, sizeof *stm);
32430+}
32431+
32432+/* Write modes for item conversion in flush convert phase */
32433+typedef enum {
32434+ CRC_APPEND_ITEM = 1,
32435+ CRC_OVERWRITE_ITEM = 2,
32436+ CRC_CUT_ITEM = 3
32437+} cryptcompress_write_mode_t;
32438+
32439+typedef enum {
32440+ LC_INVAL = 0, /* invalid value */
32441+ LC_APPOV = 1, /* append and/or overwrite */
32442+ LC_TRUNC = 2 /* truncate */
32443+} logical_cluster_op;
32444+
32445+/* Transform cluster.
32446+ * An intermediate state between a page cluster and a disk cluster.
32447+ * It is used for data transforms (compression/encryption).
32448+ */
32449+struct tfm_cluster {
32450+ coa_set coa; /* compression algorithms info */
32451+ tfm_unit tun; /* plain and transformed streams */
32452+ tfm_action act;
32453+ int uptodate;
32454+ int lsize; /* number of bytes in logical cluster */
32455+ int len; /* length of the transform stream */
32456+};
32457+
32458+static inline coa_t get_coa(struct tfm_cluster * tc, reiser4_compression_id id,
32459+ tfm_action act)
32460+{
32461+ return tc->coa[id][act];
32462+}
32463+
32464+static inline void set_coa(struct tfm_cluster * tc, reiser4_compression_id id,
32465+ tfm_action act, coa_t coa)
32466+{
32467+ tc->coa[id][act] = coa;
32468+}
32469+
32470+static inline int alloc_coa(struct tfm_cluster * tc, compression_plugin * cplug)
32471+{
32472+ coa_t coa;
32473+
32474+ coa = cplug->alloc(tc->act);
32475+ if (IS_ERR(coa))
32476+ return PTR_ERR(coa);
32477+ set_coa(tc, cplug->h.id, tc->act, coa);
32478+ return 0;
32479+}
32480+
32481+static inline int
32482+grab_coa(struct tfm_cluster * tc, compression_plugin * cplug)
32483+{
32484+ return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
32485+ alloc_coa(tc, cplug) : 0);
32486+}
32487+
32488+static inline void free_coa_set(struct tfm_cluster * tc)
32489+{
32490+ tfm_action j;
32491+ reiser4_compression_id i;
32492+ compression_plugin *cplug;
32493+
32494+ assert("edward-810", tc != NULL);
32495+
32496+ for (j = 0; j < TFMA_LAST; j++)
32497+ for (i = 0; i < LAST_COMPRESSION_ID; i++) {
32498+ if (!get_coa(tc, i, j))
32499+ continue;
32500+ cplug = compression_plugin_by_id(i);
32501+ assert("edward-812", cplug->free != NULL);
32502+ cplug->free(get_coa(tc, i, j), j);
32503+ set_coa(tc, i, j, 0);
32504+ }
32505+ return;
32506+}
32507+
32508+static inline struct tfm_stream * get_tfm_stream(struct tfm_cluster * tc,
32509+ tfm_stream_id id)
32510+{
32511+ return tc->tun[id];
32512+}
32513+
32514+static inline void set_tfm_stream(struct tfm_cluster * tc,
32515+ tfm_stream_id id, struct tfm_stream * ts)
32516+{
32517+ tc->tun[id] = ts;
32518+}
32519+
32520+static inline __u8 *tfm_stream_data(struct tfm_cluster * tc, tfm_stream_id id)
32521+{
32522+ return ts_data(get_tfm_stream(tc, id));
32523+}
32524+
32525+static inline void set_tfm_stream_data(struct tfm_cluster * tc,
32526+ tfm_stream_id id, __u8 * data)
32527+{
32528+ get_tfm_stream(tc, id)->data = data;
32529+}
32530+
32531+static inline size_t tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id)
32532+{
32533+ return ts_size(get_tfm_stream(tc, id));
32534+}
32535+
32536+static inline void
32537+set_tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id, size_t size)
32538+{
32539+ get_tfm_stream(tc, id)->size = size;
32540+}
32541+
32542+static inline int
32543+alloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
32544+{
32545+ assert("edward-939", tc != NULL);
32546+ assert("edward-940", !get_tfm_stream(tc, id));
32547+
32548+ tc->tun[id] = kzalloc(sizeof(struct tfm_stream),
32549+ reiser4_ctx_gfp_mask_get());
32550+ if (!tc->tun[id])
32551+ return -ENOMEM;
32552+ return alloc_ts_data(get_tfm_stream(tc, id), size);
32553+}
32554+
32555+static inline int
32556+realloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
32557+{
32558+ assert("edward-941", tfm_stream_size(tc, id) < size);
32559+ free_ts_data(get_tfm_stream(tc, id));
32560+ return alloc_ts_data(get_tfm_stream(tc, id), size);
32561+}
32562+
32563+static inline void free_tfm_stream(struct tfm_cluster * tc, tfm_stream_id id)
32564+{
32565+ free_ts_data(get_tfm_stream(tc, id));
32566+ free_ts(get_tfm_stream(tc, id));
32567+ set_tfm_stream(tc, id, 0);
32568+}
32569+
32570+static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
32571+{
32572+ return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
32573+}
32574+
32575+static inline void free_tfm_unit(struct tfm_cluster * tc)
32576+{
32577+ tfm_stream_id id;
32578+ for (id = 0; id < LAST_STREAM; id++) {
32579+ if (!get_tfm_stream(tc, id))
32580+ continue;
32581+ free_tfm_stream(tc, id);
32582+ }
32583+}
32584+
32585+static inline void put_tfm_cluster(struct tfm_cluster * tc)
32586+{
32587+ assert("edward-942", tc != NULL);
32588+ free_coa_set(tc);
32589+ free_tfm_unit(tc);
32590+}
32591+
32592+static inline int tfm_cluster_is_uptodate(struct tfm_cluster * tc)
32593+{
32594+ assert("edward-943", tc != NULL);
32595+ assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
32596+ return (tc->uptodate == 1);
32597+}
32598+
32599+static inline void tfm_cluster_set_uptodate(struct tfm_cluster * tc)
32600+{
32601+ assert("edward-945", tc != NULL);
32602+ assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
32603+ tc->uptodate = 1;
32604+ return;
32605+}
32606+
32607+static inline void tfm_cluster_clr_uptodate(struct tfm_cluster * tc)
32608+{
32609+ assert("edward-947", tc != NULL);
32610+ assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
32611+ tc->uptodate = 0;
32612+ return;
32613+}
32614+
32615+static inline int tfm_stream_is_set(struct tfm_cluster * tc, tfm_stream_id id)
32616+{
32617+ return (get_tfm_stream(tc, id) &&
32618+ tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
32619+}
32620+
32621+static inline int tfm_cluster_is_set(struct tfm_cluster * tc)
32622+{
32623+ int i;
32624+ for (i = 0; i < LAST_STREAM; i++)
32625+ if (!tfm_stream_is_set(tc, i))
32626+ return 0;
32627+ return 1;
32628+}
32629+
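+/* Swapping the input and output streams lets a second transform stage
+ * (e.g. encryption after compression) consume the previous stage's
+ * output in place, without allocating another pair of buffers. */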
32630+static inline void alternate_streams(struct tfm_cluster * tc)
32631+{
32632+ struct tfm_stream *tmp = get_tfm_stream(tc, INPUT_STREAM);
32633+
32634+ set_tfm_stream(tc, INPUT_STREAM, get_tfm_stream(tc, OUTPUT_STREAM));
32635+ set_tfm_stream(tc, OUTPUT_STREAM, tmp);
32636+}
32637+
32638+/* Set of states to indicate the kind of data
32639+ * that will be written to the window */
32640+typedef enum {
32641+ DATA_WINDOW, /* user's data */
32642+	HOLE_WINDOW		/* zeroes (this kind of data is written
32643+				 * when we start writing from an offset > i_size) */
32644+} window_stat;
32645+
32646+/* Window (of logical cluster size) sliding discretely along a file.
32647+ * It is used to locate the hole region in a logical cluster, so that
32648+ * the hole can be properly represented on disk.
32649+ * We split a write to a cryptcompress file into writes to its logical
32650+ * clusters. Before writing to a logical cluster we set a window, i.e.
32651+ * calculate the values of the following fields:
32652+ */
32653+struct reiser4_slide {
32654+ unsigned off; /* offset to write from */
32655+ unsigned count; /* number of bytes to write */
32656+ unsigned delta; /* number of bytes to append to the hole */
32657+ window_stat stat; /* what kind of data will be written starting
32658+ from @off */
32659+};
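+/* A sketch (assuming a 64K logical cluster): writing 1000 bytes starting
+ * 1000 bytes into a cluster gives off == 1000, count == 1000 and
+ * stat == DATA_WINDOW; delta is non-zero only when a hole tail has to be
+ * padded with zeroes. */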
32660+
32661+/* Possible states of a disk cluster */
32662+typedef enum {
32663+ INVAL_DISK_CLUSTER, /* unknown state */
32664+ PREP_DISK_CLUSTER, /* disk cluster got converted by flush
32665+ * at least 1 time */
32666+ UNPR_DISK_CLUSTER, /* disk cluster just created and should be
32667+ * converted by flush */
32668+	FAKE_DISK_CLUSTER,	/* disk cluster exists neither in memory
32669+				 * nor on disk */
32670+ TRNC_DISK_CLUSTER /* disk cluster is partially truncated */
32671+} disk_cluster_stat;
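+/* Typical lifecycle: a freshly created disk cluster is UNPR_DISK_CLUSTER
+ * until flush converts it, after which it becomes PREP_DISK_CLUSTER;
+ * holes are represented by FAKE_DISK_CLUSTER. */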
32672+
32673+/* The following structure represents various stages of the same logical
32674+ * cluster of index @index:
32675+ * . fixed slide
32676+ * . page cluster (stage in primary cache)
32677+ * . transform cluster (transition stage)
32678+ * . disk cluster (stage in secondary cache)
32679+ * This structure is used in transition and synchronizing operations, e.g.
32680+ * transform cluster is a transition state when synchronizing page cluster
32681+ * and disk cluster.
32682+ * FIXME: Encapsulate page cluster, disk cluster.
32683+ */
32684+struct cluster_handle {
32685+ cloff_t index; /* offset in a file (unit is a cluster size) */
32686+ int index_valid; /* for validating the index above, if needed */
32687+ struct file *file; /* host file */
32688+
32689+ /* logical cluster */
32690+ struct reiser4_slide *win; /* sliding window to locate holes */
32691+ logical_cluster_op op; /* logical cluster operation (truncate or
32692+ append/overwrite) */
32693+ /* transform cluster */
32694+	struct tfm_cluster tc;	/* contains all the info needed to synchronize
32695+				   the page cluster and the disk cluster */
32696+ /* page cluster */
32697+ int nr_pages; /* number of pages of current checkin action */
32698+ int old_nrpages; /* number of pages of last checkin action */
32699+ struct page **pages; /* attached pages */
32700+ jnode * node; /* jnode for capture */
32701+
32702+ /* disk cluster */
32703+ hint_t *hint; /* current position in the tree */
32704+ disk_cluster_stat dstat; /* state of the current disk cluster */
32705+ int reserved; /* is space for disk cluster reserved */
32706+#if REISER4_DEBUG
32707+ reiser4_context *ctx;
32708+ int reserved_prepped;
32709+ int reserved_unprepped;
32710+#endif
32711+
32712+};
32713+
32714+static inline __u8 * tfm_input_data (struct cluster_handle * clust)
32715+{
32716+ return tfm_stream_data(&clust->tc, INPUT_STREAM);
32717+}
32718+
32719+static inline __u8 * tfm_output_data (struct cluster_handle * clust)
32720+{
32721+ return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
32722+}
32723+
32724+static inline int reset_cluster_pgset(struct cluster_handle * clust,
32725+ int nrpages)
32726+{
32727+ assert("edward-1057", clust->pages != NULL);
32728+ memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
32729+ return 0;
32730+}
32731+
32732+static inline int alloc_cluster_pgset(struct cluster_handle * clust,
32733+ int nrpages)
32734+{
32735+ assert("edward-949", clust != NULL);
32736+ assert("edward-1362", clust->pages == NULL);
32737+ assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
32738+
32739+ clust->pages = kzalloc(sizeof(*clust->pages) * nrpages,
32740+ reiser4_ctx_gfp_mask_get());
32741+ if (!clust->pages)
32742+ return RETERR(-ENOMEM);
32743+ return 0;
32744+}
32745+
32746+static inline void free_cluster_pgset(struct cluster_handle * clust)
32747+{
32748+ assert("edward-951", clust->pages != NULL);
32749+ kfree(clust->pages);
32750+ clust->pages = NULL;
32751+}
32752+
32753+static inline void put_cluster_handle(struct cluster_handle * clust)
32754+{
32755+ assert("edward-435", clust != NULL);
32756+
32757+ put_tfm_cluster(&clust->tc);
32758+ if (clust->pages)
32759+ free_cluster_pgset(clust);
32760+ memset(clust, 0, sizeof *clust);
32761+}
32762+
32763+static inline void inc_keyload_count(struct reiser4_crypto_info * data)
32764+{
32765+ assert("edward-1410", data != NULL);
32766+ data->keyload_count++;
32767+}
32768+
32769+static inline void dec_keyload_count(struct reiser4_crypto_info * data)
32770+{
32771+ assert("edward-1411", data != NULL);
32772+ assert("edward-1412", data->keyload_count > 0);
32773+ data->keyload_count--;
32774+}
32775+
32776+static inline int capture_cluster_jnode(jnode * node)
32777+{
32778+ return reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
32779+}
32780+
32781+/* cryptcompress specific part of reiser4_inode */
32782+struct cryptcompress_info {
32783+ struct mutex checkin_mutex; /* This is to serialize
32784+ * checkin_logical_cluster operations */
32785+ cloff_t trunc_index; /* Index of the leftmost truncated disk
32786+ * cluster (to resolve races with read) */
32787+ struct reiser4_crypto_info *crypt;
32788+ /*
32789+ * the following 2 fields are controlled by compression mode plugin
32790+ */
32791+ int compress_toggle; /* Current status of compressibility */
32792+ int lattice_factor; /* Factor of dynamic lattice. FIXME: Have
32793+ * a compression_toggle to keep the factor
32794+ */
32795+#if REISER4_DEBUG
32796+ atomic_t pgcount; /* number of grabbed pages */
32797+#endif
32798+};
32799+
32800+static inline void set_compression_toggle (struct cryptcompress_info * info, int val)
32801+{
32802+ info->compress_toggle = val;
32803+}
32804+
32805+static inline int get_compression_toggle (struct cryptcompress_info * info)
32806+{
32807+ return info->compress_toggle;
32808+}
32809+
32810+static inline int compression_is_on(struct cryptcompress_info * info)
32811+{
32812+ return get_compression_toggle(info) == 1;
32813+}
32814+
32815+static inline void turn_on_compression(struct cryptcompress_info * info)
32816+{
32817+ set_compression_toggle(info, 1);
32818+}
32819+
32820+static inline void turn_off_compression(struct cryptcompress_info * info)
32821+{
32822+ set_compression_toggle(info, 0);
32823+}
32824+
32825+static inline void set_lattice_factor(struct cryptcompress_info * info, int val)
32826+{
32827+ info->lattice_factor = val;
32828+}
32829+
32830+static inline int get_lattice_factor(struct cryptcompress_info * info)
32831+{
32832+ return info->lattice_factor;
32833+}
32834+
32835+struct cryptcompress_info *cryptcompress_inode_data(const struct inode *);
32836+int equal_to_rdk(znode *, const reiser4_key *);
32837+int goto_right_neighbor(coord_t *, lock_handle *);
32838+int cryptcompress_inode_ok(struct inode *inode);
32839+int coord_is_unprepped_ctail(const coord_t * coord);
32840+extern int do_readpage_ctail(struct inode *, struct cluster_handle *,
32841+ struct page * page, znode_lock_mode mode);
32842+extern int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
32843+ struct inode * inode);
32844+extern int readpages_cryptcompress(struct file*, struct address_space*,
32845+ struct list_head*, unsigned);
32846+int bind_cryptcompress(struct inode *child, struct inode *parent);
32847+void destroy_inode_cryptcompress(struct inode * inode);
32848+int grab_page_cluster(struct inode *inode, struct cluster_handle * clust,
32849+ rw_op rw);
32850+int write_conversion_hook(struct file *file, struct inode * inode, loff_t pos,
32851+ struct cluster_handle * clust, int * progress);
32852+struct reiser4_crypto_info * inode_crypto_info(struct inode * inode);
32853+void inherit_crypto_info_common(struct inode * parent, struct inode * object,
32854+ int (*can_inherit)(struct inode * child,
32855+ struct inode * parent));
32856+void reiser4_attach_crypto_info(struct inode * inode,
32857+ struct reiser4_crypto_info * info);
32858+void change_crypto_info(struct inode * inode, struct reiser4_crypto_info * new);
32859+struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode);
32860+
32861+static inline struct crypto_blkcipher * info_get_cipher(struct reiser4_crypto_info * info)
32862+{
32863+ return info->cipher;
32864+}
32865+
32866+static inline void info_set_cipher(struct reiser4_crypto_info * info,
32867+ struct crypto_blkcipher * tfm)
32868+{
32869+ info->cipher = tfm;
32870+}
32871+
32872+static inline struct crypto_hash * info_get_digest(struct reiser4_crypto_info * info)
32873+{
32874+ return info->digest;
32875+}
32876+
32877+static inline void info_set_digest(struct reiser4_crypto_info * info,
32878+ struct crypto_hash * tfm)
32879+{
32880+ info->digest = tfm;
32881+}
32882+
32883+static inline void put_cluster_page(struct page * page)
32884+{
32885+ page_cache_release(page);
32886+}
32887+
32888+#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
32889+
32890+/* Make Linus happy.
32891+ Local variables:
32892+ c-indentation-style: "K&R"
32893+ mode-name: "LC"
32894+ c-basic-offset: 8
32895+ tab-width: 8
32896+ fill-column: 120
32897+ scroll-step: 1
32898+ End:
32899+*/
32900diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/file.c linux-2.6.23/fs/reiser4/plugin/file/file.c
32901--- linux-2.6.23.orig/fs/reiser4/plugin/file/file.c 1970-01-01 03:00:00.000000000 +0300
32902+++ linux-2.6.23/fs/reiser4/plugin/file/file.c 2007-12-04 23:04:00.726305004 +0300
32903@@ -0,0 +1,2735 @@
32904+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
32905+ * reiser4/README */
32906+
32907+/*
32908+ * this file contains implementations of inode/file/address_space/file plugin
32909+ * operations specific for "unix file plugin" (plugin id is
32910+ * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
32911+ * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have
32912+ * no items but stat data)
32913+ */
32914+
32915+#include "../../inode.h"
32916+#include "../../super.h"
32917+#include "../../tree_walk.h"
32918+#include "../../carry.h"
32919+#include "../../page_cache.h"
32920+#include "../../ioctl.h"
32921+#include "../object.h"
32922+#include "../cluster.h"
32923+#include "../../safe_link.h"
32924+
32925+#include <linux/writeback.h>
32926+#include <linux/pagevec.h>
32927+#include <linux/syscalls.h>
32928+
32929+
32930+static int unpack(struct file *file, struct inode *inode, int forever);
32931+static void drop_access(struct unix_file_info *);
32932+static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
32933+ znode_lock_mode lock_mode);
32934+
32935+/* Get exclusive access and make sure that the file is not partially
32936+ * converted. (It may happen that another process is doing a tail
32937+ * conversion; if so, wait until it completes.)
32938+ */
32939+static inline void get_exclusive_access_careful(struct unix_file_info * uf_info,
32940+ struct inode *inode)
32941+{
32942+ do {
32943+ get_exclusive_access(uf_info);
32944+ if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))
32945+ break;
32946+ drop_exclusive_access(uf_info);
32947+ schedule();
32948+ } while (1);
32949+}
32950+
32951+/* get unix file plugin specific portion of inode */
32952+struct unix_file_info *unix_file_inode_data(const struct inode *inode)
32953+{
32954+ return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
32955+}
32956+
32957+/**
32958+ * equal_to_rdk - compare key and znode's right delimiting key
32959+ * @node: node whose right delimiting key to compare with @key
32960+ * @key: key to compare with @node's right delimiting key
32961+ *
32962+ * Returns true if @key is equal to right delimiting key of @node.
32963+ */
32964+int equal_to_rdk(znode *node, const reiser4_key *key)
32965+{
32966+ int result;
32967+
32968+ read_lock_dk(znode_get_tree(node));
32969+ result = keyeq(key, znode_get_rd_key(node));
32970+ read_unlock_dk(znode_get_tree(node));
32971+ return result;
32972+}
32973+
32974+#if REISER4_DEBUG
32975+
32976+/**
32977+ * equal_to_ldk - compare key and znode's left delimiting key
32978+ * @node: node whose left delimiting key to compare with @key
32979+ * @key: key to compare with @node's left delimiting key
32980+ *
32981+ * Returns true if @key is equal to left delimiting key of @node.
32982+ */
32983+int equal_to_ldk(znode *node, const reiser4_key *key)
32984+{
32985+ int result;
32986+
32987+ read_lock_dk(znode_get_tree(node));
32988+ result = keyeq(key, znode_get_ld_key(node));
32989+ read_unlock_dk(znode_get_tree(node));
32990+ return result;
32991+}
32992+
32993+/**
32994+ * check_coord - check whether coord corresponds to key
32995+ * @coord: coord to check
32996+ * @key: key @coord has to correspond to
32997+ *
32998+ * Returns true if @coord is set as if it was set as result of lookup with @key
32999+ * in coord->node.
33000+ */
33001+static int check_coord(const coord_t *coord, const reiser4_key *key)
33002+{
33003+ coord_t twin;
33004+
33005+ node_plugin_by_node(coord->node)->lookup(coord->node, key,
33006+ FIND_MAX_NOT_MORE_THAN, &twin);
33007+ return coords_equal(coord, &twin);
33008+}
33009+
33010+#endif /* REISER4_DEBUG */
33011+
33012+/**
33013+ * init_uf_coord - initialize extended coord
33014+ * @uf_coord: extended coord to initialize
33015+ * @lh: lock handle to attach to @uf_coord
33016+ */
33019+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
33020+{
33021+ coord_init_zero(&uf_coord->coord);
33022+ coord_clear_iplug(&uf_coord->coord);
33023+ uf_coord->lh = lh;
33024+ init_lh(lh);
33025+ memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
33026+ uf_coord->valid = 0;
33027+}
33028+
33029+static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
33030+{
33031+ assert("vs-1333", uf_coord->valid == 0);
33032+
33033+ if (coord_is_between_items(&uf_coord->coord))
33034+ return;
33035+
33036+ assert("vs-1348",
33037+ item_plugin_by_coord(&uf_coord->coord)->s.file.
33038+ init_coord_extension);
33039+
33040+ item_body_by_coord(&uf_coord->coord);
33041+ item_plugin_by_coord(&uf_coord->coord)->s.file.
33042+ init_coord_extension(uf_coord, offset);
33043+}
33044+
33045+/**
33046+ * goto_right_neighbor - lock right neighbor, drop current node lock
33047+ * @coord:
33048+ * @lh:
33049+ *
33050+ * Obtain lock on right neighbor and drop lock on current node.
33051+ */
33052+int goto_right_neighbor(coord_t *coord, lock_handle *lh)
33053+{
33054+ int result;
33055+ lock_handle lh_right;
33056+
33057+ assert("vs-1100", znode_is_locked(coord->node));
33058+
33059+ init_lh(&lh_right);
33060+ result = reiser4_get_right_neighbor(&lh_right, coord->node,
33061+ znode_is_wlocked(coord->node) ?
33062+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
33063+ GN_CAN_USE_UPPER_LEVELS);
33064+ if (result) {
33065+ done_lh(&lh_right);
33066+ return result;
33067+ }
33068+
33069+ /*
33070+ * we hold two longterm locks on neighboring nodes. Unlock left of
33071+ * them
33072+ */
33073+ done_lh(lh);
33074+
33075+ coord_init_first_unit_nocheck(coord, lh_right.node);
33076+ move_lh(lh, &lh_right);
33077+
33078+ return 0;
33079+
33080+}
33081+
33082+/**
33083+ * set_file_state
33084+ * @uf_info: unix file plugin specific part of the inode
33085+ * @cbk_result: result of the tree lookup
33086+ * @level: tree level at which the lookup stopped
33087+ *
33088+ * This is used by find_file_item and find_file_state to determine
33089+ * the real state of the file.
33090+ */
33091+static void set_file_state(struct unix_file_info *uf_info, int cbk_result,
33092+ tree_level level)
33093+{
33094+ if (cbk_errored(cbk_result))
33095+ /* error happened in find_file_item */
33096+ return;
33097+
33098+ assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
33099+
33100+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
33101+ if (cbk_result == CBK_COORD_NOTFOUND)
33102+ uf_info->container = UF_CONTAINER_EMPTY;
33103+ else if (level == LEAF_LEVEL)
33104+ uf_info->container = UF_CONTAINER_TAILS;
33105+ else
33106+ uf_info->container = UF_CONTAINER_EXTENTS;
33107+ } else {
33108+ /*
33109+ * file state is known, check whether it is set correctly if
33110+ * file is not being tail converted
33111+ */
33112+ if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
33113+ REISER4_PART_IN_CONV)) {
33114+ assert("vs-1162",
33115+ ergo(level == LEAF_LEVEL &&
33116+ cbk_result == CBK_COORD_FOUND,
33117+ uf_info->container == UF_CONTAINER_TAILS));
33118+ assert("vs-1165",
33119+ ergo(level == TWIG_LEVEL &&
33120+ cbk_result == CBK_COORD_FOUND,
33121+ uf_info->container == UF_CONTAINER_EXTENTS));
33122+ }
33123+ }
33124+}
33125+
33126+int find_file_item_nohint(coord_t *coord, lock_handle *lh,
33127+ const reiser4_key *key, znode_lock_mode lock_mode,
33128+ struct inode *inode)
33129+{
33130+ return reiser4_object_lookup(inode, key, coord, lh, lock_mode,
33131+ FIND_MAX_NOT_MORE_THAN,
33132+ TWIG_LEVEL, LEAF_LEVEL,
33133+ (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
33134+ (CBK_UNIQUE | CBK_FOR_INSERT),
33135+ NULL /* ra_info */ );
33136+}
33137+
33138+/**
33139+ * find_file_item - look for file item in the tree
33140+ * @hint: provides coordinate, lock handle, seal
33141+ * @key: key for search
33142+ * @lock_mode: mode of lock to put on returned node
33143+ * @inode: inode the sought item belongs to
33144+ *
33145+ * This finds the position in the tree corresponding to @key. It first
33146+ * tries to use @hint's seal if it is set.
33148+ */
33149+int find_file_item(hint_t *hint, const reiser4_key *key,
33150+ znode_lock_mode lock_mode,
33151+ struct inode *inode)
33152+{
33153+ int result;
33154+ coord_t *coord;
33155+ lock_handle *lh;
33156+
33157+ assert("nikita-3030", reiser4_schedulable());
33158+ assert("vs-1707", hint != NULL);
33159+ assert("vs-47", inode != NULL);
33160+
33161+ coord = &hint->ext_coord.coord;
33162+ lh = hint->ext_coord.lh;
33163+ init_lh(lh);
33164+
33165+ result = hint_validate(hint, key, 1 /* check key */, lock_mode);
33166+ if (!result) {
33167+ if (coord->between == AFTER_UNIT &&
33168+ equal_to_rdk(coord->node, key)) {
33169+ result = goto_right_neighbor(coord, lh);
33170+ if (result == -E_NO_NEIGHBOR)
33171+ return RETERR(-EIO);
33172+ if (result)
33173+ return result;
33174+ assert("vs-1152", equal_to_ldk(coord->node, key));
33175+ /*
33176+ * we moved to different node. Invalidate coord
33177+ * extension, zload is necessary to init it again
33178+ */
33179+ hint->ext_coord.valid = 0;
33180+ }
33181+
33182+ set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
33183+ znode_get_level(coord->node));
33184+
33185+ return CBK_COORD_FOUND;
33186+ }
33187+
33188+ coord_init_zero(coord);
33189+ result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
33190+ set_file_state(unix_file_inode_data(inode), result,
33191+ znode_get_level(coord->node));
33192+
33193+ /* FIXME: we might already have coord extension initialized */
33194+ hint->ext_coord.valid = 0;
33195+ return result;
33196+}
33197+
33198+/* plugin->u.file.write_flow = NULL
33199+   plugin->u.file.read_flow = NULL */
33200+
33201+void hint_init_zero(hint_t * hint)
33202+{
33203+ memset(hint, 0, sizeof(*hint));
33204+ init_lh(&hint->lh);
33205+ hint->ext_coord.lh = &hint->lh;
33206+}
33207+
33208+static int find_file_state(struct inode *inode, struct unix_file_info *uf_info)
33209+{
33210+ int result;
33211+ reiser4_key key;
33212+ coord_t coord;
33213+ lock_handle lh;
33214+
33215+ assert("vs-1628", ea_obtained(uf_info));
33216+
33217+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
33218+ key_by_inode_and_offset_common(inode, 0, &key);
33219+ init_lh(&lh);
33220+ result = find_file_item_nohint(&coord, &lh, &key,
33221+ ZNODE_READ_LOCK, inode);
33222+ set_file_state(uf_info, result, znode_get_level(coord.node));
33223+ done_lh(&lh);
33224+ if (!cbk_errored(result))
33225+ result = 0;
33226+ } else
33227+ result = 0;
33228+ assert("vs-1074",
33229+ ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
33230+ reiser4_txn_restart_current();
33231+ return result;
33232+}
33233+
33234+/* Estimate and reserve the space needed to truncate a page that gets partially truncated: one block for the page
33235+   itself, a stat data update (estimate_one_insert_into_item) and one item insertion (estimate_one_insert_into_item),
33236+   which may happen if the page corresponds to a hole extent and an unallocated one has to be created */
33237+static int reserve_partial_page(reiser4_tree * tree)
33238+{
33239+ grab_space_enable();
33240+ return reiser4_grab_reserved(reiser4_get_current_sb(),
33241+ 1 +
33242+ 2 * estimate_one_insert_into_item(tree),
33243+ BA_CAN_COMMIT);
33244+}
33245+
33246+/* estimate and reserve space needed to cut one item and update one stat data */
33247+static int reserve_cut_iteration(reiser4_tree * tree)
33248+{
33249+ __u64 estimate = estimate_one_item_removal(tree)
33250+ + estimate_one_insert_into_item(tree);
33251+
33252+ assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
33253+
33254+ grab_space_enable();
33255+ /* We need to double our estimate now that we can delete more than one
33256+ node. */
33257+ return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
33258+ BA_CAN_COMMIT);
33259+}
33260+
33261+int reiser4_update_file_size(struct inode *inode, reiser4_key * key,
33262+ int update_sd)
33263+{
33264+ int result = 0;
33265+
33266+ INODE_SET_SIZE(inode, get_key_offset(key));
33267+ if (update_sd) {
33268+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
33269+ result = reiser4_update_sd(inode);
33270+ }
33271+ return result;
33272+}
33273+
33274+/* Cut file items one by one, starting from the last one, until the new file size (inode->i_size) is reached. Reserve
33275+   space and update the file's stat data on every single cut from the tree */
33276+int
33277+cut_file_items(struct inode *inode, loff_t new_size, int update_sd,
33278+ loff_t cur_size, int (*update_actor) (struct inode *,
33279+ reiser4_key *, int))
33280+{
33281+ reiser4_key from_key, to_key;
33282+ reiser4_key smallest_removed;
33283+ file_plugin *fplug = inode_file_plugin(inode);
33284+ int result;
33285+ int progress = 0;
33286+
33287+ assert("vs-1248",
33288+ fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
33289+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
33290+
33291+ fplug->key_by_inode(inode, new_size, &from_key);
33292+ to_key = from_key;
33293+ set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ );
33294+ /* this loop normally runs just once */
33295+ while (1) {
33296+ result = reserve_cut_iteration(reiser4_tree_by_inode(inode));
33297+ if (result)
33298+ break;
33299+
33300+ result = reiser4_cut_tree_object(current_tree, &from_key, &to_key,
33301+ &smallest_removed, inode, 1,
33302+ &progress);
33303+ if (result == -E_REPEAT) {
33304+ /* -E_REPEAT is a signal to interrupt a long file truncation process */
33305+ if (progress) {
33306+ result =
33307+ update_actor(inode, &smallest_removed,
33308+ update_sd);
33309+ if (result)
33310+ break;
33311+ }
33312+
33313+			/* the below does up(sbinfo->delete_mutex). Do not get fooled */
33314+ reiser4_release_reserved(inode->i_sb);
33315+
33316+ /* reiser4_cut_tree_object() was interrupted probably because
33317+ * current atom requires commit, we have to release
33318+ * transaction handle to allow atom commit. */
33319+ reiser4_txn_restart_current();
33320+ continue;
33321+ }
33322+ if (result
33323+ && !(result == CBK_COORD_NOTFOUND && new_size == 0
33324+ && inode->i_size == 0))
33325+ break;
33326+
33327+ set_key_offset(&smallest_removed, new_size);
33328+ /* Final sd update after the file gets its correct size */
33329+ result = update_actor(inode, &smallest_removed, update_sd);
33330+ break;
33331+ }
33332+
33333+	/* the below does up(sbinfo->delete_mutex). Do not get fooled */
33334+ reiser4_release_reserved(inode->i_sb);
33335+
33336+ return result;
33337+}
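+
+/*
+ * The retry protocol implemented above, in outline (sketch only):
+ *
+ *	while (1) {
+ *		reserve_cut_iteration(...);
+ *		result = reiser4_cut_tree_object(...);
+ *		if (result == -E_REPEAT) {
+ *			update stat data to smallest_removed;
+ *			release reservation, restart transaction;
+ *			continue;	// truncate proceeds in chunks
+ *		}
+ *		final stat data update to new_size;
+ *		break;
+ *	}
+ *
+ * -E_REPEAT is not an error here: it only signals that the current
+ * atom had to be committed before more items could be cut.
+ */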
33338+
33339+int find_or_create_extent(struct page *page);
33340+
33341+/* part of truncate_file_body: it is called when truncate is used to make file
33342+ shorter */
33343+static int shorten_file(struct inode *inode, loff_t new_size)
33344+{
33345+ int result;
33346+ struct page *page;
33347+ int padd_from;
33348+ unsigned long index;
33349+ struct unix_file_info *uf_info;
33350+
33351+ /*
33352+ * all items of ordinary reiser4 file are grouped together. That is why
33353+ * we can use reiser4_cut_tree. Plan B files (for instance) can not be
33354+ * truncated that simply
33355+ */
33356+ result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
33357+ get_key_offset(reiser4_max_key()),
33358+ reiser4_update_file_size);
33359+ if (result)
33360+ return result;
33361+
33362+ uf_info = unix_file_inode_data(inode);
33363+ assert("vs-1105", new_size == inode->i_size);
33364+ if (new_size == 0) {
33365+ uf_info->container = UF_CONTAINER_EMPTY;
33366+ return 0;
33367+ }
33368+
33369+ result = find_file_state(inode, uf_info);
33370+ if (result)
33371+ return result;
33372+ if (uf_info->container == UF_CONTAINER_TAILS)
33373+ /*
33374+ * No need to worry about zeroing last page after new file
33375+ * end
33376+ */
33377+ return 0;
33378+
33379+ padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
33380+ if (!padd_from)
33381+ /* file is truncated to page boundary */
33382+ return 0;
33383+
33384+ result = reserve_partial_page(reiser4_tree_by_inode(inode));
33385+ if (result) {
33386+ reiser4_release_reserved(inode->i_sb);
33387+ return result;
33388+ }
33389+
33390+ /* last page is partially truncated - zero its content */
33391+ index = (inode->i_size >> PAGE_CACHE_SHIFT);
33392+ page = read_mapping_page(inode->i_mapping, index, NULL);
33393+ if (IS_ERR(page)) {
33394+ /*
33395+ * the below does up(sbinfo->delete_mutex). Do not get
33396+ * confused
33397+ */
33398+ reiser4_release_reserved(inode->i_sb);
33399+ if (likely(PTR_ERR(page) == -EINVAL)) {
33400+ /* looks like file is built of tail items */
33401+ return 0;
33402+ }
33403+ return PTR_ERR(page);
33404+ }
33405+ wait_on_page_locked(page);
33406+ if (!PageUptodate(page)) {
33407+ page_cache_release(page);
33408+ /*
33409+ * the below does up(sbinfo->delete_mutex). Do not get
33410+ * confused
33411+ */
33412+ reiser4_release_reserved(inode->i_sb);
33413+ return RETERR(-EIO);
33414+ }
33415+
33416+ /*
33417+	 * if the page corresponds to a hole extent unit, an unallocated one
33418+	 * will be created here, although that is not strictly necessary
33419+ */
33420+ result = find_or_create_extent(page);
33421+
33422+ /*
33423+ * FIXME: cut_file_items has already updated inode. Probably it would
33424+ * be better to update it here when file is really truncated
33425+ */
33426+ if (result) {
33427+ page_cache_release(page);
33428+ /*
33429+ * the below does up(sbinfo->delete_mutex). Do not get
33430+ * confused
33431+ */
33432+ reiser4_release_reserved(inode->i_sb);
33433+ return result;
33434+ }
33435+
33436+ lock_page(page);
33437+ assert("vs-1066", PageLocked(page));
33438+ zero_user_page(page, padd_from, PAGE_CACHE_SIZE - padd_from, KM_USER0);
33439+ unlock_page(page);
33440+ page_cache_release(page);
33441+ /* the below does up(sbinfo->delete_mutex). Do not get confused */
33442+ reiser4_release_reserved(inode->i_sb);
33443+ return 0;
33444+}
33445+
33446+/**
33447+ * should_have_notail - consult formatting plugin about tail policy
33448+ * @uf_info: unix file specific part of inode
33449+ * @new_size: new file size
33450+ *
33451+ * Calls the formatting plugin to see whether a file of size @new_size has to
33452+ * be stored in unformatted nodes or in tail items. 0 is returned in the latter case.
33453+ */
33454+static int should_have_notail(const struct unix_file_info *uf_info, loff_t new_size)
33455+{
33456+ if (!uf_info->tplug)
33457+ return 1;
33458+ return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
33459+ new_size);
33460+
33461+}
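+
+/*
+ * Example (policy names are illustrative, the decision is whatever
+ * ->have_tail() implements): with a "never use tails" formatting
+ * policy, have_tail() always returns 0, should_have_notail() returns
+ * 1 for every size, and the file is only ever built of extent items.
+ * With a size-based policy the same file may switch between tails and
+ * extents as it grows or shrinks.
+ */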
33462+
33463+/**
33464+ * truncate_file_body - change length of file
33465+ * @inode: inode of file
33466+ * @new_size: new file length
33467+ *
33468+ * Adjusts items file @inode is built of to match @new_size. It may either cut
33469+ * items or add them to represent a hole at the end of file. The caller has to
33470+ * obtain exclusive access to the file.
33471+ */
33472+static int truncate_file_body(struct inode *inode, struct iattr *attr)
33473+{
33474+ int result;
33475+ loff_t new_size = attr->ia_size;
33476+
33477+ if (inode->i_size < new_size) {
33478+ /* expanding truncate */
33479+ struct file * file = attr->ia_file;
33480+ struct unix_file_info *uf_info = unix_file_inode_data(inode);
33481+
33482+ assert("edward-1532", attr->ia_valid & ATTR_FILE);
33483+
33484+ result = find_file_state(inode, uf_info);
33485+ if (result)
33486+ return result;
33487+
33488+ if (should_have_notail(uf_info, new_size)) {
33489+ /*
33490+ * file of size @new_size has to be built of
33491+ * extents. If it is built of tails - convert to
33492+ * extents
33493+ */
33494+ if (uf_info->container == UF_CONTAINER_TAILS) {
33495+ /*
33496+				 * if the file is being converted by another process
33497+ * - wait until it completes
33498+ */
33499+ while (1) {
33500+ if (reiser4_inode_get_flag(inode,
33501+ REISER4_PART_IN_CONV)) {
33502+ drop_exclusive_access(uf_info);
33503+ schedule();
33504+ get_exclusive_access(uf_info);
33505+ continue;
33506+ }
33507+ break;
33508+ }
33509+
33510+ if (uf_info->container == UF_CONTAINER_TAILS) {
33511+ result = tail2extent(uf_info);
33512+ if (result)
33513+ return result;
33514+ }
33515+ }
33516+ result = reiser4_write_extent(file, NULL, 0,
33517+ &new_size);
33518+ if (result)
33519+ return result;
33520+ uf_info->container = UF_CONTAINER_EXTENTS;
33521+ } else {
33522+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
33523+ result = reiser4_write_extent(file, NULL, 0,
33524+ &new_size);
33525+ if (result)
33526+ return result;
33527+ } else {
33528+ result = reiser4_write_tail(file, NULL, 0,
33529+ &new_size);
33530+ if (result)
33531+ return result;
33532+ uf_info->container = UF_CONTAINER_TAILS;
33533+ }
33534+ }
33535+ BUG_ON(result > 0);
33536+ INODE_SET_FIELD(inode, i_size, new_size);
33537+ file_update_time(file);
33538+ result = reiser4_update_sd(inode);
33539+ BUG_ON(result != 0);
33540+ reiser4_free_file_fsdata(file);
33541+ } else
33542+ result = shorten_file(inode, new_size);
33543+ return result;
33544+}
33545+
33546+/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
33547+
33548+/**
33549+ * load_file_hint - copy hint from struct file to local variable
33550+ * @file: file to get hint from
33551+ * @hint: structure to fill
33552+ *
33553+ * The reiser4 specific portion of struct file may contain information (hint)
33554+ * stored on exit from a previous read or write. That information includes a
33555+ * seal of the znode and the coord within that znode where the previous read
33556+ * or write stopped. This function copies that information to @hint if it was
33557+ * stored, or initializes @hint with zeros otherwise.
33558+ */
33559+int load_file_hint(struct file *file, hint_t *hint)
33560+{
33561+ reiser4_file_fsdata *fsdata;
33562+
33563+ if (file) {
33564+ fsdata = reiser4_get_file_fsdata(file);
33565+ if (IS_ERR(fsdata))
33566+ return PTR_ERR(fsdata);
33567+
33568+ spin_lock_inode(file->f_dentry->d_inode);
33569+ if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) {
33570+ *hint = fsdata->reg.hint;
33571+ init_lh(&hint->lh);
33572+ hint->ext_coord.lh = &hint->lh;
33573+ spin_unlock_inode(file->f_dentry->d_inode);
33574+ /*
33575+ * force re-validation of the coord on the first
33576+ * iteration of the read/write loop.
33577+ */
33578+ hint->ext_coord.valid = 0;
33579+ assert("nikita-19892", coords_equal(&hint->seal.coord1,
33580+ &hint->ext_coord.
33581+ coord));
33582+ return 0;
33583+ }
33584+ memset(&fsdata->reg.hint, 0, sizeof(hint_t));
33585+ spin_unlock_inode(file->f_dentry->d_inode);
33586+ }
33587+ hint_init_zero(hint);
33588+ return 0;
33589+}
33590+
33591+/**
33592+ * save_file_hint - copy hint to reiser4 private struct file's part
33593+ * @file: file to save hint in
33594+ * @hint: hint to save
33595+ *
33596+ * This copies @hint to reiser4 private part of struct file. It can help
33597+ * speedup future accesses to the file.
33598+ */
33599+void save_file_hint(struct file *file, const hint_t *hint)
33600+{
33601+ reiser4_file_fsdata *fsdata;
33602+
33603+ assert("edward-1337", hint != NULL);
33604+
33605+ if (!file || !reiser4_seal_is_set(&hint->seal))
33606+ return;
33607+ fsdata = reiser4_get_file_fsdata(file);
33608+ assert("vs-965", !IS_ERR(fsdata));
33609+ assert("nikita-19891",
33610+ coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
33611+ assert("vs-30", hint->lh.owner == NULL);
33612+ spin_lock_inode(file->f_dentry->d_inode);
33613+ fsdata->reg.hint = *hint;
33614+ spin_unlock_inode(file->f_dentry->d_inode);
33615+ return;
33616+}
33617+
33618+void reiser4_unset_hint(hint_t * hint)
33619+{
33620+ assert("vs-1315", hint);
33621+ hint->ext_coord.valid = 0;
33622+ reiser4_seal_done(&hint->seal);
33623+ done_lh(&hint->lh);
33624+}
33625+
33626+/* coord must be set properly, so that reiser4_set_hint
33627+   has nothing to do */
33628+void reiser4_set_hint(hint_t * hint, const reiser4_key * key,
33629+ znode_lock_mode mode)
33630+{
33631+ ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
33632+ assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
33633+
33634+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key);
33635+ hint->offset = get_key_offset(key);
33636+ hint->mode = mode;
33637+ done_lh(&hint->lh);
33638+}
33639+
33640+int hint_is_set(const hint_t * hint)
33641+{
33642+ return reiser4_seal_is_set(&hint->seal);
33643+}
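+
+/*
+ * Taken together, the hint helpers above implement a cheap
+ * resume-where-we-left-off protocol for sequential reads and writes.
+ * A typical consumer looks roughly like this (sketch, error handling
+ * omitted; compare read_unix_file_container_tails() below):
+ *
+ *	hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
+ *	load_file_hint(file, hint);	// seal + coord from struct file
+ *	while (more to do) {
+ *		find_file_item(hint, &key, mode, inode);
+ *		... read or write at hint->ext_coord ...
+ *	}
+ *	done_lh(&hint->lh);
+ *	save_file_hint(file, hint);	// stash for the next syscall
+ *	kfree(hint);
+ *
+ * If the sealed znode changed in the meantime, hint_validate() fails
+ * with -E_REPEAT and find_file_item falls back to a full tree lookup.
+ */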
33644+
33645+#if REISER4_DEBUG
33646+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
33647+{
33648+ return (get_key_locality(k1) == get_key_locality(k2) &&
33649+ get_key_type(k1) == get_key_type(k2) &&
33650+ get_key_band(k1) == get_key_band(k2) &&
33651+ get_key_ordering(k1) == get_key_ordering(k2) &&
33652+ get_key_objectid(k1) == get_key_objectid(k2));
33653+}
33654+#endif
33655+
33656+static int
33657+hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
33658+ znode_lock_mode lock_mode)
33659+{
33660+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
33661+ /* hint either not set or set by different operation */
33662+ return RETERR(-E_REPEAT);
33663+
33664+ assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
33665+
33666+ if (check_key && get_key_offset(key) != hint->offset)
33667+ /* hint is set for different key */
33668+ return RETERR(-E_REPEAT);
33669+
33670+ assert("vs-31", hint->ext_coord.lh == &hint->lh);
33671+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key,
33672+ hint->ext_coord.lh, lock_mode,
33673+ ZNODE_LOCK_LOPRI);
33674+}
33675+
33676+/**
33677+ * find_or_create_extent - make sure an extent corresponding to @page exists
33678+ * @page: page to capture
33679+ *
33680+ * Looks for a place at the twig level for the extent corresponding to @page,
33681+ * calls extent's writepage method to create an unallocated extent if it does
33682+ * not exist yet, initializes the jnode and captures the page.
33683+ */
33684+int find_or_create_extent(struct page *page)
33685+{
33686+ int result;
33687+ struct inode *inode;
33688+ int plugged_hole;
33689+
33690+ jnode *node;
33691+
33692+ assert("vs-1065", page->mapping && page->mapping->host);
33693+ inode = page->mapping->host;
33694+
33695+ lock_page(page);
33696+ node = jnode_of_page(page);
33697+ if (IS_ERR(node)) {
33698+ unlock_page(page);
33699+ return PTR_ERR(node);
33700+ }
33701+ JF_SET(node, JNODE_WRITE_PREPARED);
33702+ unlock_page(page);
33703+
33704+ if (node->blocknr == 0) {
33705+ plugged_hole = 0;
33706+ result = reiser4_update_extent(inode, node, page_offset(page),
33707+ &plugged_hole);
33708+ if (result) {
33709+ JF_CLR(node, JNODE_WRITE_PREPARED);
33710+ jput(node);
33711+ warning("", "reiser4_update_extent failed: %d", result);
33712+ return result;
33713+ }
33714+ if (plugged_hole)
33715+ reiser4_update_sd(inode);
33716+ } else {
33717+ spin_lock_jnode(node);
33718+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
33719+ BUG_ON(result != 0);
33720+ jnode_make_dirty_locked(node);
33721+ spin_unlock_jnode(node);
33722+ }
33723+
33724+ BUG_ON(node->atom == NULL);
33725+ JF_CLR(node, JNODE_WRITE_PREPARED);
33726+ jput(node);
33727+
33728+ if (get_current_context()->entd) {
33729+ entd_context *ent = get_entd_context(node->tree->super);
33730+
33731+ if (ent->cur_request->page == page)
33732+ ent->cur_request->node = node;
33733+ }
33734+ return 0;
33735+}
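+
+/*
+ * The jnode handshake above, in outline (sketch):
+ *
+ *	node = jnode_of_page(page);		// under page lock
+ *	JF_SET(node, JNODE_WRITE_PREPARED);
+ *	... insert or look up the extent, or capture + dirty the jnode ...
+ *	JF_CLR(node, JNODE_WRITE_PREPARED);
+ *	jput(node);
+ *
+ * node->blocknr == 0 is read as "no extent unit points at this page
+ * yet", so that branch goes through reiser4_update_extent() and may
+ * plug a hole; otherwise the jnode is simply captured and dirtied.
+ */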
33736+
33737+/**
33738+ * has_anonymous_pages - check whether inode has pages dirtied via mmap
33739+ * @inode: inode to check
33740+ *
33741+ * Returns true if inode's mapping has dirty pages which do not belong to any
33742+ * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page
33743+ * tree or were eflushed and can be found via jnodes tagged
33744+ * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes.
33745+ */
33746+static int has_anonymous_pages(struct inode *inode)
33747+{
33748+ int result;
33749+
33750+ read_lock_irq(&inode->i_mapping->tree_lock);
33751+ result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
33752+ read_unlock_irq(&inode->i_mapping->tree_lock);
33753+ return result;
33754+}
33755+
33756+/**
33757+ * capture_page_and_create_extent -
33758+ * @page: page to be captured
33759+ *
33760+ * Grabs space for extent creation and stat data update and calls function to
33761+ * do actual work.
33762+ */
33763+static int capture_page_and_create_extent(struct page *page)
33764+{
33765+ int result;
33766+ struct inode *inode;
33767+
33768+ assert("vs-1084", page->mapping && page->mapping->host);
33769+ inode = page->mapping->host;
33770+ assert("vs-1139",
33771+ unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
33772+ /* page belongs to file */
33773+ assert("vs-1393",
33774+ inode->i_size > page_offset(page));
33775+
33776+ /* page capture may require extent creation (if it does not exist yet)
33777+ and stat data's update (number of blocks changes on extent
33778+ creation) */
33779+ grab_space_enable();
33780+ result = reiser4_grab_space(2 * estimate_one_insert_into_item
33781+ (reiser4_tree_by_inode(inode)),
33782+ BA_CAN_COMMIT);
33783+ if (likely(!result))
33784+ result = find_or_create_extent(page);
33785+
33786+ if (result != 0)
33787+ SetPageError(page);
33788+ return result;
33789+}
33790+
33791+/* this is implementation of method commit_write of struct
33792+ address_space_operations for unix file plugin */
33793+int
33794+commit_write_unix_file(struct file *file, struct page *page,
33795+ unsigned from, unsigned to)
33796+{
33797+ reiser4_context *ctx;
33798+ struct inode *inode;
33799+ int result;
33800+
33801+ assert("umka-3101", file != NULL);
33802+ assert("umka-3102", page != NULL);
33803+ assert("umka-3093", PageLocked(page));
33804+
33805+ SetPageUptodate(page);
33806+
33807+ inode = page->mapping->host;
33808+ ctx = reiser4_init_context(page->mapping->host->i_sb);
33809+ if (IS_ERR(ctx))
33810+ return PTR_ERR(ctx);
33811+ page_cache_get(page);
33812+ unlock_page(page);
33813+ result = capture_page_and_create_extent(page);
33814+ lock_page(page);
33815+ page_cache_release(page);
33816+
33817+ /* don't commit transaction under inode semaphore */
33818+ context_set_commit_async(ctx);
33819+ reiser4_exit_context(ctx);
33820+ return result;
33821+}
33822+
33823+/*
33824+ * Support for "anonymous" pages and jnodes.
33825+ *
33826+ * When file is write-accessed through mmap pages can be dirtied from the user
33827+ * level. In this case kernel is not notified until one of following happens:
33828+ *
33829+ * (1) msync()
33830+ *
33831+ * (2) truncate() (either explicit or through unlink)
33832+ *
33833+ * (3) VM scanner starts reclaiming mapped pages, dirtying them before
33834+ * starting write-back.
33835+ *
33836+ * As a result of (3) ->writepage may be called on a dirty page without
33837+ * jnode. Such page is called "anonymous" in reiser4. Certain work-loads
33838+ * (iozone) generate a huge number of anonymous pages. Emergency flush handles
33839+ * this situation by creating jnode for anonymous page, starting IO on the
33840+ * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of
33841+ * memory. Such jnode is also called anonymous.
33842+ *
33843+ * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
33844+ * tree. This is done by capture_anonymous_*() functions below.
33845+ */
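+
+/*
+ * Life cycle of an anonymous page, roughly (illustrative sketch):
+ *
+ *	write through mmap dirties the page
+ *	  -> it is tagged PAGECACHE_TAG_REISER4_MOVED in the page tree
+ *	msync()/writeback/reiser4_sync_sb()
+ *	  -> writepages_unix_file()
+ *	       -> capture_anonymous_pages()	// pages
+ *	       -> capture_anonymous_jnodes()	// eflushed jnodes
+ *	atom commit finally writes the data out
+ */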
33846+
33847+/**
33848+ * capture_anonymous_page - involve page into transaction
33849+ * @pg: page to deal with
33850+ *
33851+ * Takes care that @page has corresponding metadata in the tree, creates jnode
33852+ * for @page and captures it. On success 1 is returned.
33853+ */
33854+static int capture_anonymous_page(struct page *page)
33855+{
33856+ int result;
33857+
33858+ if (PageWriteback(page))
33859+ /* FIXME: do nothing? */
33860+ return 0;
33861+
33862+ result = capture_page_and_create_extent(page);
33863+ if (result == 0) {
33864+ result = 1;
33865+ } else
33866+ warning("nikita-3329",
33867+ "Cannot capture anon page: %i", result);
33868+
33869+ return result;
33870+}
33871+
33872+/**
33873+ * capture_anonymous_pages - find and capture pages dirtied via mmap
33874+ * @mapping: address space where to look for pages
33875+ * @index: start index
33876+ * @to_capture: maximum number of pages to capture
33877+ *
33878+ * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
33879+ * captures (involves into atom) them, returns number of captured pages,
33880+ * updates @index to next page after the last captured one.
33881+ */
33882+static int
33883+capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
33884+ unsigned int to_capture)
33885+{
33886+ int result;
33887+ struct pagevec pvec;
33888+ unsigned int i, count;
33889+ int nr;
33890+
33891+ pagevec_init(&pvec, 0);
33892+ count = min(pagevec_space(&pvec), to_capture);
33893+ nr = 0;
33894+
33895+ /* find pages tagged MOVED */
33896+ write_lock_irq(&mapping->tree_lock);
33897+ pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
33898+ (void **)pvec.pages, *index, count,
33899+ PAGECACHE_TAG_REISER4_MOVED);
33900+ if (pagevec_count(&pvec) == 0) {
33901+ /*
33902+ * there are no pages tagged MOVED in mapping->page_tree
33903+ * starting from *index
33904+ */
33905+ write_unlock_irq(&mapping->tree_lock);
33906+ *index = (pgoff_t)-1;
33907+ return 0;
33908+ }
33909+
33910+ /* clear MOVED tag for all found pages */
33911+ for (i = 0; i < pagevec_count(&pvec); i++) {
33912+ void *p;
33913+
33914+ page_cache_get(pvec.pages[i]);
33915+ p = radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
33916+ PAGECACHE_TAG_REISER4_MOVED);
33917+ assert("vs-49", p == pvec.pages[i]);
33918+ }
33919+ write_unlock_irq(&mapping->tree_lock);
33920+
33922+ *index = pvec.pages[i - 1]->index + 1;
33923+
33924+ for (i = 0; i < pagevec_count(&pvec); i++) {
33925+ /*
33926+ * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by
33927+ * reiser4_set_page_dirty_internal which is called when jnode is
33928+ * captured
33929+ */
33930+ result = capture_anonymous_page(pvec.pages[i]);
33931+ if (result == 1)
33932+ nr++;
33933+ else {
33934+ if (result < 0) {
33935+ warning("vs-1454",
33936+ "failed to capture page: "
33937+					"result=%d, captured=%d\n",
33938+ result, i);
33939+
33940+ /*
33941+				 * set MOVED tag on all pages which were left
33942+				 * uncaptured
33943+ */
33944+ write_lock_irq(&mapping->tree_lock);
33945+ for (; i < pagevec_count(&pvec); i ++) {
33946+ radix_tree_tag_set(&mapping->page_tree,
33947+ pvec.pages[i]->index,
33948+ PAGECACHE_TAG_REISER4_MOVED);
33949+ }
33950+ write_unlock_irq(&mapping->tree_lock);
33951+
33952+ pagevec_release(&pvec);
33953+ return result;
33954+ } else {
33955+ /*
33956+ * result == 0. capture_anonymous_page returns
33957+				 * 0 for a page under writeback. Set MOVED tag on
33958+ * that page
33959+ */
33960+ write_lock_irq(&mapping->tree_lock);
33961+ radix_tree_tag_set(&mapping->page_tree,
33962+ pvec.pages[i]->index,
33963+ PAGECACHE_TAG_REISER4_MOVED);
33964+ write_unlock_irq(&mapping->tree_lock);
33965+ if (i == 0)
33966+ *index = pvec.pages[0]->index;
33967+ else
33968+ *index = pvec.pages[i - 1]->index + 1;
33969+ }
33970+ }
33971+ }
33972+ pagevec_release(&pvec);
33973+ return nr;
33974+}
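+
+/*
+ * The tag juggling above maintains a simple invariant (sketch): a page
+ * is tagged PAGECACHE_TAG_REISER4_MOVED iff it is dirty but not yet
+ * part of an atom. Hence the three moves:
+ *
+ *	look pages up by tag, clear the tag	// about to capture them
+ *	capture succeeded			// tag stays clear
+ *	capture failed or page under writeback	// re-set tag, retry later
+ */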
33975+
33976+/**
33977+ * capture_anonymous_jnodes - find and capture anonymous jnodes
33978+ * @mapping: address space where to look for jnodes
33979+ * @from: start index
33980+ * @to: end index
33981+ * @to_capture: maximum number of jnodes to capture
33982+ *
33983+ * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in
33984+ * the range of indexes @from-@to and captures them, returns number of captured
33985+ * jnodes, updates @from to next jnode after the last captured one.
33986+ */
33987+static int
33988+capture_anonymous_jnodes(struct address_space *mapping,
33989+ pgoff_t *from, pgoff_t to, int to_capture)
33990+{
33991+ *from = to;
33992+ return 0;
33993+}
33994+
33995+/*
33996+ * Commit atom of the jnode of a page.
33997+ */
33998+static int sync_page(struct page *page)
33999+{
34000+ int result;
34001+ do {
34002+ jnode *node;
34003+ txn_atom *atom;
34004+
34005+ lock_page(page);
34006+ node = jprivate(page);
34007+ if (node != NULL) {
34008+ spin_lock_jnode(node);
34009+ atom = jnode_get_atom(node);
34010+ spin_unlock_jnode(node);
34011+ } else
34012+ atom = NULL;
34013+ unlock_page(page);
34014+ result = reiser4_sync_atom(atom);
34015+ } while (result == -E_REPEAT);
34016+ /*
34017+ * ZAM-FIXME-HANS: document the logic of this loop, is it just to
34018+ * handle the case where more pages get added to the atom while we are
34019+ * syncing it?
34020+ */
34021+ assert("nikita-3485", ergo(result == 0,
34022+ get_current_context()->trans->atom == NULL));
34023+ return result;
34024+}
34025+
34026+/*
34027+ * Commit atoms of all pages in the mapping's page tree:
34028+ * call sync_page for each page found there
34029+ */
34030+static int sync_page_list(struct inode *inode)
34031+{
34032+ int result;
34033+ struct address_space *mapping;
34034+ unsigned long from; /* start index for radix_tree_gang_lookup */
34035+ unsigned int found; /* return value for radix_tree_gang_lookup */
34036+
34037+ mapping = inode->i_mapping;
34038+ from = 0;
34039+ result = 0;
34040+ read_lock_irq(&mapping->tree_lock);
34041+ while (result == 0) {
34042+ struct page *page;
34043+
34044+ found =
34045+ radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
34046+ from, 1);
34047+ assert("", found < 2);
34048+ if (found == 0)
34049+ break;
34050+
34051+ /* page may not leave radix tree because it is protected from truncating by inode->i_mutex locked by
34052+ sys_fsync */
34053+ page_cache_get(page);
34054+ read_unlock_irq(&mapping->tree_lock);
34055+
34056+ from = page->index + 1;
34057+
34058+ result = sync_page(page);
34059+
34060+ page_cache_release(page);
34061+ read_lock_irq(&mapping->tree_lock);
34062+ }
34063+
34064+ read_unlock_irq(&mapping->tree_lock);
34065+ return result;
34066+}
34067+
34068+static int commit_file_atoms(struct inode *inode)
34069+{
34070+ int result;
34071+ struct unix_file_info *uf_info;
34072+
34073+ uf_info = unix_file_inode_data(inode);
34074+
34075+ get_exclusive_access(uf_info);
34076+ /*
34077+ * find what items file is made from
34078+ */
34079+ result = find_file_state(inode, uf_info);
34080+ drop_exclusive_access(uf_info);
34081+ if (result != 0)
34082+ return result;
34083+
34084+ /*
34085+ * file state cannot change because we are under ->i_mutex
34086+ */
34087+ switch (uf_info->container) {
34088+ case UF_CONTAINER_EXTENTS:
34089+		/* find_file_state might open or join an atom */
34090+ reiser4_txn_restart_current();
34091+ result =
34092+ /*
34093+ * when we are called by
34094+ * filemap_fdatawrite->
34095+ * do_writepages()->
34096+ * reiser4_writepages()
34097+ *
34098+			 * inode->i_mapping->dirty_pages are spliced into
34099+ * ->io_pages, leaving ->dirty_pages dirty.
34100+ *
34101+ * When we are called from
34102+ * reiser4_fsync()->sync_unix_file(), we have to
34103+ * commit atoms of all pages on the ->dirty_list.
34104+ *
34105+ * So for simplicity we just commit ->io_pages and
34106+ * ->dirty_pages.
34107+ */
34108+ sync_page_list(inode);
34109+ break;
34110+ case UF_CONTAINER_TAILS:
34111+ /*
34112+ * NOTE-NIKITA probably we can be smarter for tails. For now
34113+ * just commit all existing atoms.
34114+ */
34115+ result = txnmgr_force_commit_all(inode->i_sb, 0);
34116+ break;
34117+ case UF_CONTAINER_EMPTY:
34118+ result = 0;
34119+ break;
34120+ case UF_CONTAINER_UNKNOWN:
34121+ default:
34122+ result = -EIO;
34123+ break;
34124+ }
34125+
34126+ /*
34127+ * commit current transaction: there can be captured nodes from
34128+ * find_file_state() and finish_conversion().
34129+ */
34130+ reiser4_txn_restart_current();
34131+ return result;
34132+}
34133+
34134+/**
34135+ * writepages_unix_file - writepages of struct address_space_operations
34136+ * @mapping: address space to capture pages of
34137+ * @wbc: writeback control
34138+ *
34139+ * This captures anonymous pages and anonymous jnodes. Anonymous pages are
34140+ * pages which are dirtied via mmapping. Anonymous jnodes are ones which were
34141+ * created by reiser4_writepage.
34142+ */
34143+int writepages_unix_file(struct address_space *mapping,
34144+ struct writeback_control *wbc)
34145+{
34146+ int result;
34147+ struct unix_file_info *uf_info;
34148+ pgoff_t pindex, jindex, nr_pages;
34149+ long to_capture;
34150+ struct inode *inode;
34151+
34152+ inode = mapping->host;
34153+ if (!has_anonymous_pages(inode)) {
34154+ result = 0;
34155+ goto end;
34156+ }
34157+ jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT;
34158+ result = 0;
34159+ nr_pages = size_in_pages(i_size_read(inode));
34160+
34161+ uf_info = unix_file_inode_data(inode);
34162+
34163+ do {
34164+ reiser4_context *ctx;
34165+
34166+ if (wbc->sync_mode != WB_SYNC_ALL)
34167+ to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
34168+ else
34169+ to_capture = CAPTURE_APAGE_BURST;
34170+
34171+ ctx = reiser4_init_context(inode->i_sb);
34172+ if (IS_ERR(ctx)) {
34173+ result = PTR_ERR(ctx);
34174+ break;
34175+ }
34176+ /* avoid recursive calls to ->sync_inodes */
34177+ ctx->nobalance = 1;
34178+ assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
34179+ assert("", LOCK_CNT_NIL(inode_sem_w));
34180+ assert("", LOCK_CNT_NIL(inode_sem_r));
34181+
34182+ reiser4_txn_restart_current();
34183+
34184+ /* we have to get nonexclusive access to the file */
34185+ if (get_current_context()->entd) {
34186+ /*
34187+ * use nonblocking version of nonexclusive_access to
34188+ * avoid deadlock which might look like the following:
34189+ * process P1 holds NEA on file F1 and called entd to
34190+ * reclaim some memory. Entd works for P1 and is going
34191+ * to capture pages of file F2. To do that entd has to
34192+ * get NEA to F2. F2 is held by process P2 which also
34193+ * called entd. But entd is serving P1 at the moment
34194+			 * and P2 has to wait. Process P3 is trying to get EA
34195+			 * to file F2. The existence of a pending EA request to
34196+			 * file F2 makes it impossible for entd to get NEA to
34197+			 * file F2. None of these processes can continue. Using
34198+			 * the nonblocking version of getting NEA is supposed
34199+			 * to avoid this deadlock.
34200+ */
34201+ if (try_to_get_nonexclusive_access(uf_info) == 0) {
34202+ result = RETERR(-EBUSY);
34203+ reiser4_exit_context(ctx);
34204+ break;
34205+ }
34206+ } else
34207+ get_nonexclusive_access(uf_info);
34208+
34209+ while (to_capture > 0) {
34210+ pgoff_t start;
34211+
34212+ assert("vs-1727", jindex <= pindex);
34213+ if (pindex == jindex) {
34214+ start = pindex;
34215+ result =
34216+ capture_anonymous_pages(inode->i_mapping,
34217+ &pindex,
34218+ to_capture);
34219+ if (result <= 0)
34220+ break;
34221+ to_capture -= result;
34222+ wbc->nr_to_write -= result;
34223+ if (start + result == pindex) {
34224+ jindex = pindex;
34225+ continue;
34226+ }
34227+ if (to_capture <= 0)
34228+ break;
34229+ }
34230+ /* deal with anonymous jnodes between jindex and pindex */
34231+ result =
34232+ capture_anonymous_jnodes(inode->i_mapping, &jindex,
34233+ pindex, to_capture);
34234+ if (result < 0)
34235+ break;
34236+ to_capture -= result;
34237+ get_current_context()->nr_captured += result;
34238+
34239+ if (jindex == (pgoff_t) - 1) {
34240+ assert("vs-1728", pindex == (pgoff_t) - 1);
34241+ break;
34242+ }
34243+ }
34244+ if (to_capture <= 0)
34245+			/* there may be more pages left */
34246+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
34247+
34248+ drop_nonexclusive_access(uf_info);
34249+ if (result < 0) {
34250+ /* error happened */
34251+ reiser4_exit_context(ctx);
34252+ return result;
34253+ }
34254+ if (wbc->sync_mode != WB_SYNC_ALL) {
34255+ reiser4_exit_context(ctx);
34256+ return 0;
34257+ }
34258+ result = commit_file_atoms(inode);
34259+ reiser4_exit_context(ctx);
34260+ if (pindex >= nr_pages && jindex == pindex)
34261+ break;
34262+ } while (1);
34263+
34264+ end:
34265+ if (is_in_reiser4_context()) {
34266+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
34267+ /*
34268+ * there are already pages to flush, flush them out, do
34269+ * not delay until end of reiser4_sync_inodes
34270+ */
34271+ reiser4_writeout(inode->i_sb, wbc);
34272+ get_current_context()->nr_captured = 0;
34273+ }
34274+ }
34275+ return result;
34276+}
34277+
34278+/**
34279+ * readpage_unix_file - readpage of struct address_space_operations
34280+ * @file: file to read from
34281+ * @page: page to fill with data
34282+ *
34283+ * Compose a key and search for item containing information about @page
34284+ * data. If item is found - its readpage method is called.
34285+ */
34286+int readpage_unix_file(struct file *file, struct page *page)
34287+{
34288+ reiser4_context *ctx;
34289+ int result;
34290+ struct inode *inode;
34291+ reiser4_key key;
34292+ item_plugin *iplug;
34293+ hint_t *hint;
34294+ lock_handle *lh;
34295+ coord_t *coord;
34296+
34297+ assert("vs-1062", PageLocked(page));
34298+ assert("vs-976", !PageUptodate(page));
34299+ assert("vs-1061", page->mapping && page->mapping->host);
34300+
34301+ if (page->mapping->host->i_size <= page_offset(page)) {
34302+ /* page is out of file */
34303+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
34304+ SetPageUptodate(page);
34305+ unlock_page(page);
34306+ return 0;
34307+ }
34308+
34309+ inode = page->mapping->host;
34310+ ctx = reiser4_init_context(inode->i_sb);
34311+ if (IS_ERR(ctx)) {
34312+ unlock_page(page);
34313+ return PTR_ERR(ctx);
34314+ }
34315+
34316+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34317+ if (hint == NULL) {
34318+ unlock_page(page);
34319+ reiser4_exit_context(ctx);
34320+ return RETERR(-ENOMEM);
34321+ }
34322+
34323+ result = load_file_hint(file, hint);
34324+ if (result) {
34325+ kfree(hint);
34326+ unlock_page(page);
34327+ reiser4_exit_context(ctx);
34328+ return result;
34329+ }
34330+ lh = &hint->lh;
34331+
34332+ /* get key of first byte of the page */
34333+ key_by_inode_and_offset_common(inode, page_offset(page), &key);
34334+
34335+ /* look for file metadata corresponding to first byte of page */
34336+ page_cache_get(page);
34337+ unlock_page(page);
34338+ result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
34339+ lock_page(page);
34340+ page_cache_release(page);
34341+
34342+ if (page->mapping == NULL) {
34343+ /*
34344+ * readpage allows truncate to run concurrently. Page was
34345+ * truncated while it was not locked
34346+ */
34347+ done_lh(lh);
34348+ kfree(hint);
34349+ unlock_page(page);
34350+ reiser4_txn_restart(ctx);
34351+ reiser4_exit_context(ctx);
34352+ return -EINVAL;
34353+ }
34354+
34355+ if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
34356+ if (result == CBK_COORD_FOUND &&
34357+ hint->ext_coord.coord.between != AT_UNIT)
34358+ /* file is truncated */
34359+ result = -EINVAL;
34360+ done_lh(lh);
34361+ kfree(hint);
34362+ unlock_page(page);
34363+ reiser4_txn_restart(ctx);
34364+ reiser4_exit_context(ctx);
34365+ return result;
34366+ }
34367+
34368+ /*
34369+ * item corresponding to page is found. It can not be removed because
34370+ * znode lock is held
34371+ */
34372+ if (PageUptodate(page)) {
34373+ done_lh(lh);
34374+ kfree(hint);
34375+ unlock_page(page);
34376+ reiser4_txn_restart(ctx);
34377+ reiser4_exit_context(ctx);
34378+ return 0;
34379+ }
34380+
34381+ coord = &hint->ext_coord.coord;
34382+ result = zload(coord->node);
34383+ if (result) {
34384+ done_lh(lh);
34385+ kfree(hint);
34386+ unlock_page(page);
34387+ reiser4_txn_restart(ctx);
34388+ reiser4_exit_context(ctx);
34389+ return result;
34390+ }
34391+
34392+ validate_extended_coord(&hint->ext_coord, page_offset(page));
34393+
34394+ if (!coord_is_existing_unit(coord)) {
34395+ /* this indicates corruption */
34396+ warning("vs-280",
34397+ "Looking for page %lu of file %llu (size %lli). "
34398+ "No file items found (%d). File is corrupted?\n",
34399+ page->index, (unsigned long long)get_inode_oid(inode),
34400+ inode->i_size, result);
34401+ zrelse(coord->node);
34402+ done_lh(lh);
34403+ kfree(hint);
34404+ unlock_page(page);
34405+ reiser4_txn_restart(ctx);
34406+ reiser4_exit_context(ctx);
34407+ return RETERR(-EIO);
34408+ }
34409+
34410+	/*
34411+	 * get the plugin of the found item
34412+	 */
34414+ iplug = item_plugin_by_coord(coord);
34415+ if (iplug->s.file.readpage)
34416+ result = iplug->s.file.readpage(coord, page);
34417+ else
34418+ result = RETERR(-EINVAL);
34419+
34420+ if (!result) {
34421+ set_key_offset(&key,
34422+ (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
34423+ /* FIXME should call reiser4_set_hint() */
34424+ reiser4_unset_hint(hint);
34425+ } else {
34426+ unlock_page(page);
34427+ reiser4_unset_hint(hint);
34428+ }
34429+ assert("vs-979",
34430+ ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
34431+ assert("vs-9791", ergo(result != 0, !PageLocked(page)));
34432+
34433+ zrelse(coord->node);
34434+ done_lh(lh);
34435+
34436+ save_file_hint(file, hint);
34437+ kfree(hint);
34438+
34439+ /*
34440+ * FIXME: explain why it is needed. HINT: page allocation in write can
34441+ * not be done when atom is not NULL because reiser4_writepage can not
34442+	 * kick entd and has to eflush
34443+ */
34444+ reiser4_txn_restart(ctx);
34445+ reiser4_exit_context(ctx);
34446+ return result;
34447+}
34448+
34449+struct uf_readpages_context {
34450+ lock_handle lh;
34451+ coord_t coord;
34452+};
34453+
34454+/* A callback function for readpages_unix_file/read_cache_pages.
34455+ * If the file is built of tails, then return error (-ENOENT).
34456+ *
34457+ * @data -- a pointer to uf_readpages_context object,
34458+ * to save the twig lock and the coord between
34459+ * read_cache_page iterations.
34460+ * @page -- page to start read.
34461+ */
34462+static int uf_readpages_filler(void * data, struct page * page)
34463+{
34464+ struct uf_readpages_context *rc = data;
34465+ jnode * node;
34466+ int ret = 0;
34467+ reiser4_extent *ext;
34468+ __u64 ext_index;
34469+ int cbk_done = 0;
34470+ struct address_space * mapping = page->mapping;
34471+
34472+ if (PageUptodate(page)) {
34473+ unlock_page(page);
34474+ return 0;
34475+ }
34476+ page_cache_get(page);
34477+
34478+ if (rc->lh.node == 0) {
34479+ /* no twig lock - have to do tree search. */
34480+ reiser4_key key;
34481+ repeat:
34482+ unlock_page(page);
34483+ key_by_inode_and_offset_common(
34484+ mapping->host, page_offset(page), &key);
34485+ ret = coord_by_key(
34486+ &get_super_private(mapping->host->i_sb)->tree,
34487+ &key, &rc->coord, &rc->lh,
34488+ ZNODE_READ_LOCK, FIND_EXACT,
34489+ TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL);
34490+ if (unlikely(ret))
34491+ goto exit;
34492+ lock_page(page);
34493+ if (PageUptodate(page))
34494+ goto unlock;
34495+ cbk_done = 1;
34496+ }
34497+ ret = zload(rc->coord.node);
34498+ if (unlikely(ret))
34499+ goto unlock;
34500+ if (!coord_is_existing_item(&rc->coord) ||
34501+ !item_is_extent(&rc->coord)) {
34502+ zrelse(rc->coord.node);
34503+ ret = RETERR(-EIO);
34504+ goto unlock;
34505+ }
34506+ ext = extent_by_coord(&rc->coord);
34507+ ext_index = extent_unit_index(&rc->coord);
34508+ if (page->index < ext_index ||
34509+ page->index >= ext_index + extent_get_width(ext)) {
34510+ /* the page index doesn't belong to the extent unit
34511+ which the coord points to - release the lock and
34512+ repeat with tree search. */
34513+ zrelse(rc->coord.node);
34514+ done_lh(&rc->lh);
34515+ /* we can be here after a CBK call only in case of
34516+ corruption of the tree or the tree lookup algorithm bug. */
34517+ if (unlikely(cbk_done)) {
34518+ ret = RETERR(-EIO);
34519+ goto unlock;
34520+ }
34521+ goto repeat;
34522+ }
34523+ node = jnode_of_page(page);
34524+ if (unlikely(IS_ERR(node))) {
34525+ zrelse(rc->coord.node);
34526+ ret = PTR_ERR(node);
34527+ goto unlock;
34528+ }
34529+ ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page);
34530+ jput(node);
34531+ zrelse(rc->coord.node);
34532+ if (likely(!ret))
34533+ goto exit;
34534+ unlock:
34535+ unlock_page(page);
34536+ exit:
34537+ page_cache_release(page);
34538+ return ret;
34539+}
34540+
34541+/**
34542+ * readpages_unix_file - called by the readahead code, starts reading for each
34543+ * page of given list of pages
34544+ */
34545+int readpages_unix_file(
34546+ struct file *file, struct address_space *mapping,
34547+ struct list_head *pages, unsigned nr_pages)
34548+{
34549+ reiser4_context *ctx;
34550+ struct uf_readpages_context rc;
34551+ int ret;
34552+
34553+ ctx = reiser4_init_context(mapping->host->i_sb);
34554+ if (IS_ERR(ctx)) {
34555+ put_pages_list(pages);
34556+ return PTR_ERR(ctx);
34557+ }
34558+ init_lh(&rc.lh);
34559+ ret = read_cache_pages(mapping, pages, uf_readpages_filler, &rc);
34560+ done_lh(&rc.lh);
34561+ context_set_commit_async(ctx);
34562+ /* close the transaction to protect further page allocation from deadlocks */
34563+ reiser4_txn_restart(ctx);
34564+ reiser4_exit_context(ctx);
34565+ return ret;
34566+}
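+
+/*
+ * Sketch of the read_cache_pages() contract relied upon here: the
+ * filler callback is invoked once per page, after the page has been
+ * locked and added to the page cache; it must either start I/O (the
+ * page is then unlocked at I/O completion) or unlock the page itself,
+ * and return 0 or a negative error:
+ *
+ *	static int filler(void *data, struct page *page);
+ *	ret = read_cache_pages(mapping, pages, filler, &rc);
+ *
+ * uf_readpages_filler() keeps the twig lock in rc->lh between calls,
+ * so pages covered by one extent need only a single tree lookup.
+ */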
34567+
34568+static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
34569+ loff_t count UNUSED_ARG)
34570+{
34571+	/* We should reserve one block, because the stat data
34572+	   item is updated */
34573+ assert("vs-1249",
34574+ inode_file_plugin(inode)->estimate.update ==
34575+ estimate_update_common);
34576+ return estimate_update_common(inode);
34577+}
34578+
34579+/* this is called with nonexclusive access obtained, file's container can not change */
34580+static ssize_t read_file(hint_t *hint, struct file *file, /* file to read from */
34581+ char __user *buf, /* address of user-space buffer */
34582+ size_t count, /* number of bytes to read */
34583+ loff_t *off)
34584+{
34585+ int result;
34586+ struct inode *inode;
34587+ flow_t flow;
34588+ int (*read_f) (struct file *, flow_t *, hint_t *);
34589+ coord_t *coord;
34590+ znode *loaded;
34591+
34592+ inode = file->f_dentry->d_inode;
34593+
34594+ /* build flow */
34595+ assert("vs-1250",
34596+ inode_file_plugin(inode)->flow_by_inode ==
34597+ flow_by_inode_unix_file);
34598+ result =
34599+ flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
34600+ *off, READ_OP, &flow);
34601+ if (unlikely(result))
34602+ return result;
34603+
34604+ /* get seal and coord sealed with it from reiser4 private data
34605+ of struct file. The coord will tell us where our last read
34606+ of this file finished, and the seal will help to determine
34607+ if that location is still valid.
34608+ */
34609+ coord = &hint->ext_coord.coord;
34610+ while (flow.length && result == 0) {
34611+ result =
34612+ find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
34613+ if (cbk_errored(result))
34614+ /* error happened */
34615+ break;
34616+
34617+ if (coord->between != AT_UNIT) {
34618+ /* there were no items corresponding to given offset */
34619+ done_lh(hint->ext_coord.lh);
34620+ break;
34621+ }
34622+
34623+ loaded = coord->node;
34624+ result = zload(loaded);
34625+ if (unlikely(result)) {
34626+ done_lh(hint->ext_coord.lh);
34627+ break;
34628+ }
34629+
34630+ if (hint->ext_coord.valid == 0)
34631+ validate_extended_coord(&hint->ext_coord,
34632+ get_key_offset(&flow.key));
34633+
34634+ assert("vs-4", hint->ext_coord.valid == 1);
34635+ assert("vs-33", hint->ext_coord.lh == &hint->lh);
34636+ /* call item's read method */
34637+ read_f = item_plugin_by_coord(coord)->s.file.read;
34638+ result = read_f(file, &flow, hint);
34639+ zrelse(loaded);
34640+ done_lh(hint->ext_coord.lh);
34641+ }
34642+
34643+ return (count - flow.length) ? (count - flow.length) : result;
34644+}
34645+
34646+static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*);
34647+
34648+/**
34649+ * read_unix_file - read of struct file_operations
34650+ * @file: file to read from
34651+ * @buf: address of user-space buffer
34652+ * @read_amount: number of bytes to read
34653+ * @off: position in file to read from
34654+ *
34655+ * This is implementation of vfs's read method of struct file_operations for
34656+ * unix file plugin.
34657+ */
34658+ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
34659+ loff_t *off)
34660+{
34661+ reiser4_context *ctx;
34662+ ssize_t result;
34663+ struct inode *inode;
34664+ struct unix_file_info *uf_info;
34665+
34666+ if (unlikely(read_amount == 0))
34667+ return 0;
34668+
34669+ assert("umka-072", file != NULL);
34670+ assert("umka-074", off != NULL);
34671+ inode = file->f_dentry->d_inode;
34672+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
34673+
34674+ ctx = reiser4_init_context(inode->i_sb);
34675+ if (IS_ERR(ctx))
34676+ return PTR_ERR(ctx);
34677+ uf_info = unix_file_inode_data(inode);
34678+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
34679+ get_exclusive_access(uf_info);
34680+ result = find_file_state(inode, uf_info);
34681+ if (unlikely(result != 0))
34682+ goto out;
34683+ } else
34684+ get_nonexclusive_access(uf_info);
34685+ result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount),
34686+ BA_CAN_COMMIT);
34687+ if (unlikely(result != 0))
34688+ goto out;
34689+ if (uf_info->container == UF_CONTAINER_EXTENTS){
34690+ result = do_sync_read(file, buf, read_amount, off);
34691+ } else if (uf_info->container == UF_CONTAINER_TAILS ||
34692+ reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) ||
34693+ reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
34694+ result = read_unix_file_container_tails(file, buf, read_amount, off);
34695+ } else {
34696+ assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY);
34697+ result = 0;
34698+ }
34699+out:
34700+ drop_access(uf_info);
34701+ context_set_commit_async(ctx);
34702+ reiser4_exit_context(ctx);
34703+ return result;
34704+}
34705+
34706+static ssize_t read_unix_file_container_tails(
34707+ struct file *file, char __user *buf, size_t read_amount, loff_t *off)
34708+{
34709+ int result;
34710+ struct inode *inode;
34711+ hint_t *hint;
34712+ struct unix_file_info *uf_info;
34713+	size_t count, left;
34714+	ssize_t read;
34714+ loff_t size;
34715+
34716+ assert("umka-072", file != NULL);
34717+ assert("umka-074", off != NULL);
34718+ inode = file->f_dentry->d_inode;
34719+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
34720+
34721+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34722+ if (hint == NULL)
34723+ return RETERR(-ENOMEM);
34724+
34725+ result = load_file_hint(file, hint);
34726+ if (result) {
34727+ kfree(hint);
34728+ return result;
34729+ }
34730+
34731+ left = read_amount;
34732+ count = 0;
34733+ uf_info = unix_file_inode_data(inode);
34734+ while (left > 0) {
34735+ reiser4_txn_restart_current();
34736+ size = i_size_read(inode);
34737+ if (*off >= size)
34738+ /* position to read from is past the end of file */
34739+ break;
34740+ if (*off + left > size)
34741+ left = size - *off;
34742+		/* fault in user page */
34743+ result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left);
34744+ if (result)
34745+ return RETERR(-EFAULT);
34746+
34747+ read = read_file(hint, file, buf,
34748+ left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
34749+ off);
34750+ if (read < 0) {
34751+ result = read;
34752+ break;
34753+ }
34754+ left -= read;
34755+ buf += read;
34756+
34757+ /* update position in a file */
34758+ *off += read;
34759+ /* total number of read bytes */
34760+ count += read;
34761+ }
34762+ done_lh(&hint->lh);
34763+ save_file_hint(file, hint);
34764+ kfree(hint);
34765+ if (count)
34766+ file_accessed(file);
34767+ /* return number of read bytes or error code if nothing is read */
34768+ return count ? count : result;
34769+}
34770+
34771+/* This function takes care of @file's pages. First of all it checks if the
34772+   filesystem is readonly and if so gets out. Otherwise, it throws out all
34773+   pages of the file if it was mapped for read, is going to be mapped for
34774+   write, and consists of tails. This is done in order not to maintain two
34775+   copies of the data (one in the page cache and one in the tails themselves)
34776+   when mapping files that consist of tails.
34777+
34778+   Tail2extent conversion is also performed here if it is allowed and the
34779+   file is going to be written or mapped for write. This function may be
34780+   called from write_unix_file() or mmap_unix_file(). */
34781+static int check_pages_unix_file(struct file *file, struct inode *inode)
34782+{
34783+ reiser4_invalidate_pages(inode->i_mapping, 0,
34784+ (inode->i_size + PAGE_CACHE_SIZE -
34785+ 1) >> PAGE_CACHE_SHIFT, 0);
34786+ return unpack(file, inode, 0 /* not forever */ );
34787+}
34788+
34789+/**
34790+ * mmap_unix_file - mmap of struct file_operations
34791+ * @file: file to mmap
34792+ * @vma:
34793+ *
34794+ * This is implementation of vfs's mmap method of struct file_operations for
34795+ * unix file plugin. It converts file to extent if necessary. Sets
34796+ * reiser4_inode's flag - REISER4_HAS_MMAP.
34797+ */
34798+int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
34799+{
34800+ reiser4_context *ctx;
34801+ int result;
34802+ struct inode *inode;
34803+ struct unix_file_info *uf_info;
34804+ reiser4_block_nr needed;
34805+
34806+ inode = file->f_dentry->d_inode;
34807+ ctx = reiser4_init_context(inode->i_sb);
34808+ if (IS_ERR(ctx))
34809+ return PTR_ERR(ctx);
34810+
34811+ uf_info = unix_file_inode_data(inode);
34812+
34813+ get_exclusive_access_careful(uf_info, inode);
34814+
34815+ if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
34816+ /*
34817+ * we need file built of extent items. If it is still built of
34818+ * tail items we have to convert it. Find what items the file
34819+ * is built of
34820+ */
34821+ result = find_file_state(inode, uf_info);
34822+ if (result != 0) {
34823+ drop_exclusive_access(uf_info);
34824+ reiser4_exit_context(ctx);
34825+ return result;
34826+ }
34827+
34828+ assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
34829+ uf_info->container == UF_CONTAINER_EXTENTS ||
34830+ uf_info->container == UF_CONTAINER_EMPTY));
34831+ if (uf_info->container == UF_CONTAINER_TAILS) {
34832+ /*
34833+ * invalidate all pages and convert file from tails to
34834+ * extents
34835+ */
34836+ result = check_pages_unix_file(file, inode);
34837+ if (result) {
34838+ drop_exclusive_access(uf_info);
34839+ reiser4_exit_context(ctx);
34840+ return result;
34841+ }
34842+ }
34843+ }
34844+
34845+ /*
34846+ * generic_file_mmap will do update_atime. Grab space for stat data
34847+ * update.
34848+ */
34849+ needed = inode_file_plugin(inode)->estimate.update(inode);
34850+ result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
34851+ if (result) {
34852+ drop_exclusive_access(uf_info);
34853+ reiser4_exit_context(ctx);
34854+ return result;
34855+ }
34856+
34857+ result = generic_file_mmap(file, vma);
34858+ if (result == 0) {
34859+ /* mark file as having mapping. */
34860+ reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
34861+ }
34862+
34863+ drop_exclusive_access(uf_info);
34864+ reiser4_exit_context(ctx);
34865+ return result;
34866+}
34867+
34868+/**
34869+ * find_first_item
34870+ * @inode:
34871+ *
34872+ * Finds file item which is responsible for first byte in the file.
34873+ */
34874+static int find_first_item(struct inode *inode)
34875+{
34876+ coord_t coord;
34877+ lock_handle lh;
34878+ reiser4_key key;
34879+ int result;
34880+
34881+ coord_init_zero(&coord);
34882+ init_lh(&lh);
34883+ inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
34884+ result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
34885+ inode);
34886+ if (result == CBK_COORD_FOUND) {
34887+ if (coord.between == AT_UNIT) {
34888+ result = zload(coord.node);
34889+ if (result == 0) {
34890+ result = item_id_by_coord(&coord);
34891+ zrelse(coord.node);
34892+ if (result != EXTENT_POINTER_ID &&
34893+ result != FORMATTING_ID)
34894+ result = RETERR(-EIO);
34895+ }
34896+ } else
34897+ result = RETERR(-EIO);
34898+ }
34899+ done_lh(&lh);
34900+ return result;
34901+}
34902+
34903+/**
34904+ * open_unix_file
34905+ * @inode:
34906+ * @file:
34907+ *
34908+ * If filesystem is not readonly - complete uncompleted tail conversion if
34909+ * there was one
34910+ */
34911+int open_unix_file(struct inode *inode, struct file *file)
34912+{
34913+ int result;
34914+ reiser4_context *ctx;
34915+ struct unix_file_info *uf_info;
34916+
34917+ if (IS_RDONLY(inode))
34918+ return 0;
34919+
34920+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))
34921+ return 0;
34922+
34923+ ctx = reiser4_init_context(inode->i_sb);
34924+ if (IS_ERR(ctx))
34925+ return PTR_ERR(ctx);
34926+
34927+ uf_info = unix_file_inode_data(inode);
34928+
34929+ get_exclusive_access_careful(uf_info, inode);
34930+
34931+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
34932+ /*
34933+		 * another process completed the conversion
34934+ */
34935+ drop_exclusive_access(uf_info);
34936+ reiser4_exit_context(ctx);
34937+ return 0;
34938+ }
34939+
34940+ /*
34941+	 * the file was left in a semi converted state after an unclean shutdown,
34942+	 * or another thread is doing the conversion and dropped exclusive access
34943+	 * while balancing dirty pages. Complete the conversion
34944+ */
34945+ result = find_first_item(inode);
34946+ if (result == EXTENT_POINTER_ID)
34947+ /*
34948+ * first item is extent, therefore there was incomplete
34949+ * tail2extent conversion. Complete it
34950+ */
34951+ result = tail2extent(unix_file_inode_data(inode));
34952+ else if (result == FORMATTING_ID)
34953+ /*
34954+ * first item is formatting item, therefore there was
34955+ * incomplete extent2tail conversion. Complete it
34956+ */
34957+ result = extent2tail(file, unix_file_inode_data(inode));
34958+ else
34959+ result = -EIO;
34960+
34961+ assert("vs-1712",
34962+ ergo(result == 0,
34963+ (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) &&
34964+ !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))));
34965+ drop_exclusive_access(uf_info);
34966+ reiser4_exit_context(ctx);
34967+ return result;
34968+}
34969+
34970+#define NEITHER_OBTAINED 0
34971+#define EA_OBTAINED 1
34972+#define NEA_OBTAINED 2
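+
+/*
+ * write_unix_file() below tracks which kind of access it currently
+ * holds via these values. Rough transitions (sketch):
+ *
+ *	NEITHER_OBTAINED --get_exclusive_access()-->    EA_OBTAINED
+ *	NEITHER_OBTAINED --get_nonexclusive_access()--> NEA_OBTAINED
+ *	NEA_OBTAINED     --tail conversion needed-->    EA_OBTAINED
+ *	EA/NEA_OBTAINED  --drop_access()-->             NEITHER_OBTAINED
+ */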
34973+
34974+static void drop_access(struct unix_file_info *uf_info)
34975+{
34976+ if (uf_info->exclusive_use)
34977+ drop_exclusive_access(uf_info);
34978+ else
34979+ drop_nonexclusive_access(uf_info);
34980+}
34981+
34982+#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
34983+ __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
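+
+/*
+ * Example (hypothetical call site): debug_wuf("left=%zu", left) would
+ * print something like
+ *
+ *	file.c: 2345: write_unix_file: left=4096
+ *
+ * with the file, line and function prefix supplied by the macro.
+ */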
34984+
34985+/**
34986+ * write_unix_file - write of struct file_operations
34987+ * @file: file to write to
34988+ * @buf: address of user-space buffer
34989+ * @count: number of bytes to write
34990+ * @pos: position in file to write to
34991+ *
34992+ * This is implementation of vfs's write method of struct file_operations for
34993+ * unix file plugin.
34994+ */
34995+ssize_t write_unix_file(struct file *file, const char __user *buf,
34996+ size_t count, loff_t *pos, int *conv)
34997+{
34998+ int result;
34999+ reiser4_context *ctx;
35000+ struct inode *inode;
35001+ struct unix_file_info *uf_info;
35002+ ssize_t written;
35003+ int try_free_space;
35004+ int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
35005+ size_t left;
35006+ ssize_t (*write_op)(struct file *, const char __user *, size_t,
35007+ loff_t *pos);
35008+ int ea;
35009+ loff_t new_size;
35010+
35011+ inode = file->f_dentry->d_inode;
35012+ ctx = reiser4_init_context(inode->i_sb);
35013+ if (IS_ERR(ctx))
35014+ return PTR_ERR(ctx);
35015+
35016+ mutex_lock(&inode->i_mutex);
35017+
35018+ assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
35019+ assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)));
35020+
35021+ /* check amount of bytes to write and writing position */
35022+ result = generic_write_checks(file, pos, &count, 0);
35023+ if (result) {
35024+ mutex_unlock(&inode->i_mutex);
35025+ context_set_commit_async(ctx);
35026+ reiser4_exit_context(ctx);
35027+ return result;
35028+ }
35029+
35030+ result = remove_suid(file->f_dentry);
35031+ if (result) {
35032+ mutex_unlock(&inode->i_mutex);
35033+ context_set_commit_async(ctx);
35034+ reiser4_exit_context(ctx);
35035+ return result;
35036+ }
35037+ /* remove_suid might create a transaction */
35038+ reiser4_txn_restart(ctx);
35039+
35040+ uf_info = unix_file_inode_data(inode);
35041+
35042+ current->backing_dev_info = inode->i_mapping->backing_dev_info;
35043+ written = 0;
35044+ try_free_space = 0;
35045+ left = count;
35046+ ea = NEITHER_OBTAINED;
35047+
35048+ new_size = i_size_read(inode);
35049+ if (*pos + count > new_size)
35050+ new_size = *pos + count;
35051+
35052+ while (left) {
35053+ if (left < to_write)
35054+ to_write = left;
35055+
35056+ if (uf_info->container == UF_CONTAINER_EMPTY) {
35057+ get_exclusive_access(uf_info);
35058+ ea = EA_OBTAINED;
35059+ if (uf_info->container != UF_CONTAINER_EMPTY) {
35060+ /* file is made not empty by another process */
35061+ drop_exclusive_access(uf_info);
35062+ ea = NEITHER_OBTAINED;
35063+ continue;
35064+ }
35065+ } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35066+ /*
35067+			 * get exclusive access directly just so as not to have
35068+			 * to re-obtain it if the file turns out to be empty
35069+ */
35070+ get_exclusive_access(uf_info);
35071+ ea = EA_OBTAINED;
35072+ result = find_file_state(inode, uf_info);
35073+ if (result) {
35074+ drop_exclusive_access(uf_info);
35075+ ea = NEITHER_OBTAINED;
35076+ break;
35077+ }
35078+ } else {
35079+ get_nonexclusive_access(uf_info);
35080+ ea = NEA_OBTAINED;
35081+ }
35082+
35083+		/* either EA or NEA has been obtained. Choose the item write method */
35084+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
35085+ /* file is built of extent items */
35086+ write_op = reiser4_write_extent;
35087+ } else if (uf_info->container == UF_CONTAINER_EMPTY) {
35088+ /* file is empty */
35089+ if (should_have_notail(uf_info, new_size))
35090+ write_op = reiser4_write_extent;
35091+ else
35092+ write_op = reiser4_write_tail;
35093+ } else {
35094+ /* file is built of tail items */
35095+ if (should_have_notail(uf_info, new_size)) {
35096+ if (ea == NEA_OBTAINED) {
35097+ drop_nonexclusive_access(uf_info);
35098+ get_exclusive_access(uf_info);
35099+ ea = EA_OBTAINED;
35100+ }
35101+ if (uf_info->container == UF_CONTAINER_TAILS) {
35102+				/*
35103+				 * if the file is being converted by another
35104+				 * process, wait until the conversion completes
35105+				 */
35106+ while (1) {
35107+ if (reiser4_inode_get_flag(inode,
35108+ REISER4_PART_IN_CONV)) {
35109+ drop_exclusive_access(uf_info);
35110+ schedule();
35111+ get_exclusive_access(uf_info);
35112+ continue;
35113+ }
35114+ break;
35115+ }
35116+ if (uf_info->container == UF_CONTAINER_TAILS) {
35117+ result = tail2extent(uf_info);
35118+ if (result)
35119+ break;
35120+ }
35121+ }
35122+ drop_exclusive_access(uf_info);
35123+ ea = NEITHER_OBTAINED;
35124+ continue;
35125+ }
35126+ write_op = reiser4_write_tail;
35127+ }
35128+
35129+ written = write_op(file, buf, to_write, pos);
35130+ if (written == -ENOSPC && try_free_space) {
35131+ drop_access(uf_info);
35132+ txnmgr_force_commit_all(inode->i_sb, 0);
35133+ try_free_space = 0;
35134+ continue;
35135+ }
35136+ if (written < 0) {
35137+ drop_access(uf_info);
35138+ result = written;
35139+ break;
35140+ }
35141+		/* something was written */
35142+ if (uf_info->container == UF_CONTAINER_EMPTY) {
35143+ assert("", ea == EA_OBTAINED);
35144+ uf_info->container =
35145+ (write_op == reiser4_write_extent) ?
35146+ UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
35147+ } else {
35148+ assert("", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
35149+ write_op == reiser4_write_extent));
35150+ assert("", ergo(uf_info->container == UF_CONTAINER_TAILS,
35151+ write_op == reiser4_write_tail));
35152+ }
35153+ if (*pos + written > inode->i_size)
35154+ INODE_SET_FIELD(inode, i_size, *pos + written);
35155+ file_update_time(file);
35156+ result = reiser4_update_sd(inode);
35157+ if (result) {
35158+ mutex_unlock(&inode->i_mutex);
35159+ current->backing_dev_info = NULL;
35160+ drop_access(uf_info);
35161+ context_set_commit_async(ctx);
35162+ reiser4_exit_context(ctx);
35163+ return result;
35164+ }
35165+ drop_access(uf_info);
35166+ ea = NEITHER_OBTAINED;
35167+ reiser4_txn_restart(ctx);
35168+ current->journal_info = NULL;
35169+		/*
35170+		 * tell the VM how many pages were dirtied. Arguably, pages
35171+		 * that were already dirty should not be counted
35172+		 */
35173+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
35174+ (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
35175+ current->journal_info = ctx;
35176+
35177+ left -= written;
35178+ buf += written;
35179+ *pos += written;
35180+ }
35181+
35182+ mutex_unlock(&inode->i_mutex);
35183+
35184+ if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
35185+ reiser4_txn_restart_current();
35186+ grab_space_enable();
35187+ result = reiser4_sync_file_common(file, file->f_dentry,
35188+ 0 /* data and stat data */);
35189+ if (result)
35190+ warning("reiser4-7", "failed to sync file %llu",
35191+ (unsigned long long)get_inode_oid(inode));
35192+ }
35193+
35194+ current->backing_dev_info = NULL;
35195+
35196+ reiser4_exit_context(ctx);
35197+
35198+	/*
35199+	 * return the number of bytes written, or an error code if nothing
35200+	 * was written. Note that this does not work correctly when
35201+	 * sync_unix_file returns an error
35202+	 */
35203+ return (count - left) ? (count - left) : result;
35204+}
35205+
35206+/**
35207+ * release_unix_file - release of struct file_operations
35208+ * @inode: inode of released file
35209+ * @file: file to release
35210+ *
35211+ * Implementation of the release method of struct file_operations for the unix
35212+ * file plugin. If the last reference to the inode is released, convert all
35213+ * extent items into tail items if necessary. Frees reiser4-specific file data.
35214+ */
35215+int release_unix_file(struct inode *inode, struct file *file)
35216+{
35217+ reiser4_context *ctx;
35218+ struct unix_file_info *uf_info;
35219+ int result;
35220+ int in_reiser4;
35221+
35222+ in_reiser4 = is_in_reiser4_context();
35223+
35224+ ctx = reiser4_init_context(inode->i_sb);
35225+ if (IS_ERR(ctx))
35226+ return PTR_ERR(ctx);
35227+
35228+ result = 0;
35229+ if (in_reiser4 == 0) {
35230+ uf_info = unix_file_inode_data(inode);
35231+
35232+ get_exclusive_access_careful(uf_info, inode);
35233+ if (atomic_read(&file->f_dentry->d_count) == 1 &&
35234+ uf_info->container == UF_CONTAINER_EXTENTS &&
35235+ !should_have_notail(uf_info, inode->i_size) &&
35236+ !rofs_inode(inode)) {
35237+ result = extent2tail(file, uf_info);
35238+ if (result != 0) {
35239+ warning("nikita-3233",
35240+ "Failed (%d) to convert in %s (%llu)",
35241+ result, __FUNCTION__,
35242+ (unsigned long long)
35243+ get_inode_oid(inode));
35244+ }
35245+ }
35246+ drop_exclusive_access(uf_info);
35247+ } else {
35248+		/*
35249+		   we are already within a reiser4 context. How is that
35250+		   possible? Simple:
35251+
35252+ (gdb) bt
35253+ #0 get_exclusive_access ()
35254+ #2 0xc01e56d3 in release_unix_file ()
35255+ #3 0xc01c3643 in reiser4_release ()
35256+ #4 0xc014cae0 in __fput ()
35257+ #5 0xc013ffc3 in remove_vm_struct ()
35258+ #6 0xc0141786 in exit_mmap ()
35259+ #7 0xc0118480 in mmput ()
35260+ #8 0xc0133205 in oom_kill ()
35261+ #9 0xc01332d1 in out_of_memory ()
35262+ #10 0xc013bc1d in try_to_free_pages ()
35263+ #11 0xc013427b in __alloc_pages ()
35264+ #12 0xc013f058 in do_anonymous_page ()
35265+ #13 0xc013f19d in do_no_page ()
35266+ #14 0xc013f60e in handle_mm_fault ()
35267+ #15 0xc01131e5 in do_page_fault ()
35268+ #16 0xc0104935 in error_code ()
35269+ #17 0xc025c0c6 in __copy_to_user_ll ()
35270+ #18 0xc01d496f in reiser4_read_tail ()
35271+ #19 0xc01e4def in read_unix_file ()
35272+ #20 0xc01c3504 in reiser4_read ()
35273+ #21 0xc014bd4f in vfs_read ()
35274+ #22 0xc014bf66 in sys_read ()
35275+ */
35276+ warning("vs-44", "out of memory?");
35277+ }
35278+
35279+ reiser4_free_file_fsdata(file);
35280+
35281+ reiser4_exit_context(ctx);
35282+ return result;
35283+}
35284+
35285+static void set_file_notail(struct inode *inode)
35286+{
35287+ reiser4_inode *state;
35288+ formatting_plugin *tplug;
35289+
35290+ state = reiser4_inode_data(inode);
35291+ tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
35292+ force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
35293+}
35294+
35295+/* if file is built of tails - convert it to extents */
35296+static int unpack(struct file *filp, struct inode *inode, int forever)
35297+{
35298+ int result = 0;
35299+ struct unix_file_info *uf_info;
35300+
35301+ uf_info = unix_file_inode_data(inode);
35302+ assert("vs-1628", ea_obtained(uf_info));
35303+
35304+ result = find_file_state(inode, uf_info);
35305+ if (result)
35306+ return result;
35307+ assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
35308+
35309+ if (uf_info->container == UF_CONTAINER_TAILS) {
35310+		/*
35311+		 * if the file is being converted by another process, wait
35312+		 * until the conversion completes
35313+		 */
35314+ while (1) {
35315+ if (reiser4_inode_get_flag(inode,
35316+ REISER4_PART_IN_CONV)) {
35317+ drop_exclusive_access(uf_info);
35318+ schedule();
35319+ get_exclusive_access(uf_info);
35320+ continue;
35321+ }
35322+ break;
35323+ }
35324+ if (uf_info->container == UF_CONTAINER_TAILS) {
35325+ result = tail2extent(uf_info);
35326+ if (result)
35327+ return result;
35328+ }
35329+ }
35330+ if (forever) {
35331+		/* save the new formatting plugin in the stat data */
35332+ __u64 tograb;
35333+
35334+ set_file_notail(inode);
35335+
35336+ grab_space_enable();
35337+ tograb = inode_file_plugin(inode)->estimate.update(inode);
35338+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
35339+ result = reiser4_update_sd(inode);
35340+ }
35341+
35342+ return result;
35343+}
35344+
35345+/* implementation of the VFS ioctl method of struct file_operations for the
35346+   unix file plugin
35347+*/
35348+int
35349+ioctl_unix_file(struct inode *inode, struct file *filp,
35350+ unsigned int cmd, unsigned long arg UNUSED_ARG)
35351+{
35352+ reiser4_context *ctx;
35353+ int result;
35354+
35355+ ctx = reiser4_init_context(inode->i_sb);
35356+ if (IS_ERR(ctx))
35357+ return PTR_ERR(ctx);
35358+
35359+ switch (cmd) {
35360+ case REISER4_IOC_UNPACK:
35361+ get_exclusive_access(unix_file_inode_data(inode));
35362+ result = unpack(filp, inode, 1 /* forever */ );
35363+ drop_exclusive_access(unix_file_inode_data(inode));
35364+ break;
35365+
35366+ default:
35367+ result = RETERR(-ENOSYS);
35368+ break;
35369+ }
35370+ reiser4_exit_context(ctx);
35371+ return result;
35372+}
35373+
35374+/* implementation of the VFS bmap method of struct address_space_operations
35375+   for the unix file plugin
35376+*/
35377+sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
35378+{
35379+ reiser4_context *ctx;
35380+ sector_t result;
35381+ reiser4_key key;
35382+ coord_t coord;
35383+ lock_handle lh;
35384+ struct inode *inode;
35385+ item_plugin *iplug;
35386+ sector_t block;
35387+
35388+ inode = mapping->host;
35389+
35390+ ctx = reiser4_init_context(inode->i_sb);
35391+ if (IS_ERR(ctx))
35392+ return PTR_ERR(ctx);
35393+ key_by_inode_and_offset_common(inode,
35394+ (loff_t) lblock * current_blocksize,
35395+ &key);
35396+
35397+ init_lh(&lh);
35398+ result =
35399+ find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
35400+ if (cbk_errored(result)) {
35401+ done_lh(&lh);
35402+ reiser4_exit_context(ctx);
35403+ return result;
35404+ }
35405+
35406+ result = zload(coord.node);
35407+ if (result) {
35408+ done_lh(&lh);
35409+ reiser4_exit_context(ctx);
35410+ return result;
35411+ }
35412+
35413+ iplug = item_plugin_by_coord(&coord);
35414+ if (iplug->s.file.get_block) {
35415+ result = iplug->s.file.get_block(&coord, lblock, &block);
35416+ if (result == 0)
35417+ result = block;
35418+ } else
35419+ result = RETERR(-EINVAL);
35420+
35421+ zrelse(coord.node);
35422+ done_lh(&lh);
35423+ reiser4_exit_context(ctx);
35424+ return result;
35425+}
35426+
35427+/**
35428+ * flow_by_inode_unix_file - initialize structure flow
35429+ * @inode: inode of file for which read or write is about to be performed
35430+ * @buf: buffer to perform read to or write from
35431+ * @user: flag showing whether @buf is user space or kernel space
35432+ * @size: size of buffer @buf
35433+ * @off: start offset for read or write
35434+ * @op: READ or WRITE
35435+ * @flow:
35436+ *
35437+ * Initializes fields of @flow: key, size of data, i/o mode (read or write).
35438+ */
35439+int flow_by_inode_unix_file(struct inode *inode,
35440+ const char __user *buf, int user,
35441+ loff_t size, loff_t off,
35442+ rw_op op, flow_t *flow)
35443+{
35444+ assert("nikita-1100", inode != NULL);
35445+
35446+ flow->length = size;
35447+ memcpy(&flow->data, &buf, sizeof(buf));
35448+ flow->user = user;
35449+ flow->op = op;
35450+ assert("nikita-1931", inode_file_plugin(inode) != NULL);
35451+ assert("nikita-1932",
35452+ inode_file_plugin(inode)->key_by_inode ==
35453+ key_by_inode_and_offset_common);
35454+ /* calculate key of write position and insert it into flow->key */
35455+ return key_by_inode_and_offset_common(inode, off, &flow->key);
35456+}
35457+
35458+/* plugin->u.file.set_plug_in_sd = NULL
35459+ plugin->u.file.set_plug_in_inode = NULL
35460+ plugin->u.file.create_blank_sd = NULL */
35461+/* plugin->u.file.delete */
35462+/*
35463+ plugin->u.file.add_link = reiser4_add_link_common
35464+ plugin->u.file.rem_link = NULL */
35465+
35466+/* plugin->u.file.owns_item
35467+ this is common_file_owns_item with assertion */
35468+/* Audited by: green(2002.06.15) */
35469+int
35470+owns_item_unix_file(const struct inode *inode /* object to check against */ ,
35471+ const coord_t * coord /* coord to check */ )
35472+{
35473+ int result;
35474+
35475+ result = owns_item_common(inode, coord);
35476+ if (!result)
35477+ return 0;
35478+ if (!plugin_of_group(item_plugin_by_coord(coord),
35479+ UNIX_FILE_METADATA_ITEM_TYPE))
35480+ return 0;
35481+ assert("vs-547",
35482+ item_id_by_coord(coord) == EXTENT_POINTER_ID ||
35483+ item_id_by_coord(coord) == FORMATTING_ID);
35484+ return 1;
35485+}
35486+
35487+static int setattr_truncate(struct inode *inode, struct iattr *attr)
35488+{
35489+ int result;
35490+ int s_result;
35491+ loff_t old_size;
35492+ reiser4_tree *tree;
35493+
35494+ inode_check_scale(inode, inode->i_size, attr->ia_size);
35495+
35496+ old_size = inode->i_size;
35497+ tree = reiser4_tree_by_inode(inode);
35498+
35499+ result = safe_link_grab(tree, BA_CAN_COMMIT);
35500+ if (result == 0)
35501+ result = safe_link_add(inode, SAFE_TRUNCATE);
35502+ if (result == 0)
35503+ result = truncate_file_body(inode, attr);
35504+ if (result)
35505+ warning("vs-1588", "truncate_file failed: oid %lli, "
35506+ "old size %lld, new size %lld, retval %d",
35507+ (unsigned long long)get_inode_oid(inode),
35508+ old_size, attr->ia_size, result);
35509+
35510+ s_result = safe_link_grab(tree, BA_CAN_COMMIT);
35511+ if (s_result == 0)
35512+ s_result =
35513+ safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
35514+ if (s_result != 0) {
35515+ warning("nikita-3417", "Cannot kill safelink %lli: %i",
35516+ (unsigned long long)get_inode_oid(inode), s_result);
35517+ }
35518+ safe_link_release(tree);
35519+ return result;
35520+}
35521+
35522+/* plugin->u.file.setattr method */
35523+/* This calls inode_setattr and, if a truncate is in effect, also takes
35524+   exclusive inode access to avoid races */
35525+int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */
35526+ struct iattr *attr /* change description */ )
35527+{
35528+ int result;
35529+
35530+ if (attr->ia_valid & ATTR_SIZE) {
35531+ reiser4_context *ctx;
35532+ struct unix_file_info *uf_info;
35533+
35534+ /* truncate does reservation itself and requires exclusive
35535+ access obtained */
35536+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
35537+ if (IS_ERR(ctx))
35538+ return PTR_ERR(ctx);
35539+
35540+ uf_info = unix_file_inode_data(dentry->d_inode);
35541+ get_exclusive_access_careful(uf_info, dentry->d_inode);
35542+ result = setattr_truncate(dentry->d_inode, attr);
35543+ drop_exclusive_access(uf_info);
35544+ context_set_commit_async(ctx);
35545+ reiser4_exit_context(ctx);
35546+ } else
35547+ result = reiser4_setattr_common(dentry, attr);
35548+
35549+ return result;
35550+}
35551+
35552+/* plugin->u.file.init_inode_data */
35553+void
35554+init_inode_data_unix_file(struct inode *inode,
35555+ reiser4_object_create_data * crd, int create)
35556+{
35557+ struct unix_file_info *data;
35558+
35559+ data = unix_file_inode_data(inode);
35560+ data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
35561+ init_rwsem(&data->latch);
35562+ data->tplug = inode_formatting_plugin(inode);
35563+ data->exclusive_use = 0;
35564+
35565+#if REISER4_DEBUG
35566+ data->ea_owner = NULL;
35567+ atomic_set(&data->nr_neas, 0);
35568+#endif
35569+ init_inode_ordering(inode, crd, create);
35570+}
35571+
35572+/**
35573+ * delete_unix_file - delete_object of file_plugin
35574+ * @inode: inode to be deleted
35575+ *
35576+ * Truncates file to length 0, removes stat data and safe link.
35577+ */
35578+int delete_object_unix_file(struct inode *inode)
35579+{
35580+ struct unix_file_info *uf_info;
35581+ int result;
35582+
35583+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
35584+ return 0;
35585+
35586+	/* truncate the file body first */
35587+ uf_info = unix_file_inode_data(inode);
35588+ get_exclusive_access(uf_info);
35589+ result = shorten_file(inode, 0 /* size */ );
35590+ drop_exclusive_access(uf_info);
35591+
35592+ if (result)
35593+ warning("", "failed to truncate file (%llu) on removal: %d",
35594+ get_inode_oid(inode), result);
35595+
35596+ /* remove stat data and safe link */
35597+ return reiser4_delete_object_common(inode);
35598+}
35599+
35600+int
35601+prepare_write_unix_file(struct file *file, struct page *page,
35602+ unsigned from, unsigned to)
35603+{
35604+ reiser4_context *ctx;
35605+ struct unix_file_info *uf_info;
35606+ int ret;
35607+
35608+ ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
35609+ if (IS_ERR(ctx))
35610+ return PTR_ERR(ctx);
35611+
35612+ uf_info = unix_file_inode_data(file->f_dentry->d_inode);
35613+ get_exclusive_access(uf_info);
35614+ ret = find_file_state(file->f_dentry->d_inode, uf_info);
35615+ if (ret == 0) {
35616+ if (uf_info->container == UF_CONTAINER_TAILS)
35617+ ret = -EINVAL;
35618+ else
35619+ ret = do_prepare_write(file, page, from, to);
35620+ }
35621+ drop_exclusive_access(uf_info);
35622+
35623+ /* don't commit transaction under inode semaphore */
35624+ context_set_commit_async(ctx);
35625+ reiser4_exit_context(ctx);
35626+ return ret;
35627+}
35628+
35629+/*
35630+ * Local variables:
35631+ * c-indentation-style: "K&R"
35632+ * mode-name: "LC"
35633+ * c-basic-offset: 8
35634+ * tab-width: 8
35635+ * fill-column: 79
35636+ * scroll-step: 1
35637+ * End:
35638+ */
35639diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/file_conversion.c linux-2.6.23/fs/reiser4/plugin/file/file_conversion.c
35640--- linux-2.6.23.orig/fs/reiser4/plugin/file/file_conversion.c 1970-01-01 03:00:00.000000000 +0300
35641+++ linux-2.6.23/fs/reiser4/plugin/file/file_conversion.c 2007-12-04 16:49:30.000000000 +0300
35642@@ -0,0 +1,659 @@
35643+/* Copyright 2001, 2002, 2003 by Hans Reiser,
35644+ licensing governed by reiser4/README */
35645+
35646+/* *
35647+ * This file contains a cryptcompress->unix_file converter and an O(1)
35648+ * heuristic for assigning the most reasonable plugin to manage a regular
35649+ * file. Note that we do not perform back conversion, for compatibility
35650+ * reasons (see http://dev.namesys.com/Version4.X.Y for details).
35651+ *
35652+ * The currently used heuristic is very simple: if the first complete logical
35653+ * cluster (64K by default) of a file is incompressible, we decide that the
35654+ * whole file is incompressible (*). When creating a file the conversion
35655+ * is enabled by default via installing a special "permitting" compression
35656+ * mode plugin (**) (CONVX_COMPRESSION_MODE_ID, see
35657+ * plugin/compress/compress_mode.c for details).
35658+ *
35659+ * The conversion is accompanied by rebuilding the disk structures of a file,
35660+ * so it is important to protect them from being accessed by other plugins
35661+ * which do not expect them to be in such an inconsistent state. To achieve
35662+ * this we serialize readers and writers of the pset: writers are the
35663+ * processes which can change it for conversion purposes, all others are
35664+ * readers. Serialization is performed via a per-inode rw-semaphore
35665+ * (conv_sem); a minimal sketch of the scheme follows below.
35666+ *
35667+ * (*) This heuristic can easily be changed as soon as we have a better one.
35668+ * (**) This solution allows the enable/disable state to be kept on disk.
35669+ */
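+
+/*
+ * A minimal sketch of the serialization scheme described above (not part
+ * of the original code): readers of the pset take conv_sem shared, the
+ * converting writer takes it exclusively. Only the rw-semaphore calls
+ * are real kernel API; the function names are illustrative.
+ */
+#if 0	/* illustrative only */
+static void pset_reader(struct rw_semaphore *conv_sem)
+{
+	down_read(conv_sem);	/* pset cannot change under us */
+	/* ... call the method of the currently installed file plugin ... */
+	up_read(conv_sem);
+}
+
+static void pset_converting_writer(struct rw_semaphore *conv_sem)
+{
+	down_write(conv_sem);	/* exclude all readers */
+	/* ... rebuild disk structures, switch the file plugin ... */
+	up_write(conv_sem);
+}
+#endif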
35670+
35671+#include "../../inode.h"
35672+#include "../cluster.h"
35673+#include "file.h"
35674+
35675+#define conversion_enabled(inode) \
35676+ (inode_compression_mode_plugin(inode) == \
35677+ compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID))
35678+
35679+/**
35680+ * The sections protected here (readers and writers of @pset) are not
35681+ * permanently critical: a cryptcompress file can be converted only if
35682+ * the conversion is enabled (see the macro above), and back conversion
35683+ * is never performed. The following helper macro is a sanity check to
35684+ * decide whether we need the protection (locks always add overhead).
35685+ */
35686+#define should_protect(inode) \
35687+ (inode_file_plugin(inode) == \
35688+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \
35689+ conversion_enabled(inode))
35690+/**
35691+ * We'll speak about "passive" protection for readers and "active"
35692+ * protection for writers. All methods with active or passive protection
35693+ * have the suffix "careful".
35694+ */
35695+/* Macro for passive protection.
35696+ method_foo contains only readers */
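+/*
+ * Note the re-check after down_read() below: if the conversion completed
+ * while we were waiting for the semaphore, protection is no longer needed
+ * (there is no back conversion), so the lock is dropped immediately.
+ */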
35697+#define PROT_PASSIVE(type, method, args) \
35698+({ \
35699+ type _result; \
35700+ struct rw_semaphore * guard = \
35701+ &reiser4_inode_data(inode)->conv_sem; \
35702+ \
35703+ if (should_protect(inode)) { \
35704+ down_read(guard); \
35705+ if (!should_protect(inode)) \
35706+ up_read(guard); \
35707+ } \
35708+ if (inode_file_plugin(inode) == \
35709+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
35710+ _result = method ## _unix_file args; \
35711+ else \
35712+ _result = method ## _cryptcompress args; \
35713+ if (should_protect(inode)) \
35714+ up_read(guard); \
35715+ _result; \
35716+})
35717+
35718+#define PROT_PASSIVE_VOID(method, args) \
35719+({ \
35720+ struct rw_semaphore * guard = \
35721+ &reiser4_inode_data(inode)->conv_sem; \
35722+ \
35723+ if (should_protect(inode)) { \
35724+ down_read(guard); \
35725+ if (!should_protect(inode)) \
35726+ up_read(guard); \
35727+ } \
35728+ if (inode_file_plugin(inode) == \
35729+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
35730+ method ## _unix_file args; \
35731+ else \
35732+ method ## _cryptcompress args; \
35733+ if (should_protect(inode)) \
35734+ up_read(guard); \
35735+})
35736+
35737+/**
35738+ * Macro for active protection.
35739+ * active_expr contains writers of pset;
35740+ * NOTE: after evaluating active_expr conversion should be disabled.
35741+ */
35742+#define PROT_ACTIVE(type, method, args, active_expr) \
35743+({ \
35744+ type _result = 0; \
35745+ struct rw_semaphore * guard = \
35746+ &reiser4_inode_data(inode)->conv_sem; \
35747+ reiser4_context * ctx = reiser4_init_context(inode->i_sb); \
35748+ if (IS_ERR(ctx)) \
35749+ return PTR_ERR(ctx); \
35750+ \
35751+ if (should_protect(inode)) { \
35752+ down_write(guard); \
35753+ if (should_protect(inode)) \
35754+ _result = active_expr; \
35755+ up_write(guard); \
35756+ } \
35757+ if (_result == 0) { \
35758+ if (inode_file_plugin(inode) == \
35759+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
35760+ _result = method ## _unix_file args; \
35761+ else \
35762+ _result = method ## _cryptcompress args; \
35763+ } \
35764+ reiser4_exit_context(ctx); \
35765+ _result; \
35766+})
35767+
35768+/* Pass management to the unix-file plugin with "notail" policy */
35769+static int __cryptcompress2unixfile(struct file *file, struct inode * inode)
35770+{
35771+ int result;
35772+ reiser4_inode *info;
35773+ struct unix_file_info * uf;
35774+ info = reiser4_inode_data(inode);
35775+
35776+ result = aset_set_unsafe(&info->pset,
35777+ PSET_FILE,
35778+ (reiser4_plugin *)
35779+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
35780+ if (result)
35781+ return result;
35782+ result = aset_set_unsafe(&info->pset,
35783+ PSET_FORMATTING,
35784+ (reiser4_plugin *)
35785+ formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID));
35786+ if (result)
35787+ return result;
35788+ /* get rid of non-standard plugins */
35789+ info->plugin_mask &= ~cryptcompress_mask;
35790+ /* get rid of plugin stat-data extension */
35791+ info->extmask &= ~(1 << PLUGIN_STAT);
35792+
35793+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
35794+
35795+ /* FIXME use init_inode_data_unix_file() instead,
35796+	   but avoid init_inode_ordering() */
35797+ /* Init unix-file specific part of inode */
35798+ uf = unix_file_inode_data(inode);
35799+ uf->container = UF_CONTAINER_UNKNOWN;
35800+ init_rwsem(&uf->latch);
35801+ uf->tplug = inode_formatting_plugin(inode);
35802+ uf->exclusive_use = 0;
35803+#if REISER4_DEBUG
35804+ uf->ea_owner = NULL;
35805+ atomic_set(&uf->nr_neas, 0);
35806+#endif
35807+	/**
35808+	 * We were careful to keep file_ops, inode_ops and as_ops
35809+	 * invariant across plugin conversion, so there is no need
35810+	 * to update the ones that the VFS has already installed
35811+	 * for this inode.
35812+	 */
35813+ return 0;
35814+}
35815+
35816+#if REISER4_DEBUG
35817+static int disabled_conversion_inode_ok(struct inode * inode)
35818+{
35819+ __u64 extmask = reiser4_inode_data(inode)->extmask;
35820+ __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask;
35821+
35822+ return ((extmask & (1 << LIGHT_WEIGHT_STAT)) &&
35823+ (extmask & (1 << UNIX_STAT)) &&
35824+ (extmask & (1 << LARGE_TIMES_STAT)) &&
35825+ (extmask & (1 << PLUGIN_STAT)) &&
35826+ (plugin_mask & (1 << PSET_COMPRESSION_MODE)));
35827+}
35828+#endif
35829+
35830+/* Assign another mode that will control
35831+ compression at flush time only */
35832+static int disable_conversion_no_update_sd(struct inode * inode)
35833+{
35834+ int result;
35835+ result =
35836+ force_plugin_pset(inode,
35837+ PSET_COMPRESSION_MODE,
35838+ (reiser4_plugin *)compression_mode_plugin_by_id
35839+ (LATTD_COMPRESSION_MODE_ID));
35840+ assert("edward-1500",
35841+ ergo(!result, disabled_conversion_inode_ok(inode)));
35842+ return result;
35843+}
35844+
35845+/* Disable future attempts to check/convert. This function is called by
35846+ conversion hooks. */
35847+static int disable_conversion(struct inode * inode)
35848+{
35849+ return disable_conversion_no_update_sd(inode);
35850+}
35851+
35852+static int check_position(struct inode * inode,
35853+ loff_t pos /* position in the file to write from */,
35854+ struct cluster_handle * clust,
35855+ int * check_compress)
35856+{
35857+ assert("edward-1505", conversion_enabled(inode));
35858+	/*
35859+	 * if the file size is more than the cluster size, the compressible
35860+	 * status must already have been figured out (i.e. compression was
35861+	 * disabled, or the file plugin was converted to unix_file)
35862+	 */
35863+ assert("edward-1506", inode->i_size <= inode_cluster_size(inode));
35864+
35865+ if (pos > inode->i_size)
35866+ /* first logical cluster will contain a (partial) hole */
35867+ return disable_conversion(inode);
35868+ if (pos < inode_cluster_size(inode))
35869+ /* writing to the first logical cluster */
35870+ return 0;
35871+ /*
35872+ * here we have:
35873+ * cluster_size <= pos <= i_size <= cluster_size,
35874+ * and, hence, pos == i_size == cluster_size
35875+ */
35876+ assert("edward-1498",
35877+ pos == inode->i_size &&
35878+ pos == inode_cluster_size(inode));
35879+
35880+ *check_compress = 1;
35881+ return 0;
35882+}
35883+
35884+static void start_check_compressibility(struct inode * inode,
35885+ struct cluster_handle * clust,
35886+ hint_t * hint)
35887+{
35888+ assert("edward-1507", clust->index == 1);
35889+ assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc));
35890+ assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ);
35891+
35892+ hint_init_zero(hint);
35893+ clust->hint = hint;
35894+	clust->index--;
35895+ clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
35896+
35897+ /* first logical cluster (of index #0) must be complete */
35898+ assert("edward-1510", lbytes(clust->index, inode) ==
35899+ inode_cluster_size(inode));
35900+}
35901+
35902+static void finish_check_compressibility(struct inode * inode,
35903+ struct cluster_handle * clust,
35904+ hint_t * hint)
35905+{
35906+ reiser4_unset_hint(clust->hint);
35907+ clust->hint = hint;
35908+	clust->index++;
35909+}
35910+
35911+#if REISER4_DEBUG
35912+static int prepped_dclust_ok(hint_t * hint)
35913+{
35914+ reiser4_key key;
35915+ coord_t * coord = &hint->ext_coord.coord;
35916+
35917+ item_key_by_coord(coord, &key);
35918+ return (item_id_by_coord(coord) == CTAIL_ID &&
35919+ !coord_is_unprepped_ctail(coord) &&
35920+ (get_key_offset(&key) + nr_units_ctail(coord) ==
35921+ dclust_get_extension_dsize(hint)));
35922+}
35923+#endif
35924+
35925+#define fifty_percent(size) (size >> 1)
35926+/* evaluation of data compressibility */
35927+#define data_is_compressible(osize, isize) \
35928+	(osize < fifty_percent(isize))
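+/*
+ * Worked example (illustrative, not in the original code): with a 64K
+ * logical cluster, a compressed size of 20K passes the 50% test
+ * (20K < 32K), while a compressed size of 40K fails it.
+ */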
35929+
35930+/**
35931+ * A simple O(1)-heuristic for compressibility.
35932+ * This is called at most once in a file's lifetime.
35933+ * Read first logical cluster (of index #0) and estimate its compressibility.
35934+ * Save estimation result in @compressible.
35935+ */
35936+static int read_check_compressibility(struct inode * inode,
35937+ struct cluster_handle * clust,
35938+ int * compressible)
35939+{
35940+ int i;
35941+ int result;
35942+ __u32 dst_len;
35943+ hint_t tmp_hint;
35944+ hint_t * cur_hint = clust->hint;
35945+
35946+ start_check_compressibility(inode, clust, &tmp_hint);
35947+
35948+ reset_cluster_pgset(clust, cluster_nrpages(inode));
35949+ result = grab_page_cluster(inode, clust, READ_OP);
35950+ if (result)
35951+ return result;
35952+ /* Read page cluster here */
35953+ for (i = 0; i < clust->nr_pages; i++) {
35954+ struct page *page = clust->pages[i];
35955+ lock_page(page);
35956+ result = do_readpage_ctail(inode, clust, page,
35957+ ZNODE_READ_LOCK);
35958+ unlock_page(page);
35959+ if (result)
35960+ goto error;
35961+ }
35962+ tfm_cluster_clr_uptodate(&clust->tc);
35963+
35964+ cluster_set_tfm_act(&clust->tc, TFMA_WRITE);
35965+
35966+ if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) {
35967+		/* length of the compressed data is known, no need to compress */
35968+ assert("edward-1511",
35969+ znode_is_any_locked(tmp_hint.lh.node));
35970+ assert("edward-1512",
35971+ WITH_DATA(tmp_hint.ext_coord.coord.node,
35972+ prepped_dclust_ok(&tmp_hint)));
35973+ dst_len = dclust_get_extension_dsize(&tmp_hint);
35974+ }
35975+ else {
35976+ struct tfm_cluster * tc = &clust->tc;
35977+ compression_plugin * cplug = inode_compression_plugin(inode);
35978+ result = grab_tfm_stream(inode, tc, INPUT_STREAM);
35979+ if (result)
35980+ goto error;
35981+ for (i = 0; i < clust->nr_pages; i++) {
35982+ char *data;
35983+ lock_page(clust->pages[i]);
35984+ BUG_ON(!PageUptodate(clust->pages[i]));
35985+ data = kmap(clust->pages[i]);
35986+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
35987+ data, PAGE_CACHE_SIZE);
35988+ kunmap(clust->pages[i]);
35989+ * @count: number of bytes to write
35990+ * @pos: position in file to write to
35991+ * @conv: conversion flag, not used by the unix-file plugin
35992+ *
35993+ * Implementation of the VFS write method of struct file_operations for the unix file plugin.
35994+ result = grab_coa(tc, cplug);
35995+ if (result)
35996+ goto error;
35997+ tc->len = tc->lsize = lbytes(clust->index, inode);
35998+ assert("edward-1513", tc->len == inode_cluster_size(inode));
35999+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
36000+ cplug->compress(get_coa(tc, cplug->h.id, tc->act),
36001+ tfm_input_data(clust), tc->len,
36002+ tfm_output_data(clust), &dst_len);
36003+ assert("edward-1514",
36004+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
36005+ }
36006+ finish_check_compressibility(inode, clust, cur_hint);
36007+ *compressible = data_is_compressible(dst_len,
36008+ inode_cluster_size(inode));
36009+ return 0;
36010+ error:
36011+ put_page_cluster(clust, inode, READ_OP);
36012+ return result;
36013+}
36014+
36015+/* Cut disk cluster of index @idx */
36016+static int cut_disk_cluster(struct inode * inode, cloff_t idx)
36017+{
36018+ reiser4_key from, to;
36019+ assert("edward-1515", inode_file_plugin(inode) ==
36020+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
36021+ key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from);
36022+ to = from;
36023+ set_key_offset(&to,
36024+ get_key_offset(&from) + inode_cluster_size(inode) - 1);
36025+ return reiser4_cut_tree(reiser4_tree_by_inode(inode),
36026+ &from, &to, inode, 0);
36027+}
36028+
36029+static int reserve_cryptcompress2unixfile(struct inode *inode)
36030+{
36031+ reiser4_block_nr unformatted_nodes;
36032+ reiser4_tree *tree;
36033+
36034+ tree = reiser4_tree_by_inode(inode);
36035+
36036+ /* number of unformatted nodes which will be created */
36037+ unformatted_nodes = cluster_nrpages(inode); /* N */
36038+
36039+ /*
36040+	 * space required for one iteration of the cryptcompress->unixfile
36041+	 * (ctail->extent) conversion:
36041+ *
36042+ * 1. kill ctail items
36043+ *
36044+ * 2. insert N unformatted nodes
36045+ *
36046+ * 3. insert N (worst-case single-block
36047+ * extents) extent units.
36048+ *
36049+ * 4. drilling to the leaf level by coord_by_key()
36050+ *
36051+ * 5. possible update of stat-data
36052+ *
36053+ */
36054+ grab_space_enable();
36055+ return reiser4_grab_space
36056+ (2 * tree->height +
36057+ unformatted_nodes +
36058+ unformatted_nodes * estimate_one_insert_into_item(tree) +
36059+ 1 + estimate_one_insert_item(tree) +
36060+ inode_file_plugin(inode)->estimate.update(inode),
36061+ BA_CAN_COMMIT);
36062+}
36063+
36064+/* clear the flag that indicates a conversion is in progress and update
36065+   the stat-data with new (unix-file specific) info */
36066+static int complete_file_conversion(struct inode *inode)
36067+{
36068+ int result;
36069+
36070+ grab_space_enable();
36071+ result =
36072+ reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
36073+ BA_CAN_COMMIT);
36074+ if (result == 0) {
36075+ reiser4_inode_clr_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
36076+ result = reiser4_update_sd(inode);
36077+ }
36078+ if (result)
36079+ warning("edward-1452",
36080+ "Converting %llu to unix-file: update sd failed (%i)",
36081+ (unsigned long long)get_inode_oid(inode), result);
36082+ return 0;
36083+}
36084+
36085+
36086+/* do conversion */
36087+static int cryptcompress2unixfile(struct file * file, struct inode * inode,
36088+ struct cluster_handle * clust)
36089+{
36090+ int i;
36091+ int result = 0;
36092+ struct cryptcompress_info *cr_info;
36093+ struct unix_file_info *uf_info;
36094+
36095+ assert("edward-1516", clust->pages[0]->index == 0);
36096+ assert("edward-1517", clust->hint != NULL);
36097+
36098+ /* release all cryptcompress-specific resources */
36099+ cr_info = cryptcompress_inode_data(inode);
36100+ result = reserve_cryptcompress2unixfile(inode);
36101+ if (result)
36102+ goto out;
36103+ reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
36104+ reiser4_unset_hint(clust->hint);
36105+ result = cut_disk_cluster(inode, 0);
36106+ if (result)
36107+ goto out;
36108+	/* the captured jnode of the cluster and associated resources (pages,
36109+ reserved disk space) were released by ->kill_hook() method
36110+ of the item plugin */
36111+
36112+ result = __cryptcompress2unixfile(file, inode);
36113+ if (result)
36114+ goto out;
36115+ /* At this point file is managed by unix file plugin */
36116+
36117+ uf_info = unix_file_inode_data(inode);
36118+
36119+ assert("edward-1518",
36120+ ergo(jprivate(clust->pages[0]),
36121+ !jnode_is_cluster_page(jprivate(clust->pages[0]))));
36122+	for (i = 0; i < clust->nr_pages; i++) {
36123+ assert("edward-1519", clust->pages[i]);
36124+ assert("edward-1520", PageUptodate(clust->pages[i]));
36125+
36126+ result = find_or_create_extent(clust->pages[i]);
36127+ if (result)
36128+ break;
36129+ }
36130+ if (!result) {
36131+ uf_info->container = UF_CONTAINER_EXTENTS;
36132+ complete_file_conversion(inode);
36133+ }
36134+ out:
36135+ all_grabbed2free();
36136+ if (result)
36137+ warning("edward-1453", "Failed to convert file %llu: ret=%i",
36138+ (unsigned long long)get_inode_oid(inode), result);
36139+ return result;
36140+}
36141+
36142+/* Check, then perform or disable conversion if needed */
36143+int write_conversion_hook(struct file * file, struct inode * inode, loff_t pos,
36144+ struct cluster_handle * clust, int * progress)
36145+{
36146+ int result;
36147+ int check_compress = 0;
36148+ int compressible = 0;
36149+
36150+ if (!conversion_enabled(inode))
36151+ return 0;
36152+ result = check_position(inode, pos, clust, &check_compress);
36153+ if (result || !check_compress)
36154+ return result;
36155+ result = read_check_compressibility(inode, clust, &compressible);
36156+ if (result)
36157+ return result;
36158+
36159+ /* At this point page cluster is grabbed and uptodate */
36160+ if (!compressible) {
36161+ result = cryptcompress2unixfile(file, inode, clust);
36162+ if (result == 0)
36163+ *progress = 1;
36164+ }
36165+ else
36166+ result = disable_conversion(inode);
36167+
36168+ reiser4_txn_restart_current();
36169+ put_page_cluster(clust, inode, READ_OP);
36170+ return result;
36171+}
36172+
36173+static int setattr_conversion_hook(struct inode * inode, struct iattr *attr)
36174+{
36175+ return (attr->ia_valid & ATTR_SIZE ? disable_conversion(inode) : 0);
36176+}
36177+
36178+/**
36179+ * Here are wrappers with "protection", aka Reiser4 "careful" methods.
36180+ * They are used by the VFS (as methods of file_ops, inode_ops or as_ops),
36181+ * which is unaware of the plugin conversion performed by Reiser4.
36182+ */
36183+
36184+/*
36185+ * Wrappers with active protection for:
36186+ *
36187+ * ->write();
36188+ * ->setattr();
36189+ */
36190+
36191+/*
36192+ * Reiser4 write "careful" method. Write a file in 2 steps:
36193+ * . start write with initial file plugin,
36194+ *   switch to a new (more reasonable) file plugin (if any);
36195+ * . finish write with the new plugin.
36196+ */
36197+ssize_t reiser4_write_careful(struct file *file, const char __user *buf,
36198+ size_t count, loff_t *off)
36199+{
36200+ int prot = 0;
36201+ int conv = 0;
36202+ ssize_t written_old = 0; /* bytes written with old plugin */
36203+ ssize_t written_new = 0; /* bytes written with new plugin */
36204+ struct inode * inode = file->f_dentry->d_inode;
36205+ struct rw_semaphore * guard = &reiser4_inode_data(inode)->conv_sem;
36206+
36207+ /**
36208+ * First step.
36209+ * Sanity check: if conversion is possible,
36210+ * then protect pset.
36211+ */
36212+ if (should_protect(inode)) {
36213+ prot = 1;
36214+ down_write(guard);
36215+ }
36216+ written_old = inode_file_plugin(inode)->write(file,
36217+ buf,
36218+ count,
36219+ off, &conv);
36220+ if (prot)
36221+ up_write(guard);
36222+ if (written_old < 0 || conv == 0)
36223+ return written_old;
36224+ /**
36225+ * Conversion occurred.
36226+ * Back conversion is impossible,
36227+ * so don't protect at this step.
36228+ */
36229+ assert("edward-1532",
36230+ inode_file_plugin(inode) ==
36231+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
36232+
36233+ written_new = inode_file_plugin(inode)->write(file,
36234+ buf + written_old,
36235+ count - written_old,
36236+ off, NULL);
36237+ return written_old + (written_new < 0 ? 0 : written_new);
36238+}
36239+
36240+int reiser4_setattr_careful(struct dentry *dentry, struct iattr *attr)
36241+{
36242+ struct inode * inode = dentry->d_inode;
36243+ return PROT_ACTIVE(int, setattr, (dentry, attr),
36244+ setattr_conversion_hook(inode, attr));
36245+}
36246+
36247+/* Wrappers with passive protection for:
36248+ *
36249+ * ->open();
36250+ * ->read();
36251+ * ->ioctl();
36252+ * ->mmap();
36253+ * ->release();
36254+ * ->bmap().
36255+ */
36256+
36257+int reiser4_open_careful(struct inode *inode, struct file *file)
36258+{
36259+ return PROT_PASSIVE(int, open, (inode, file));
36260+}
36261+
36262+ssize_t reiser4_read_careful(struct file * file, char __user * buf,
36263+ size_t size, loff_t * off)
36264+{
36265+ struct inode * inode = file->f_dentry->d_inode;
36266+ return PROT_PASSIVE(ssize_t, read, (file, buf, size, off));
36267+}
36268+
36269+int reiser4_ioctl_careful(struct inode *inode, struct file *filp,
36270+ unsigned int cmd, unsigned long arg)
36271+{
36272+ return PROT_PASSIVE(int, ioctl, (inode, filp, cmd, arg));
36273+}
36274+
36275+int reiser4_mmap_careful(struct file *file, struct vm_area_struct *vma)
36276+{
36277+ struct inode *inode = file->f_dentry->d_inode;
36278+ return PROT_PASSIVE(int, mmap, (file, vma));
36279+}
36280+
36281+int reiser4_release_careful(struct inode *inode, struct file *file)
36282+{
36283+ return PROT_PASSIVE(int, release, (inode, file));
36284+}
36285+
36286+sector_t reiser4_bmap_careful(struct address_space * mapping, sector_t lblock)
36287+{
36288+ struct inode *inode = mapping->host;
36289+ return PROT_PASSIVE(sector_t, bmap, (mapping, lblock));
36290+}
36291+
36292+/*
36293+ Local variables:
36294+ c-indentation-style: "K&R"
36295+ mode-name: "LC"
36296+ c-basic-offset: 8
36297+ tab-width: 8
36298+ fill-column: 80
36299+ scroll-step: 1
36300+ End:
36301+*/
36302diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/file.h linux-2.6.23/fs/reiser4/plugin/file/file.h
36303--- linux-2.6.23.orig/fs/reiser4/plugin/file/file.h 1970-01-01 03:00:00.000000000 +0300
36304+++ linux-2.6.23/fs/reiser4/plugin/file/file.h 2007-12-04 16:49:30.000000000 +0300
36305@@ -0,0 +1,316 @@
36306+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
36307+ * reiser4/README */
36308+
36309+/* this file contains declarations of methods implementing
36310+ file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID
36311+ and SYMLINK_FILE_PLUGIN_ID) */
36312+
36313+#if !defined( __REISER4_FILE_H__ )
36314+#define __REISER4_FILE_H__
36315+
36316+/**
36317+ * Declarations of common/careful/generic methods.
36318+ * Suppose ->foo() is a VFS method (of f_ops, i_ops, or a_ops);
36319+ * then the common reiser4 method for foo is named reiser4_foo_common,
36320+ * the careful method reiser4_foo_careful,
36321+ * and the generic method reiser4_foo.
36322+ *
36323+ * A common method is a simple instruction sequence usable by more
36324+ * than one plugin id.
36325+ *
36326+ * A generic method looks at the plugin installed in the inode's
36327+ * plugin set and calls the appropriate method of that plugin.
36328+ *
36329+ * A careful method is a generic method with the pset protected
36330+ * (see plugin/file/file_conversion.c for details).
36331+ */
36332+
36333+/* inode operations */
36334+int reiser4_setattr_careful(struct dentry *, struct iattr *);
36335+
36336+/* file operations */
36337+ssize_t reiser4_read_careful(struct file *, char __user *buf,
36338+ size_t count, loff_t *off);
36339+ssize_t reiser4_write_careful(struct file *, const char __user *buf,
36340+ size_t count, loff_t * off);
36341+int reiser4_ioctl_careful(struct inode *inode, struct file *filp,
36342+ unsigned int cmd, unsigned long arg);
36343+int reiser4_mmap_careful(struct file *, struct vm_area_struct *);
36344+int reiser4_open_careful(struct inode *inode, struct file *file);
36345+int reiser4_release_careful(struct inode *, struct file *);
36346+int reiser4_sync_file_common(struct file *, struct dentry *, int datasync);
36347+
36348+/* address space operations */
36349+int reiser4_readpage(struct file *, struct page *);
36350+int reiser4_readpages(struct file*, struct address_space*, struct list_head*,
36351+ unsigned);
36352+int reiser4_writepages(struct address_space *, struct writeback_control *);
36353+int reiser4_prepare_write(struct file *, struct page *, unsigned from,
36354+ unsigned to);
36355+int reiser4_commit_write(struct file *, struct page *, unsigned from,
36356+ unsigned to);
36357+sector_t reiser4_bmap_careful(struct address_space *, sector_t lblock);
36358+
36359+/*
36360+ * Private methods of unix-file plugin
36361+ * (UNIX_FILE_PLUGIN_ID)
36362+ */
36363+
36364+/* private inode operations */
36365+int setattr_unix_file(struct dentry *, struct iattr *);
36366+
36367+/* private file operations */
36368+
36369+ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
36370+ loff_t *off);
36371+ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
36372+ loff_t * off, int * conv);
36373+int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
36374+ unsigned long arg);
36375+int mmap_unix_file(struct file *, struct vm_area_struct *);
36376+int open_unix_file(struct inode *, struct file *);
36377+int release_unix_file(struct inode *, struct file *);
36378+
36379+/* private address space operations */
36380+int readpage_unix_file(struct file *, struct page *);
36381+int readpages_unix_file(struct file*, struct address_space*, struct list_head*, unsigned);
36382+int writepages_unix_file(struct address_space *, struct writeback_control *);
36383+int prepare_write_unix_file(struct file *, struct page *, unsigned from,
36384+ unsigned to);
36385+int commit_write_unix_file(struct file *, struct page *, unsigned from,
36386+ unsigned to);
36387+sector_t bmap_unix_file(struct address_space *, sector_t lblock);
36388+
36389+/* other private methods */
36390+int delete_object_unix_file(struct inode *);
36391+int flow_by_inode_unix_file(struct inode *, const char __user *buf,
36392+ int user, loff_t, loff_t, rw_op, flow_t *);
36393+int owns_item_unix_file(const struct inode *, const coord_t *);
36394+void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
36395+ int create);
36396+
36397+/*
36398+ * Private methods of cryptcompress file plugin
36399+ * (CRYPTCOMPRESS_FILE_PLUGIN_ID)
36400+ */
36401+
36402+/* private inode operations */
36403+int setattr_cryptcompress(struct dentry *, struct iattr *);
36404+
36405+/* private file operations */
36406+ssize_t read_cryptcompress(struct file *, char __user *buf,
36407+ size_t count, loff_t *off);
36408+ssize_t write_cryptcompress(struct file *, const char __user *buf,
36409+ size_t count, loff_t * off, int *conv);
36410+int ioctl_cryptcompress(struct inode *, struct file *, unsigned int cmd,
36411+ unsigned long arg);
36412+int mmap_cryptcompress(struct file *, struct vm_area_struct *);
36413+int open_cryptcompress(struct inode *, struct file *);
36414+int release_cryptcompress(struct inode *, struct file *);
36415+
36416+/* private address space operations */
36417+int readpage_cryptcompress(struct file *, struct page *);
36418+int readpages_cryptcompress(struct file*, struct address_space*,
36419+ struct list_head*, unsigned);
36420+int writepages_cryptcompress(struct address_space *,
36421+ struct writeback_control *);
36422+int prepare_write_cryptcompress(struct file *, struct page *, unsigned from,
36423+ unsigned to);
36424+int commit_write_cryptcompress(struct file *, struct page *, unsigned from,
36425+ unsigned to);
36426+sector_t bmap_cryptcompress(struct address_space *, sector_t lblock);
36427+
36428+/* other private methods */
36429+int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
36430+ int user, loff_t, loff_t, rw_op, flow_t *);
36431+int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
36432+int create_object_cryptcompress(struct inode *, struct inode *,
36433+ reiser4_object_create_data *);
36434+int delete_object_cryptcompress(struct inode *);
36435+void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
36436+ int create);
36437+int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
36438+ const reiser4_key * to_key,
36439+ reiser4_key * smallest_removed,
36440+ struct inode *object, int truncate,
36441+ int *progress);
36442+void destroy_inode_cryptcompress(struct inode *);
36443+
36444+/*
36445+ * Private methods of symlink file plugin
36446+ * (SYMLINK_FILE_PLUGIN_ID)
36447+ */
36448+int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
36449+ reiser4_object_create_data *);
36450+void destroy_inode_symlink(struct inode *);
36451+
36452+/*
36453+ * all writes into a unix file are performed by an item write method. The
36454+ * write method of the unix file plugin only decides which item plugin
36455+ * (extent or tail) to call, and in which mode (one from the enum below)
36456+ */
36457+typedef enum {
36458+ FIRST_ITEM = 1,
36459+ APPEND_ITEM = 2,
36460+ OVERWRITE_ITEM = 3
36461+} write_mode_t;
36462+
36463+/* a unix file may be in one of the following states */
36464+typedef enum {
36465+ UF_CONTAINER_UNKNOWN = 0,
36466+ UF_CONTAINER_TAILS = 1,
36467+ UF_CONTAINER_EXTENTS = 2,
36468+ UF_CONTAINER_EMPTY = 3
36469+} file_container_t;
36470+
36471+struct formatting_plugin;
36472+struct inode;
36473+
36474+/* unix file plugin specific part of reiser4 inode */
36475+struct unix_file_info {
36476+ /*
36477+ * this read-write lock protects file containerization change. Accesses
36478+ * which do not change file containerization (see file_container_t)
36479+ * (read, readpage, writepage, write (until tail conversion is
36480+ * involved)) take read-lock. Accesses which modify file
36481+ * containerization (truncate, conversion from tail to extent and back)
36482+ * take write-lock.
36483+ */
36484+ struct rw_semaphore latch;
36485+ /* this enum specifies which items are used to build the file */
36486+ file_container_t container;
36487+ /*
36488+ * plugin which controls when file is to be converted to extents and
36489+ * back to tail
36490+ */
36491+ struct formatting_plugin *tplug;
36492+ /* if this is set, file is in exclusive use */
36493+ int exclusive_use;
36494+#if REISER4_DEBUG
36495+ /* pointer to task struct of thread owning exclusive access to file */
36496+ void *ea_owner;
36497+ atomic_t nr_neas;
36498+ void *last_reader;
36499+#endif
36500+};
36501+
36502+struct unix_file_info *unix_file_inode_data(const struct inode *inode);
36503+void get_exclusive_access(struct unix_file_info *);
36504+void drop_exclusive_access(struct unix_file_info *);
36505+void get_nonexclusive_access(struct unix_file_info *);
36506+void drop_nonexclusive_access(struct unix_file_info *);
36507+int try_to_get_nonexclusive_access(struct unix_file_info *);
36508+int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
36509+ struct inode *);
36510+int find_file_item_nohint(coord_t *, lock_handle *,
36511+ const reiser4_key *, znode_lock_mode,
36512+ struct inode *);
36513+
36514+int load_file_hint(struct file *, hint_t *);
36515+void save_file_hint(struct file *, const hint_t *);
36516+
36517+#include "../item/extent.h"
36518+#include "../item/tail.h"
36519+#include "../item/ctail.h"
36520+
36521+struct uf_coord {
36522+ coord_t coord;
36523+ lock_handle *lh;
36524+ int valid;
36525+ union {
36526+ struct extent_coord_extension extent;
36527+ struct tail_coord_extension tail;
36528+ struct ctail_coord_extension ctail;
36529+ } extension;
36530+};
36531+
36532+#include "../../forward.h"
36533+#include "../../seal.h"
36534+#include "../../lock.h"
36535+
36536+/*
36537+ * This structure is used to speed up file operations (reads and writes). A
36538+ * hint is a suggestion about where a key resolved to last time. A seal
36539+ * indicates whether a node has been modified since a hint was last recorded.
36540+ * You check the seal, and if the seal is still valid, you can use the hint
36541+ * without traversing the tree again.
36542+ */
36543+struct hint {
36544+ seal_t seal; /* a seal over last file item accessed */
36545+ uf_coord_t ext_coord;
36546+ loff_t offset;
36547+ znode_lock_mode mode;
36548+ lock_handle lh;
36549+};
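+
+/*
+ * A minimal sketch of the seal idea described above (illustrative, not
+ * part of the original code): a seal remembers enough state to detect
+ * whether the hinted node changed; reiser4's real seal_t is richer.
+ */
+#if 0	/* illustrative only */
+struct toy_node { unsigned long version; };
+struct toy_seal { unsigned long version; };
+
+static int toy_seal_is_valid(const struct toy_seal *seal,
+			     const struct toy_node *node)
+{
+	/* the hint may be reused only if the node was not modified */
+	return seal->version == node->version;
+}
+#endif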
36550+
36551+static inline int hint_is_valid(hint_t * hint)
36552+{
36553+ return hint->ext_coord.valid;
36554+}
36555+
36556+static inline void hint_set_valid(hint_t * hint)
36557+{
36558+ hint->ext_coord.valid = 1;
36559+}
36560+
36561+static inline void hint_clr_valid(hint_t * hint)
36562+{
36563+ hint->ext_coord.valid = 0;
36564+}
36565+
36566+int load_file_hint(struct file *, hint_t *);
36567+void save_file_hint(struct file *, const hint_t *);
36568+void hint_init_zero(hint_t *);
36569+void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
36570+int hint_is_set(const hint_t *);
36571+void reiser4_unset_hint(hint_t *);
36572+
36573+int reiser4_update_file_size(struct inode *, reiser4_key *, int update_sd);
36574+int cut_file_items(struct inode *, loff_t new_size, int update_sd,
36575+ loff_t cur_size, int (*update_actor) (struct inode *,
36576+ reiser4_key *, int));
36577+#if REISER4_DEBUG
36578+
36579+/* return 1 if exclusive access is obtained, 0 otherwise */
36580+static inline int ea_obtained(struct unix_file_info * uf_info)
36581+{
36582+ int ret;
36583+
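+	/*
+	 * down_read_trylock() succeeds only while no writer holds the
+	 * latch; the shared lock is dropped again right away, so this
+	 * is merely a debug-time probe for exclusive access.
+	 */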
36584+ ret = down_read_trylock(&uf_info->latch);
36585+ if (ret)
36586+ up_read(&uf_info->latch);
36587+ return !ret;
36588+}
36589+
36590+#endif
36591+
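+/*
+ * Number of pages processed per iteration of the write_unix_file() loop
+ * (to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY).
+ */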
36592+#define WRITE_GRANULARITY 32
36593+
36594+int tail2extent(struct unix_file_info *);
36595+int extent2tail(struct file *, struct unix_file_info *);
36596+
36597+int goto_right_neighbor(coord_t *, lock_handle *);
36598+int find_or_create_extent(struct page *);
36599+int equal_to_ldk(znode *, const reiser4_key *);
36600+
36601+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
36602+
36603+static inline int cbk_errored(int cbk_result)
36604+{
36605+ return (cbk_result != CBK_COORD_NOTFOUND
36606+ && cbk_result != CBK_COORD_FOUND);
36607+}
36608+
36609+/* __REISER4_FILE_H__ */
36610+#endif
36611+
36612+/*
36613+ * Local variables:
36614+ * c-indentation-style: "K&R"
36615+ * mode-name: "LC"
36616+ * c-basic-offset: 8
36617+ * tab-width: 8
36618+ * fill-column: 79
36619+ * scroll-step: 1
36620+ * End:
36621+*/
36622diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/Makefile linux-2.6.23/fs/reiser4/plugin/file/Makefile
36623--- linux-2.6.23.orig/fs/reiser4/plugin/file/Makefile 1970-01-01 03:00:00.000000000 +0300
36624+++ linux-2.6.23/fs/reiser4/plugin/file/Makefile 2007-12-04 16:49:30.000000000 +0300
36625@@ -0,0 +1,7 @@
36626+obj-$(CONFIG_REISER4_FS) += file_plugins.o
36627+
36628+file_plugins-objs := \
36629+ file.o \
36630+ tail_conversion.o \
36631+ symlink.o \
36632+ cryptcompress.o
36633diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/symfile.c linux-2.6.23/fs/reiser4/plugin/file/symfile.c
36634--- linux-2.6.23.orig/fs/reiser4/plugin/file/symfile.c 1970-01-01 03:00:00.000000000 +0300
36635+++ linux-2.6.23/fs/reiser4/plugin/file/symfile.c 2007-12-04 16:49:30.000000000 +0300
36636@@ -0,0 +1,87 @@
36637+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
36638+
36639+/* Symfiles are a generalization of Unix symlinks.
36640+
36641+ A symfile when read behaves as though you took its contents and
36642+ substituted them into the reiser4 naming system as the right hand side
36643+ of an assignment, and then read that which you had assigned to it.
36644+
36645+ A key issue for symfiles is how to implement writes through to
36646+ subfiles. In general, one must have some method of determining what
36647+ of that which is written to the symfile is written to what subfile.
36648+ This can be done by use of custom plugin methods written by users, or
36649+ by using a few general methods we provide for those willing to endure
36650+ the insertion of delimiters into what is read.
36651+
36652+ Writing to symfiles without delimiters to denote what is written to
36653+ what subfile is not supported by any plugins we provide in this
36654+ release. Our most sophisticated support for writes is that embodied
36655+ by the invert plugin (see invert.c).
36656+
36657+ A read only version of the /etc/passwd file might be
36658+ constructed as a symfile whose contents are as follows:
36659+
36660+ /etc/passwd/userlines/*
36661+
36662+ or
36663+
36664+ /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
36665+
36666+ or
36667+
36668+ /etc/passwd/userlines/(demidov+edward+reiser+root)
36669+
36670+ A symfile with contents
36671+
36672+ /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
36673+
36674+ will return when read
36675+
36676+ The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
36677+
36678+ and write of what has been read will not be possible to implement as
36679+ an identity operation because there are no delimiters denoting the
36680+ boundaries of what is to be written to what subfile.
36681+
36682+ Note that one could make this a read/write symfile if one specified
36683+ delimiters, and the write method understood those delimiters delimited
36684+ what was written to subfiles.
36685+
36686+ So, specifying the symfile in a manner that allows writes:
36687+
36688+ /etc/passwd/userlines/demidov+"(
36689+ )+/etc/passwd/userlines/edward+"(
36690+ )+/etc/passwd/userlines/reiser+"(
36691+ )+/etc/passwd/userlines/root+"(
36692+ )
36693+
36694+ or
36695+
36696+ /etc/passwd/userlines/(demidov+"(
36697+ )+edward+"(
36698+ )+reiser+"(
36699+ )+root+"(
36700+ ))
36701+
36702+ and the file demidov might be specified as:
36703+
36704+ /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
36705+
36706+ or
36707+
36708+ /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
36709+
36710+ Notice that if the file demidov has a carriage return in it, the
36711+ parsing fails, but then if you put carriage returns in the wrong place
36712+ in a normal /etc/passwd file it breaks things also.
36713+
36714+ Note that it is forbidden to have no text between two interpolations
36715+ if one wants to be able to define what parts of a write go to what
36716+ subfiles referenced in an interpolation.
36717+
36718+ If one wants to be able to add new lines by writing to the file, one
36719+ must either write a custom plugin for /etc/passwd that knows how to
36720+ name an added line, or one must use an invert, or one must use a more
36721+ sophisticated symfile syntax that we are not planning to write for
36722+ version 4.0.
36723+*/
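The delimiter requirement above is easiest to see in code. A minimal sketch (hypothetical helper, not part of reiser4): splitting a buffer written to the passwd symfile on its literal ":" delimiters fixes which bytes go to which subfile; with no literal text between interpolations the split points would be unrecoverable.

#include <string.h>	/* strchr; a kernel version would use <linux/string.h> */

/* Split @buf in place on ':' delimiters; fields[i] ends up pointing at
 * the bytes destined for the i-th subfile. Returns the field count. */
static int split_on_delims(char *buf, char **fields, int max_fields)
{
	int n = 0;

	while (n < max_fields) {
		fields[n++] = buf;
		buf = strchr(buf, ':');
		if (buf == NULL)
			break;
		*buf++ = '\0';		/* consume the delimiter */
	}
	return n;
}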
36724diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/symlink.c linux-2.6.23/fs/reiser4/plugin/file/symlink.c
36725--- linux-2.6.23.orig/fs/reiser4/plugin/file/symlink.c 1970-01-01 03:00:00.000000000 +0300
36726+++ linux-2.6.23/fs/reiser4/plugin/file/symlink.c 2007-12-04 16:49:30.000000000 +0300
36727@@ -0,0 +1,95 @@
36728+/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
36729+
36730+#include "../../inode.h"
36731+
36732+#include <linux/types.h>
36733+#include <linux/fs.h>
36734+
36735+/* file plugin methods specific for symlink files
36736+ (SYMLINK_FILE_PLUGIN_ID) */
36737+
36738+/* this is implementation of create_object method of file plugin for
36739+ SYMLINK_FILE_PLUGIN_ID
36740+ */
36741+
36742+/**
36743+ * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
36744+ * @symlink: inode of symlink object
36745+ * @dir: inode of parent directory
36746+ * @data: parameters of new object
36747+ *
36748+ * Inserts stat data with symlink extension into the tree.
36749+ */
36750+int reiser4_create_symlink(struct inode *symlink,
36751+ struct inode *dir UNUSED_ARG,
36752+ reiser4_object_create_data *data /* info passed to us
36753+ * this is filled by
36754+ * reiser4() syscall
36755+ * in particular */)
36756+{
36757+ int result;
36758+
36759+ assert("nikita-680", symlink != NULL);
36760+ assert("nikita-681", S_ISLNK(symlink->i_mode));
36761+ assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
36762+ assert("nikita-682", dir != NULL);
36763+ assert("nikita-684", data != NULL);
36764+ assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
36765+
36766+ /*
36767+ * stat data of symlink has symlink extension in which we store
36768+ * symlink content, that is, path symlink is pointing to.
36769+ */
36770+ reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
36771+
36772+ assert("vs-838", symlink->i_private == NULL);
36773+ symlink->i_private = (void *)data->name;
36774+
36775+ assert("vs-843", symlink->i_size == 0);
36776+ INODE_SET_FIELD(symlink, i_size, strlen(data->name));
36777+
36778+ /* insert stat data appended with data->name */
36779+ result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
36780+ if (result) {
36781+ /* FIXME-VS: Make sure that symlink->i_private is not attached
36782+ to kmalloced data */
36783+ INODE_SET_FIELD(symlink, i_size, 0);
36784+ } else {
36785+ assert("vs-849", symlink->i_private
36786+ && reiser4_inode_get_flag(symlink,
36787+ REISER4_GENERIC_PTR_USED));
36788+ assert("vs-850",
36789+ !memcmp((char *)symlink->i_private, data->name,
36790+ (size_t) symlink->i_size + 1));
36791+ }
36792+ return result;
36793+}
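Once created this way, the symlink target is stored in the stat-data extension on disk and cached in ->i_private while the inode is loaded. A hedged sketch of a readlink-style accessor built on that cache (hypothetical helper, not part of this patch):

static const char *symlink_target(const struct inode *inode)
{
	/* valid only while the inode is loaded and REISER4_GENERIC_PTR_USED
	 * is set, per the assertions in reiser4_create_symlink() above */
	return (const char *)inode->i_private;
}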
36794+
36795+/* this is implementation of destroy_inode method of file plugin for
36796+ SYMLINK_FILE_PLUGIN_ID
36797+ */
36798+void destroy_inode_symlink(struct inode *inode)
36799+{
36800+ assert("edward-799",
36801+ inode_file_plugin(inode) ==
36802+ file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
36803+ assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
36804+ assert("edward-801", reiser4_inode_get_flag(inode,
36805+ REISER4_GENERIC_PTR_USED));
36806+ assert("vs-839", S_ISLNK(inode->i_mode));
36807+
36808+ kfree(inode->i_private);
36809+ inode->i_private = NULL;
36810+ reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
36811+}
36812+
36813+/*
36814+ Local variables:
36815+ c-indentation-style: "K&R"
36816+ mode-name: "LC"
36817+ c-basic-offset: 8
36818+ tab-width: 8
36819+ fill-column: 80
36820+ scroll-step: 1
36821+ End:
36822+*/
36823diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.23/fs/reiser4/plugin/file/tail_conversion.c
36824--- linux-2.6.23.orig/fs/reiser4/plugin/file/tail_conversion.c 1970-01-01 03:00:00.000000000 +0300
36825+++ linux-2.6.23/fs/reiser4/plugin/file/tail_conversion.c 2007-12-04 16:49:30.000000000 +0300
36826@@ -0,0 +1,726 @@
36827+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
36828+
36829+#include "../../inode.h"
36830+#include "../../super.h"
36831+#include "../../page_cache.h"
36832+#include "../../carry.h"
36833+#include "../../safe_link.h"
36834+#include "../../vfs_ops.h"
36835+
36836+#include <linux/writeback.h>
36837+
36838+/* this file contains:
36839+ tail2extent and extent2tail */
36840+
36841+/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
36842+void get_exclusive_access(struct unix_file_info * uf_info)
36843+{
36844+ assert("nikita-3028", reiser4_schedulable());
36845+ assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
36846+ assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
36847+ /*
36848+ * "deadlock avoidance": sometimes we commit a transaction under
36849+ * rw-semaphore on a file. Such commit can deadlock with another
36850+ * thread that captured some block (hence preventing atom from being
36851+ * committed) and waits on rw-semaphore.
36852+ */
36853+ reiser4_txn_restart_current();
36854+ LOCK_CNT_INC(inode_sem_w);
36855+ down_write(&uf_info->latch);
36856+ uf_info->exclusive_use = 1;
36857+ assert("vs-1713", uf_info->ea_owner == NULL);
36858+ assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
36859+ ON_DEBUG(uf_info->ea_owner = current);
36860+}
36861+
36862+void drop_exclusive_access(struct unix_file_info * uf_info)
36863+{
36864+ assert("vs-1714", uf_info->ea_owner == current);
36865+ assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
36866+ ON_DEBUG(uf_info->ea_owner = NULL);
36867+ uf_info->exclusive_use = 0;
36868+ up_write(&uf_info->latch);
36869+ assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
36870+ assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
36871+ LOCK_CNT_DEC(inode_sem_w);
36872+ reiser4_txn_restart_current();
36873+}
36874+
36875+/**
36876+ * nea_grabbed - do something when file semaphore is down_read-ed
36877+ * @uf_info:
36878+ *
36879+ * This is called when nonexclusive access is obtained on a file. Everything
36880+ * it does is for debugging purposes only.
36881+ */
36882+static void nea_grabbed(struct unix_file_info *uf_info)
36883+{
36884+#if REISER4_DEBUG
36885+ LOCK_CNT_INC(inode_sem_r);
36886+ assert("vs-1716", uf_info->ea_owner == NULL);
36887+ atomic_inc(&uf_info->nr_neas);
36888+ uf_info->last_reader = current;
36889+#endif
36890+}
36891+
36892+/**
36893+ * get_nonexclusive_access - get nonexclusive access to a file
36894+ * @uf_info: unix file specific part of inode to obtain access to
36895+ *
36896+ * Nonexclusive access is obtained on a file before read, write, readpage.
36897+ */
36898+void get_nonexclusive_access(struct unix_file_info *uf_info)
36899+{
36900+ assert("nikita-3029", reiser4_schedulable());
36901+ assert("nikita-3361", get_current_context()->trans->atom == NULL);
36902+
36903+ down_read(&uf_info->latch);
36904+ nea_grabbed(uf_info);
36905+}
36906+
36907+/**
36908+ * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
36909+ * @uf_info: unix file specific part of inode to obtain access to
36910+ *
36911+ * Non-blocking version of nonexclusive access obtaining.
36912+ */
36913+int try_to_get_nonexclusive_access(struct unix_file_info *uf_info)
36914+{
36915+ int result;
36916+
36917+ result = down_read_trylock(&uf_info->latch);
36918+ if (result)
36919+ nea_grabbed(uf_info);
36920+ return result;
36921+}
36922+
36923+void drop_nonexclusive_access(struct unix_file_info * uf_info)
36924+{
36925+ assert("vs-1718", uf_info->ea_owner == NULL);
36926+ assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
36927+ ON_DEBUG(atomic_dec(&uf_info->nr_neas));
36928+
36929+ up_read(&uf_info->latch);
36930+
36931+ LOCK_CNT_DEC(inode_sem_r);
36932+ reiser4_txn_restart_current();
36933+}
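Taken together, the four helpers above implement a reader/writer discipline around file-state changes. A toy usage sketch (the callers are hypothetical; the real ones are the read, write and conversion paths elsewhere in the plugin):

static void toy_read_path(struct unix_file_info *uf_info)
{
	get_nonexclusive_access(uf_info);
	/* ... read the file body; the container type cannot change here ... */
	drop_nonexclusive_access(uf_info);
}

static void toy_convert_path(struct unix_file_info *uf_info)
{
	get_exclusive_access(uf_info);
	/* ... tail2extent()/extent2tail() run under exclusive access ... */
	drop_exclusive_access(uf_info);
}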
36934+
36935+/* part of tail2extent. Cut all items covering @count bytes starting from
36936+ @offset */
36937+/* Audited by: green(2002.06.15) */
36938+static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
36939+{
36940+ reiser4_key from, to;
36941+
36942+ /* AUDIT: How about putting an assertion here that would check that
36943+ the whole provided range is covered by tail items only? */
36944+ /* key of first byte in the range to be cut */
36945+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
36946+
36947+ /* key of last byte in that range */
36948+ to = from;
36949+ set_key_offset(&to, (__u64) (offset + count - 1));
36950+
36951+ /* cut everything between those keys */
36952+ return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
36953+ inode, 0);
36954+}
36955+
36956+static void release_all_pages(struct page **pages, unsigned nr_pages)
36957+{
36958+ unsigned i;
36959+
36960+ for (i = 0; i < nr_pages; i++) {
36961+ if (pages[i] == NULL) {
36962+ unsigned j;
36963+ for (j = i + 1; j < nr_pages; j++)
36964+ assert("vs-1620", pages[j] == NULL);
36965+ break;
36966+ }
36967+ page_cache_release(pages[i]);
36968+ pages[i] = NULL;
36969+ }
36970+}
36971+
36972+/* part of tail2extent. Replace tail items with an extent item. The content
36973+ of the tail items (@count bytes) being cut has already been copied into
36974+ pages; find_or_create_extent() is called to create extents corresponding
36975+ to those pages */
36976+static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
36977+{
36978+ int result;
36979+ unsigned i;
36980+ STORE_COUNTERS;
36981+
36982+ if (nr_pages == 0)
36983+ return 0;
36984+
36985+ assert("vs-596", pages[0]);
36986+
36987+ /* cut copied items */
36988+ result = cut_formatting_items(inode, page_offset(pages[0]), count);
36989+ if (result)
36990+ return result;
36991+
36992+ CHECK_COUNTERS;
36993+
36994+ /* put into tree replacement for just removed items: extent item, namely */
36995+ for (i = 0; i < nr_pages; i++) {
36996+ result = add_to_page_cache_lru(pages[i], inode->i_mapping,
36997+ pages[i]->index,
36998+ mapping_gfp_mask(inode->
36999+ i_mapping));
37000+ if (result)
37001+ break;
37002+ unlock_page(pages[i]);
37003+ result = find_or_create_extent(pages[i]);
37004+ if (result)
37005+ break;
37006+ SetPageUptodate(pages[i]);
37007+ }
37008+ return result;
37009+}
37010+
37011+#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
37012+ * items */
37013+
37014+static int reserve_tail2extent_iteration(struct inode *inode)
37015+{
37016+ reiser4_block_nr unformatted_nodes;
37017+ reiser4_tree *tree;
37018+
37019+ tree = reiser4_tree_by_inode(inode);
37020+
37021+ /* number of unformatted nodes which will be created */
37022+ unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
37023+
37024+ /*
37025+ * space required for one iteration of tail->extent conversion:
37026+ *
37027+ * 1. kill N tail items
37028+ *
37029+ * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
37030+ *
37031+ * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
37032+ * extents) extent units.
37033+ *
37034+ * 4. drilling to the leaf level by coord_by_key()
37035+ *
37036+ * 5. possible update of stat-data
37037+ *
37038+ */
37039+ grab_space_enable();
37040+ return reiser4_grab_space
37041+ (2 * tree->height +
37042+ TAIL2EXTENT_PAGE_NUM +
37043+ TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
37044+ 1 + estimate_one_insert_item(tree) +
37045+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
37046+}
37047+
37048+/* clear the stat-data flag indicating that the file is being converted */
37049+static int complete_conversion(struct inode *inode)
37050+{
37051+ int result;
37052+
37053+ grab_space_enable();
37054+ result =
37055+ reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
37056+ BA_CAN_COMMIT);
37057+ if (result == 0) {
37058+ reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
37059+ result = reiser4_update_sd(inode);
37060+ }
37061+ if (result)
37062+ warning("vs-1696", "Failed to clear converting bit of %llu: %i",
37063+ (unsigned long long)get_inode_oid(inode), result);
37064+ return 0;
37065+}
37066+
37067+/**
37068+ * find_start
37069+ * @inode: inode of the file being converted
37070+ * @id: item plugin id to search for (FORMATTING_ID or EXTENT_POINTER_ID)
37071+ * @offset: in/out: search start; updated to where the first @id item was found
37072+ *
37073+ * this is used by tail2extent and extent2tail to detect where a previous
37074+ * uncompleted conversion stopped
37075+ */
37076+static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
37077+{
37078+ int result;
37079+ lock_handle lh;
37080+ coord_t coord;
37081+ struct unix_file_info *ufo;
37082+ int found;
37083+ reiser4_key key;
37084+
37085+ ufo = unix_file_inode_data(inode);
37086+ init_lh(&lh);
37087+ result = 0;
37088+ found = 0;
37089+ inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
37090+ do {
37091+ init_lh(&lh);
37092+ result = find_file_item_nohint(&coord, &lh, &key,
37093+ ZNODE_READ_LOCK, inode);
37094+
37095+ if (result == CBK_COORD_FOUND) {
37096+ if (coord.between == AT_UNIT) {
37097+ /*coord_clear_iplug(&coord); */
37098+ result = zload(coord.node);
37099+ if (result == 0) {
37100+ if (item_id_by_coord(&coord) == id)
37101+ found = 1;
37102+ else
37103+ item_plugin_by_coord(&coord)->s.
37104+ file.append_key(&coord,
37105+ &key);
37106+ zrelse(coord.node);
37107+ }
37108+ } else
37109+ result = RETERR(-ENOENT);
37110+ }
37111+ done_lh(&lh);
37112+ } while (result == 0 && !found);
37113+ *offset = get_key_offset(&key);
37114+ return result;
37115+}
37116+
37117+/**
37118+ * tail2extent
37119+ * @uf_info: unix-file-specific part of the inode to convert
37120+ *
37121+ * Converts the file body from tail items to extent items.
37122+ */
37123+int tail2extent(struct unix_file_info *uf_info)
37124+{
37125+ int result;
37126+ reiser4_key key; /* key of next byte to be moved to page */
37127+ char *p_data; /* data of page */
37128+ unsigned page_off = 0, /* offset within the page where to copy data */
37129+ count; /* number of bytes of item which can be
37130+ * copied to page */
37131+ struct page *pages[TAIL2EXTENT_PAGE_NUM];
37132+ struct page *page;
37133+ int done; /* set to 1 when all file is read */
37134+ char *item;
37135+ int i;
37136+ struct inode *inode;
37137+ int first_iteration;
37138+ int bytes;
37139+ __u64 offset;
37140+
37141+ assert("nikita-3362", ea_obtained(uf_info));
37142+ inode = unix_file_info_to_inode(uf_info);
37143+ assert("nikita-3412", !IS_RDONLY(inode));
37144+ assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
37145+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
37146+
37147+ offset = 0;
37148+ first_iteration = 1;
37149+ result = 0;
37150+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37151+ /*
37152+ * file is marked on disk as there was a conversion which did
37153+ * not complete due to either crash or some error. Find which
37154+ * offset tail conversion stopped at
37155+ */
37156+ result = find_start(inode, FORMATTING_ID, &offset);
37157+ if (result == -ENOENT) {
37158+ /* no tail items found, everything is converted */
37159+ uf_info->container = UF_CONTAINER_EXTENTS;
37160+ complete_conversion(inode);
37161+ return 0;
37162+ } else if (result != 0)
37163+ /* some other error */
37164+ return result;
37165+ first_iteration = 0;
37166+ }
37167+
37168+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
37169+
37170+ /* get key of first byte of a file */
37171+ inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
37172+
37173+ done = 0;
37174+ while (done == 0) {
37175+ memset(pages, 0, sizeof(pages));
37176+ result = reserve_tail2extent_iteration(inode);
37177+ if (result != 0)
37178+ goto out;
37179+ if (first_iteration) {
37180+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
37181+ reiser4_update_sd(inode);
37182+ first_iteration = 0;
37183+ }
37184+ bytes = 0;
37185+ for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
37186+ assert("vs-598",
37187+ (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
37188+ page = alloc_page(reiser4_ctx_gfp_mask_get());
37189+ if (!page) {
37190+ result = RETERR(-ENOMEM);
37191+ goto error;
37192+ }
37193+
37194+ page->index =
37195+ (unsigned long)(get_key_offset(&key) >>
37196+ PAGE_CACHE_SHIFT);
37197+ /*
37198+ * usually one who is going to longterm lock a znode (as
37199+ * find_file_item does, for instance) must not hold
37200+ * locked pages. However, there is an exception for
37201+ * the tail2extent case. Pages appearing here are not
37202+ * reachable by anyone else, they are clean and they do
37203+ * not have jnodes attached, so keeping them locked does
37204+ * not risk a deadlock
37205+ */
37206+ assert("vs-983", !PagePrivate(page));
37207+ reiser4_invalidate_pages(inode->i_mapping, page->index,
37208+ 1, 0);
37209+
37210+ for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
37211+ coord_t coord;
37212+ lock_handle lh;
37213+
37214+ /* get next item */
37215+ /* FIXME: we might want to readahead here */
37216+ init_lh(&lh);
37217+ result =
37218+ find_file_item_nohint(&coord, &lh, &key,
37219+ ZNODE_READ_LOCK,
37220+ inode);
37221+ if (result != CBK_COORD_FOUND) {
37222+ /*
37223+ * either an error happened or no items
37224+ * of the file were found
37225+ */
37226+ done_lh(&lh);
37227+ page_cache_release(page);
37228+ goto error;
37229+ }
37230+
37231+ if (coord.between == AFTER_UNIT) {
37232+ /*
37233+ * end of file is reached. Pad page
37234+ * with zeros
37235+ */
37236+ done_lh(&lh);
37237+ done = 1;
37238+ p_data = kmap_atomic(page, KM_USER0);
37239+ memset(p_data + page_off, 0,
37240+ PAGE_CACHE_SIZE - page_off);
37241+ kunmap_atomic(p_data, KM_USER0);
37242+ break;
37243+ }
37244+
37245+ result = zload(coord.node);
37246+ if (result) {
37247+ page_cache_release(page);
37248+ done_lh(&lh);
37249+ goto error;
37250+ }
37251+ assert("vs-856", coord.between == AT_UNIT);
37252+ item = ((char *)item_body_by_coord(&coord)) +
37253+ coord.unit_pos;
37254+
37255+ /* how many bytes to copy */
37256+ count =
37257+ item_length_by_coord(&coord) -
37258+ coord.unit_pos;
37259+ /* limit length of copy to end of page */
37260+ if (count > PAGE_CACHE_SIZE - page_off)
37261+ count = PAGE_CACHE_SIZE - page_off;
37262+
37263+ /*
37264+ * copy item (as much as will fit starting from
37265+ * the beginning of the item) into the page
37266+ */
37267+ p_data = kmap_atomic(page, KM_USER0);
37268+ memcpy(p_data + page_off, item, count);
37269+ kunmap_atomic(p_data, KM_USER0);
37270+
37271+ page_off += count;
37272+ bytes += count;
37273+ set_key_offset(&key,
37274+ get_key_offset(&key) + count);
37275+
37276+ zrelse(coord.node);
37277+ done_lh(&lh);
37278+ } /* end of loop which fills one page with the content
37279+ * of formatting items */
37280+
37281+ if (page_off) {
37282+ /* something was copied into page */
37283+ pages[i] = page;
37284+ } else {
37285+ page_cache_release(page);
37286+ assert("vs-1648", done == 1);
37287+ break;
37288+ }
37289+ } /* end of loop through pages of one conversion iteration */
37290+
37291+ if (i > 0) {
37292+ result = replace(inode, pages, i, bytes);
37293+ release_all_pages(pages, sizeof_array(pages));
37294+ if (result)
37295+ goto error;
37296+ /*
37297+ * We have to drop exclusive access to avoid deadlock
37298+ * which may happen because capture_unix_file, called
37299+ * by reiser4_writepages, requires getting non-exclusive
37300+ * access to a file. It is safe to drop EA in the middle
37301+ * of tail2extent conversion because write_unix_file,
37302+ * setattr_unix_file(truncate), mmap_unix_file,
37303+ * release_unix_file(extent2tail) checks if conversion
37304+ * is not in progress (see comments before
37305+ * get_exclusive_access_careful()).
37306+ * Other processes that acquire non-exclusive access
37307+ * (read_unix_file, reiser4_writepages, etc) should work
37308+ * on partially converted files.
37309+ */
37310+ drop_exclusive_access(uf_info);
37311+ /* throttle the conversion */
37312+ reiser4_throttle_write(inode);
37313+ get_exclusive_access(uf_info);
37314+
37315+ /*
37316+ * nobody is allowed to complete conversion but a
37317+ * process which started it
37318+ */
37319+ assert("", reiser4_inode_get_flag(inode,
37320+ REISER4_PART_MIXED));
37321+ }
37322+ }
37323+
37324+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37325+
37326+ if (result == 0) {
37327+ /* file is converted to extent items */
37328+ assert("vs-1697", reiser4_inode_get_flag(inode,
37329+ REISER4_PART_MIXED));
37330+
37331+ uf_info->container = UF_CONTAINER_EXTENTS;
37332+ complete_conversion(inode);
37333+ } else {
37334+ /*
37335+ * conversion is not complete. Inode was already marked as
37336+ * REISER4_PART_MIXED and stat-data were updated at the first
37337+ * iteration of the loop above.
37338+ */
37339+ error:
37340+ release_all_pages(pages, sizeof_array(pages));
37341+ warning("nikita-2282", "Partial conversion of %llu: %i",
37342+ (unsigned long long)get_inode_oid(inode), result);
37343+ }
37344+
37345+ out:
37346+ return result;
37347+}
37348+
37349+static int reserve_extent2tail_iteration(struct inode *inode)
37350+{
37351+ reiser4_tree *tree;
37352+
37353+ tree = reiser4_tree_by_inode(inode);
37354+ /*
37355+ * reserve blocks for (in this order):
37356+ *
37357+ * 1. removal of extent item
37358+ *
37359+ * 2. insertion of tail by insert_flow()
37360+ *
37361+ * 3. drilling to the leaf level by coord_by_key()
37362+ *
37363+ * 4. possible update of stat-data
37364+ */
37365+ grab_space_enable();
37366+ return reiser4_grab_space
37367+ (estimate_one_item_removal(tree) +
37368+ estimate_insert_flow(tree->height) +
37369+ 1 + estimate_one_insert_item(tree) +
37370+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
37371+}
37372+
37373+/* for every page of file: read page, cut part of extent pointing to this page,
37374+ put data of the page into the tree as tail items */
37375+int extent2tail(struct file * file, struct unix_file_info *uf_info)
37376+{
37377+ int result;
37378+ struct inode *inode;
37379+ struct page *page;
37380+ unsigned long num_pages, i;
37381+ unsigned long start_page;
37382+ reiser4_key from;
37383+ reiser4_key to;
37384+ unsigned count;
37385+ __u64 offset;
37386+
37387+ assert("nikita-3362", ea_obtained(uf_info));
37388+ inode = unix_file_info_to_inode(uf_info);
37389+ assert("nikita-3412", !IS_RDONLY(inode));
37390+ assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
37391+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
37392+
37393+ offset = 0;
37394+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37395+ /*
37396+ * file is marked on disk as there was a conversion which did
37397+ * not complete due to either crash or some error. Find which
37398+ * offset the conversion stopped at
37399+ */
37400+ result = find_start(inode, EXTENT_POINTER_ID, &offset);
37401+ if (result == -ENOENT) {
37402+ /* no extent found, everything is converted */
37403+ uf_info->container = UF_CONTAINER_TAILS;
37404+ complete_conversion(inode);
37405+ return 0;
37406+ } else if (result != 0)
37407+ /* some other error */
37408+ return result;
37409+ }
37410+
37411+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
37412+
37413+ /* number of pages in the file */
37414+ num_pages =
37415+ (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
37416+ start_page = offset >> PAGE_CACHE_SHIFT;
37417+
37418+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
37419+ to = from;
37420+
37421+ result = 0;
37422+ for (i = 0; i < num_pages; i++) {
37423+ __u64 start_byte;
37424+
37425+ result = reserve_extent2tail_iteration(inode);
37426+ if (result != 0)
37427+ break;
37428+ if (i == 0 && offset == 0) {
37429+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
37430+ reiser4_update_sd(inode);
37431+ }
37432+
37433+ page = read_mapping_page(inode->i_mapping,
37434+ (unsigned)(i + start_page), NULL);
37435+ if (IS_ERR(page)) {
37436+ result = PTR_ERR(page);
37437+ break;
37438+ }
37439+
37440+ wait_on_page_locked(page);
37441+
37442+ if (!PageUptodate(page)) {
37443+ page_cache_release(page);
37444+ result = RETERR(-EIO);
37445+ break;
37446+ }
37447+
37448+ /* cut part of file we have read */
37449+ start_byte = (__u64) ((i + start_page) << PAGE_CACHE_SHIFT);
37450+ set_key_offset(&from, start_byte);
37451+ set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
37452+ /*
37453+ * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
37454+ * commits during over-long truncates. But
37455+ * extent->tail conversion should be performed in one
37456+ * transaction.
37457+ */
37458+ result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
37459+ &to, inode, 0);
37460+
37461+ if (result) {
37462+ page_cache_release(page);
37463+ break;
37464+ }
37465+
37466+ /* put page data into tree via tail_write */
37467+ count = PAGE_CACHE_SIZE;
37468+ if ((i == (num_pages - 1)) &&
37469+ (inode->i_size & ~PAGE_CACHE_MASK))
37470+ /* last page can be incomplete */
37471+ count = (inode->i_size & ~PAGE_CACHE_MASK);
37472+ while (count) {
37473+ loff_t pos = start_byte;
37474+
37475+ assert("edward-1533",
37476+ file != NULL && file->f_dentry != NULL);
37477+ assert("edward-1534",
37478+ file->f_dentry->d_inode == inode);
37479+
37480+ result = reiser4_write_tail(file,
37481+ (char __user *)kmap(page),
37482+ count, &pos);
37483+ reiser4_free_file_fsdata(file);
37484+ if (result <= 0) {
37485+ warning("", "reiser4_write_tail failed");
37486+ page_cache_release(page);
37487+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37488+ return result;
37489+ }
37490+ count -= result;
37491+ }
37492+
37493+ /* release page */
37494+ lock_page(page);
37495+ /* page is already detached from jnode and mapping. */
37496+ assert("vs-1086", page->mapping == NULL);
37497+ assert("nikita-2690",
37498+ (!PagePrivate(page) && jprivate(page) == 0));
37499+ /* waiting for writeback completion with page lock held is
37500+ * perfectly valid. */
37501+ wait_on_page_writeback(page);
37502+ reiser4_drop_page(page);
37503+ /* release reference taken by read_cache_page() above */
37504+ page_cache_release(page);
37505+
37506+ drop_exclusive_access(uf_info);
37507+ /* throttle the conversion */
37508+ reiser4_throttle_write(inode);
37509+ get_exclusive_access(uf_info);
37510+ /*
37511+ * nobody is allowed to complete conversion but a process which
37512+ * started it
37513+ */
37514+ assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
37515+ }
37516+
37517+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37518+
37519+ if (i == num_pages) {
37520+ /* file is converted to formatted items */
37521+ assert("vs-1698", reiser4_inode_get_flag(inode,
37522+ REISER4_PART_MIXED));
37523+ assert("vs-1260",
37524+ inode_has_no_jnodes(reiser4_inode_data(inode)));
37525+
37526+ uf_info->container = UF_CONTAINER_TAILS;
37527+ complete_conversion(inode);
37528+ return 0;
37529+ }
37530+ /*
37531+ * conversion is not complete. Inode was already marked as
37532+ * REISER4_PART_MIXED and stat-data were updated at the first
37533+ * iteration of the loop above.
37534+ */
37535+ warning("nikita-2282",
37536+ "Partial conversion of %llu: %lu of %lu: %i",
37537+ (unsigned long long)get_inode_oid(inode), i,
37538+ num_pages, result);
37539+
37540+ return result;
37541+}
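Both conversion routines above share one loop skeleton: reserve space for a bounded chunk, convert it, then briefly give up exclusive access so readers and writeback can make progress. A schematic restatement with hypothetical stand-ins for the per-direction work (not part of the patch):

/* toy_done(), reserve_iteration() and convert_one_chunk() are stand-ins
 * for the direction-specific code in tail2extent()/extent2tail() above. */
static int toy_done(struct inode *inode);
static int reserve_iteration(struct inode *inode);
static int convert_one_chunk(struct inode *inode);

static int toy_convert_loop(struct inode *inode, struct unix_file_info *uf_info)
{
	int result = 0;

	while (!toy_done(inode)) {
		result = reserve_iteration(inode);	/* grab space for one chunk */
		if (result)
			break;
		result = convert_one_chunk(inode);	/* cut old items, insert new */
		if (result)
			break;
		drop_exclusive_access(uf_info);		/* let readers/writeback in */
		reiser4_throttle_write(inode);
		get_exclusive_access(uf_info);		/* resume under EA */
	}
	return result;
}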
37542+
37543+/*
37544+ * Local variables:
37545+ * c-indentation-style: "K&R"
37546+ * mode-name: "LC"
37547+ * c-basic-offset: 8
37548+ * tab-width: 8
37549+ * fill-column: 79
37550+ * scroll-step: 1
37551+ * End:
37552+ */
37553diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file_ops.c linux-2.6.23/fs/reiser4/plugin/file_ops.c
37554--- linux-2.6.23.orig/fs/reiser4/plugin/file_ops.c 1970-01-01 03:00:00.000000000 +0300
37555+++ linux-2.6.23/fs/reiser4/plugin/file_ops.c 2007-12-04 16:49:30.000000000 +0300
37556@@ -0,0 +1,205 @@
37557+/* Copyright 2005 by Hans Reiser, licensing governed by
37558+ reiser4/README */
37559+
37560+/* this file contains typical implementations for some of methods of
37561+ struct file_operations and of struct address_space_operations
37562+*/
37563+
37564+#include "../inode.h"
37565+#include "object.h"
37566+
37567+/* file operations */
37568+
37569+/* implementation of vfs's llseek method of struct file_operations for
37570+ typical directory can be found in readdir_common.c
37571+*/
37572+loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin);
37573+
37574+/* implementation of vfs's readdir method of struct file_operations for
37575+ typical directory can be found in readdir_common.c
37576+*/
37577+int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
37578+
37579+/**
37580+ * reiser4_release_dir_common - release method of struct file_operations
37581+ * @inode: inode of released file
37582+ * @file: file to release
37583+ *
37584+ * Implementation of release method of struct file_operations for typical
37585+ * directory. All it does is free reiser4-specific file data.
37586+*/
37587+int reiser4_release_dir_common(struct inode *inode, struct file *file)
37588+{
37589+ reiser4_context *ctx;
37590+
37591+ ctx = reiser4_init_context(inode->i_sb);
37592+ if (IS_ERR(ctx))
37593+ return PTR_ERR(ctx);
37594+ reiser4_free_file_fsdata(file);
37595+ reiser4_exit_context(ctx);
37596+ return 0;
37597+}
37598+
37599+/* this is common implementation of vfs's fsync method of struct
37600+ file_operations
37601+*/
37602+int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync)
37603+{
37604+ reiser4_context *ctx;
37605+ int result;
37606+
37607+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
37608+ if (IS_ERR(ctx))
37609+ return PTR_ERR(ctx);
37610+ result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
37611+
37612+ context_set_commit_async(ctx);
37613+ reiser4_exit_context(ctx);
37614+ return result;
37615+}
37616+
37617+/*
37618+ * common sync method for regular files.
37619+ *
37620+ * We are trying to be smart here. Instead of committing all atoms (original
37621+ * solution), we scan dirty pages of this file and commit all atoms they are
37622+ * part of.
37623+ *
37624+ * Situation is complicated by anonymous pages: i.e., extent-less pages
37625+ * dirtied through mmap. Fortunately sys_fsync() first calls
37626+ * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
37627+ * all missing extents and capture anonymous pages.
37628+ */
37629+int reiser4_sync_file_common(struct file *file,
37630+ struct dentry *dentry, int datasync)
37631+{
37632+ reiser4_context *ctx;
37633+ txn_atom *atom;
37634+ reiser4_block_nr reserve;
37635+
37636+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
37637+ if (IS_ERR(ctx))
37638+ return PTR_ERR(ctx);
37639+
37640+ reserve = estimate_update_common(dentry->d_inode);
37641+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
37642+ reiser4_exit_context(ctx);
37643+ return RETERR(-ENOSPC);
37644+ }
37645+ write_sd_by_inode_common(dentry->d_inode);
37646+
37647+ atom = get_current_atom_locked();
37648+ spin_lock_txnh(ctx->trans);
37649+ force_commit_atom(ctx->trans);
37650+ reiser4_exit_context(ctx);
37651+ return 0;
37652+}
37653+
37654+/* this is common implementation of vfs's sendfile method of struct
37655+ file_operations
37656+
37657+ Reads @count bytes from @file and calls @actor for every page read. This is
37658+ needed for loopback device support.
37659+*/
37660+#if 0
37661+ssize_t
37662+sendfile_common(struct file *file, loff_t *ppos, size_t count,
37663+ read_actor_t actor, void *target)
37664+{
37665+ reiser4_context *ctx;
37666+ ssize_t result;
37667+
37668+ ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
37669+ if (IS_ERR(ctx))
37670+ return PTR_ERR(ctx);
37671+ result = generic_file_sendfile(file, ppos, count, actor, target);
37672+ reiser4_exit_context(ctx);
37673+ return result;
37674+}
37675+#endif /* 0 */
37676+
37677+/* address space operations */
37678+
37679+/* this is common implementation of vfs's prepare_write method of struct
37680+ address_space_operations
37681+*/
37682+int
37683+prepare_write_common(struct file *file, struct page *page, unsigned from,
37684+ unsigned to)
37685+{
37686+ reiser4_context *ctx;
37687+ int result;
37688+
37689+ ctx = reiser4_init_context(page->mapping->host->i_sb);
37690+ result = do_prepare_write(file, page, from, to);
37691+
37692+ /* don't commit transaction under inode semaphore */
37693+ context_set_commit_async(ctx);
37694+ reiser4_exit_context(ctx);
37695+
37696+ return result;
37697+}
37698+
37699+/* this is helper for prepare_write_common and prepare_write_unix_file
37700+ */
37701+int
37702+do_prepare_write(struct file *file, struct page *page, unsigned from,
37703+ unsigned to)
37704+{
37705+ int result;
37706+ file_plugin *fplug;
37707+ struct inode *inode;
37708+
37709+ assert("umka-3099", file != NULL);
37710+ assert("umka-3100", page != NULL);
37711+ assert("umka-3095", PageLocked(page));
37712+
37713+ if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
37714+ return 0;
37715+
37716+ inode = page->mapping->host;
37717+ fplug = inode_file_plugin(inode);
37718+
37719+ if (page->mapping->a_ops->readpage == NULL)
37720+ return RETERR(-EINVAL);
37721+
37722+ result = page->mapping->a_ops->readpage(file, page);
37723+ if (result != 0) {
37724+ SetPageError(page);
37725+ ClearPageUptodate(page);
37726+ /* All reiser4 readpage() implementations should return the
37727+ * page locked in case of error. */
37728+ assert("nikita-3472", PageLocked(page));
37729+ } else {
37730+ /*
37731+ * ->readpage() either:
37732+ *
37733+ * 1. starts IO against @page. @page is locked for IO in
37734+ * this case.
37735+ *
37736+ * 2. doesn't start IO. @page is unlocked.
37737+ *
37738+ * In either case the page must end up locked, hence lock_page().
37739+ */
37740+ lock_page(page);
37741+ /*
37742+ * IO (if any) is completed at this point. Check for IO
37743+ * errors.
37744+ */
37745+ if (!PageUptodate(page))
37746+ result = RETERR(-EIO);
37747+ }
37748+ assert("umka-3098", PageLocked(page));
37749+ return result;
37750+}
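The readpage-then-lock idiom above is worth isolating: lock_page() doubles as a wait for read IO, after which PageUptodate() distinguishes success from failure. A minimal sketch of the same pattern (assumes the mapping provides ->readpage and that @page is locked on entry, as ->readpage requires):

static int read_and_lock(struct file *file, struct page *page)
{
	int err = page->mapping->a_ops->readpage(file, page);

	if (err)
		return err;
	lock_page(page);	/* blocks until read IO (if any) completes */
	return PageUptodate(page) ? 0 : -EIO;
}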
37751+
37752+/*
37753+ * Local variables:
37754+ * c-indentation-style: "K&R"
37755+ * mode-name: "LC"
37756+ * c-basic-offset: 8
37757+ * tab-width: 8
37758+ * fill-column: 79
37759+ * scroll-step: 1
37760+ * End:
37761+ */
37762diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file_ops_readdir.c linux-2.6.23/fs/reiser4/plugin/file_ops_readdir.c
37763--- linux-2.6.23.orig/fs/reiser4/plugin/file_ops_readdir.c 1970-01-01 03:00:00.000000000 +0300
37764+++ linux-2.6.23/fs/reiser4/plugin/file_ops_readdir.c 2007-12-04 16:49:30.000000000 +0300
37765@@ -0,0 +1,658 @@
37766+/* Copyright 2005 by Hans Reiser, licensing governed by
37767+ * reiser4/README */
37768+
37769+#include "../inode.h"
37770+
37771+/* return true iff @coord points to a valid directory item that is part of
37772+ * @inode directory. */
37773+static int is_valid_dir_coord(struct inode *inode, coord_t * coord)
37774+{
37775+ return plugin_of_group(item_plugin_by_coord(coord),
37776+ DIR_ENTRY_ITEM_TYPE) &&
37777+ inode_file_plugin(inode)->owns_item(inode, coord);
37778+}
37779+
37780+/* compare two logical positions within the same directory */
37781+static cmp_t dir_pos_cmp(const struct dir_pos * p1, const struct dir_pos * p2)
37782+{
37783+ cmp_t result;
37784+
37785+ assert("nikita-2534", p1 != NULL);
37786+ assert("nikita-2535", p2 != NULL);
37787+
37788+ result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
37789+ if (result == EQUAL_TO) {
37790+ int diff;
37791+
37792+ diff = p1->pos - p2->pos;
37793+ result =
37794+ (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
37795+ }
37796+ return result;
37797+}
37798+
37799+/* see comment before reiser4_readdir_common() for overview of why "adjustment" is
37800+ * necessary. */
37801+static void
37802+adjust_dir_pos(struct file *dir, struct readdir_pos * readdir_spot,
37803+ const struct dir_pos * mod_point, int adj)
37804+{
37805+ struct dir_pos *pos;
37806+
37807+ /*
37808+ * new directory entry was added (adj == +1) or removed (adj == -1) at
37809+ * the @mod_point. Directory file descriptor @dir is doing readdir and
37810+ * is currently positioned at @readdir_spot. Latter has to be updated
37811+ * to maintain stable readdir.
37812+ */
37813+ /* directory is positioned to the beginning. */
37814+ if (readdir_spot->entry_no == 0)
37815+ return;
37816+
37817+ pos = &readdir_spot->position;
37818+ switch (dir_pos_cmp(mod_point, pos)) {
37819+ case LESS_THAN:
37820+ /* @mod_point is _before_ @readdir_spot, that is, entry was
37821+ * added/removed on the left (in key order) of current
37822+ * position. */
37823+ /* logical number of directory entry readdir is "looking" at
37824+ * changes */
37825+ readdir_spot->entry_no += adj;
37826+ assert("nikita-2577",
37827+ ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
37828+ if (de_id_cmp(&pos->dir_entry_key,
37829+ &mod_point->dir_entry_key) == EQUAL_TO) {
37830+ assert("nikita-2575", mod_point->pos < pos->pos);
37831+ /*
37832+ * if entry added/removed has the same key as current
37833+ * for readdir, update counter of duplicate keys in
37834+ * @readdir_spot.
37835+ */
37836+ pos->pos += adj;
37837+ }
37838+ break;
37839+ case GREATER_THAN:
37840+ /* directory is modified after @pos: nothing to do. */
37841+ break;
37842+ case EQUAL_TO:
37843+ /* cannot insert an entry readdir is looking at, because it
37844+ already exists. */
37845+ assert("nikita-2576", adj < 0);
37846+ /* directory entry to which @pos points to is being
37847+ removed.
37848+
37849+ NOTE-NIKITA: Right thing to do is to update @pos to point
37850+ to the next entry. This is complex (we are under spin-lock
37851+ for one thing). Just rewind it to the beginning. Next
37852+ readdir will have to scan the beginning of
37853+ directory. Proper solution is to use semaphore in
37854+ spin lock's stead and use rewind_right() here.
37855+
37856+ NOTE-NIKITA: now, semaphore is used, so...
37857+ */
37858+ memset(readdir_spot, 0, sizeof *readdir_spot);
37859+ }
37860+}
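Stripped of key comparison and duplicate-key handling, the adjustment above is just a shift of the logical position. A toy restatement (not part of the patch):

/* An entry was inserted (adj == +1) or removed (adj == -1) at logical
 * position @at; shift @entry_no so it keeps naming the same entry. */
static void toy_adjust(unsigned long *entry_no, unsigned long at, int adj)
{
	if (at < *entry_no)
		*entry_no += adj;
}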
37861+
37862+/* scan all file-descriptors for this directory and adjust their
37863+ positions respectively. Should be used by implementations of
37864+ add_entry and rem_entry of dir plugin */
37865+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
37866+ int offset, int adj)
37867+{
37868+ reiser4_file_fsdata *scan;
37869+ struct dir_pos mod_point;
37870+
37871+ assert("nikita-2536", dir != NULL);
37872+ assert("nikita-2538", de != NULL);
37873+ assert("nikita-2539", adj != 0);
37874+
37875+ build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
37876+ mod_point.pos = offset;
37877+
37878+ spin_lock_inode(dir);
37879+
37880+ /*
37881+ * new entry was added/removed in directory @dir. Scan all file
37882+ * descriptors for @dir that are currently involved into @readdir and
37883+ * update them.
37884+ */
37885+
37886+ list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
37887+ adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
37888+
37889+ spin_unlock_inode(dir);
37890+}
37891+
37892+/*
37893+ * traverse tree to start/continue readdir from the readdir position @pos.
37894+ */
37895+static int dir_go_to(struct file *dir, struct readdir_pos * pos, tap_t * tap)
37896+{
37897+ reiser4_key key;
37898+ int result;
37899+ struct inode *inode;
37900+
37901+ assert("nikita-2554", pos != NULL);
37902+
37903+ inode = dir->f_dentry->d_inode;
37904+ result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
37905+ if (result != 0)
37906+ return result;
37907+ result = reiser4_object_lookup(inode,
37908+ &key,
37909+ tap->coord,
37910+ tap->lh,
37911+ tap->mode,
37912+ FIND_EXACT,
37913+ LEAF_LEVEL, LEAF_LEVEL,
37914+ 0, &tap->ra_info);
37915+ if (result == CBK_COORD_FOUND)
37916+ result = rewind_right(tap, (int)pos->position.pos);
37917+ else {
37918+ tap->coord->node = NULL;
37919+ done_lh(tap->lh);
37920+ result = RETERR(-EIO);
37921+ }
37922+ return result;
37923+}
37924+
37925+/*
37926+ * handling of non-unique keys: calculate at what ordinal position within
37927+ * sequence of directory items with identical keys @pos is.
37928+ */
37929+static int set_pos(struct inode *inode, struct readdir_pos * pos, tap_t * tap)
37930+{
37931+ int result;
37932+ coord_t coord;
37933+ lock_handle lh;
37934+ tap_t scan;
37935+ de_id *did;
37936+ reiser4_key de_key;
37937+
37938+ coord_init_zero(&coord);
37939+ init_lh(&lh);
37940+ reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
37941+ reiser4_tap_copy(&scan, tap);
37942+ reiser4_tap_load(&scan);
37943+ pos->position.pos = 0;
37944+
37945+ did = &pos->position.dir_entry_key;
37946+
37947+ if (is_valid_dir_coord(inode, scan.coord)) {
37948+
37949+ build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
37950+
37951+ while (1) {
37952+
37953+ result = go_prev_unit(&scan);
37954+ if (result != 0)
37955+ break;
37956+
37957+ if (!is_valid_dir_coord(inode, scan.coord)) {
37958+ result = -EINVAL;
37959+ break;
37960+ }
37961+
37962+ /* get key of directory entry */
37963+ unit_key_by_coord(scan.coord, &de_key);
37964+ if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
37965+ /* duplicate-sequence is over */
37966+ break;
37967+ }
37968+ pos->position.pos++;
37969+ }
37970+ } else
37971+ result = RETERR(-ENOENT);
37972+ reiser4_tap_relse(&scan);
37973+ reiser4_tap_done(&scan);
37974+ return result;
37975+}
37976+
37977+/*
37978+ * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
37979+ */
37980+static int dir_rewind(struct file *dir, struct readdir_pos * pos, tap_t * tap)
37981+{
37982+ __u64 destination;
37983+ __s64 shift;
37984+ int result;
37985+ struct inode *inode;
37986+ loff_t dirpos;
37987+
37988+ assert("nikita-2553", dir != NULL);
37989+ assert("nikita-2548", pos != NULL);
37990+ assert("nikita-2551", tap->coord != NULL);
37991+ assert("nikita-2552", tap->lh != NULL);
37992+
37993+ dirpos = reiser4_get_dir_fpos(dir);
37994+ shift = dirpos - pos->fpos;
37995+ /* this is logical directory entry within @dir which we are rewinding
37996+ * to */
37997+ destination = pos->entry_no + shift;
37998+
37999+ inode = dir->f_dentry->d_inode;
38000+ if (dirpos < 0)
38001+ return RETERR(-EINVAL);
38002+ else if (destination == 0ll || dirpos == 0) {
38003+ /* rewind to the beginning of directory */
38004+ memset(pos, 0, sizeof *pos);
38005+ return dir_go_to(dir, pos, tap);
38006+ } else if (destination >= inode->i_size)
38007+ return RETERR(-ENOENT);
38008+
38009+ if (shift < 0) {
38010+ /* I am afraid of negative numbers */
38011+ shift = -shift;
38012+ /* rewinding to the left */
38013+ if (shift <= (int)pos->position.pos) {
38014+ /* destination is within sequence of entries with
38015+ duplicate keys. */
38016+ result = dir_go_to(dir, pos, tap);
38017+ } else {
38018+ shift -= pos->position.pos;
38019+ while (1) {
38020+ /* repetitions: deadlock is possible when
38021+ going to the left. */
38022+ result = dir_go_to(dir, pos, tap);
38023+ if (result == 0) {
38024+ result = rewind_left(tap, shift);
38025+ if (result == -E_DEADLOCK) {
38026+ reiser4_tap_done(tap);
38027+ continue;
38028+ }
38029+ }
38030+ break;
38031+ }
38032+ }
38033+ } else {
38034+ /* rewinding to the right */
38035+ result = dir_go_to(dir, pos, tap);
38036+ if (result == 0)
38037+ result = rewind_right(tap, shift);
38038+ }
38039+ if (result == 0) {
38040+ result = set_pos(inode, pos, tap);
38041+ if (result == 0) {
38042+ /* update pos->position.pos */
38043+ pos->entry_no = destination;
38044+ pos->fpos = dirpos;
38045+ }
38046+ }
38047+ return result;
38048+}
38049+
38050+/*
38051+ * Function that is called by common_readdir() on each directory entry while
38052+ * doing readdir. ->filldir callback may block, so we have to release the long-term
38053+ * lock while calling it. To avoid repeating tree traversal, seal is used. If
38054+ * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
38055+ *
38056+ * Whether node is unlocked in case of any other error is undefined. It is
38057+ * guaranteed to be still locked if success (0) is returned.
38058+ *
38059+ * When ->filldir() wants no more, feed_entry() returns 1, and node is
38060+ * unlocked.
38061+ */
38062+static int
38063+feed_entry(struct file *f, struct readdir_pos * pos, tap_t * tap,
38064+ filldir_t filldir, void *dirent)
38065+{
38066+ item_plugin *iplug;
38067+ char *name;
38068+ reiser4_key sd_key;
38069+ int result;
38070+ char buf[DE_NAME_BUF_LEN];
38071+ char name_buf[32];
38072+ char *local_name;
38073+ unsigned file_type;
38074+ seal_t seal;
38075+ coord_t *coord;
38076+ reiser4_key entry_key;
38077+
38078+ coord = tap->coord;
38079+ iplug = item_plugin_by_coord(coord);
38080+
38081+ /* pointer to name within the node */
38082+ name = iplug->s.dir.extract_name(coord, buf);
38083+ assert("nikita-1371", name != NULL);
38084+
38085+ /* key of object the entry points to */
38086+ if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
38087+ return RETERR(-EIO);
38088+
38089+ /* we must release longterm znode lock before calling filldir to avoid
38090+ deadlock which may happen if filldir causes page fault. So, copy
38091+ name to intermediate buffer */
38092+ if (strlen(name) + 1 > sizeof(name_buf)) {
38093+ local_name = kmalloc(strlen(name) + 1,
38094+ reiser4_ctx_gfp_mask_get());
38095+ if (local_name == NULL)
38096+ return RETERR(-ENOMEM);
38097+ } else
38098+ local_name = name_buf;
38099+
38100+ strcpy(local_name, name);
38101+ file_type = iplug->s.dir.extract_file_type(coord);
38102+
38103+ unit_key_by_coord(coord, &entry_key);
38104+ reiser4_seal_init(&seal, coord, &entry_key);
38105+
38106+ longterm_unlock_znode(tap->lh);
38107+
38108+ /*
38109+ * send information about directory entry to the ->filldir() filler
38110+ * supplied to us by caller (VFS).
38111+ *
38112+ * ->filldir is entitled to do weird things. For example, ->filldir
38113+ * supplied by knfsd re-enters file system. Make sure no locks are
38114+ * held.
38115+ */
38116+ assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
38117+
38118+ reiser4_txn_restart_current();
38119+ result = filldir(dirent, name, (int)strlen(name),
38120+ /* offset of this entry */
38121+ f->f_pos,
38122+ /* inode number of object bound by this entry */
38123+ oid_to_uino(get_key_objectid(&sd_key)), file_type);
38124+ if (local_name != name_buf)
38125+ kfree(local_name);
38126+ if (result < 0)
38127+ /* ->filldir() is satisfied. (no space in buffer, IOW) */
38128+ result = 1;
38129+ else
38130+ result = reiser4_seal_validate(&seal, coord, &entry_key,
38131+ tap->lh, tap->mode,
38132+ ZNODE_LOCK_HIPRI);
38133+ return result;
38134+}
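The seal dance in feed_entry() is the general pattern for calling blocking code without holding long-term locks: capture, unlock, block, re-validate. In isolation (sketch using the calls visible above; parameter types as used in this file):

static int toy_seal_roundtrip(coord_t *coord, reiser4_key *key,
			      lock_handle *lh, znode_lock_mode mode)
{
	seal_t seal;

	reiser4_seal_init(&seal, coord, key);	/* remember node version */
	longterm_unlock_znode(lh);		/* now safe to block */
	/* ... code that may fault or block runs here, e.g. ->filldir() ... */
	/* non-zero result: node changed under us, caller must restart */
	return reiser4_seal_validate(&seal, coord, key, lh, mode,
				     ZNODE_LOCK_HIPRI);
}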
38135+
38136+static void move_entry(struct readdir_pos * pos, coord_t * coord)
38137+{
38138+ reiser4_key de_key;
38139+ de_id *did;
38140+
38141+ /* update @pos */
38142+ ++pos->entry_no;
38143+ did = &pos->position.dir_entry_key;
38144+
38145+ /* get key of directory entry */
38146+ unit_key_by_coord(coord, &de_key);
38147+
38148+ if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
38149+ /* we are within sequence of directory entries
38150+ with duplicate keys. */
38151+ ++pos->position.pos;
38152+ else {
38153+ pos->position.pos = 0;
38154+ build_de_id_by_key(&de_key, did);
38155+ }
38156+ ++pos->fpos;
38157+}
38158+
38159+/*
38160+ * STATELESS READDIR
38161+ *
38162+ * readdir support in reiser4 relies on ability to update readdir_pos embedded
38163+ * into reiser4_file_fsdata on each directory modification (name insertion and
38164+ * removal), see reiser4_readdir_common() function below. This obviously doesn't
38165+ * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
38166+ * across client READDIR requests for the same directory.
38167+ *
38168+ * To address this we maintain a "pool" of detached reiser4_file_fsdata
38169+ * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
38170+ * find detached reiser4_file_fsdata corresponding to previous readdir
38171+ * request. In other words, additional state is maintained on the
38172+ * server. (This is somewhat contrary to the design goals of NFS protocol.)
38173+ *
38174+ * To efficiently detect when our ->readdir() method is called by NFS server,
38175+ * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
38176+ * file_is_stateless() function).
38177+ *
38178+ * To find out d_cursor in the pool, we encode client id (cid) in the highest
38179+ * bits of NFS readdir cookie: when first readdir request comes to the given
38180+ * directory from the given client, cookie is set to 0. This situation is
38181+ * detected, global cid_counter is incremented, and stored in highest bits of
38182+ * all direntry offsets returned to the client, including last one. As the
38183+ * only valid readdir cookie is one obtained as direntry->offset, we are
38184+ * guaranteed that next readdir request (continuing current one) will have
38185+ * current cid in the highest bits of starting readdir cookie. All d_cursors
38186+ * are hashed into per-super-block hash table by (oid, cid) key.
38187+ *
38188+ * In addition d_cursors are placed into per-super-block radix tree where they
38189+ * are keyed by oid alone. This is necessary to efficiently remove them during
38190+ * rmdir.
38191+ *
38192+ * Finally, currently unused d_cursors are linked into a special list. This
38193+ * list is used by d_cursor_shrink to reclaim d_cursors under memory pressure.
38194+ *
38195+ */
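The cookie encoding described above can be made concrete. A sketch under an assumed split of 16 cid bits over 48 position bits (the real layout lives in the d_cursor code, which is not part of this hunk):

#define TOY_CID_SHIFT 48	/* assumption: 16 bits of cid, 48 of position */

static inline __u64 toy_pack_cookie(__u64 cid, __u64 pos)
{
	return (cid << TOY_CID_SHIFT) | (pos & ((1ULL << TOY_CID_SHIFT) - 1));
}

static inline __u64 toy_cookie_cid(__u64 cookie)
{
	return cookie >> TOY_CID_SHIFT;
}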
38196+
38197+/*
38198+ * prepare for readdir.
38199+ */
38200+static int dir_readdir_init(struct file *f, tap_t * tap,
38201+ struct readdir_pos ** pos)
38202+{
38203+ struct inode *inode;
38204+ reiser4_file_fsdata *fsdata;
38205+ int result;
38206+
38207+ assert("nikita-1359", f != NULL);
38208+ inode = f->f_dentry->d_inode;
38209+ assert("nikita-1360", inode != NULL);
38210+
38211+ if (!S_ISDIR(inode->i_mode))
38212+ return RETERR(-ENOTDIR);
38213+
38214+ /* try to find detached readdir state */
38215+ result = reiser4_attach_fsdata(f, inode);
38216+ if (result != 0)
38217+ return result;
38218+
38219+ fsdata = reiser4_get_file_fsdata(f);
38220+ assert("nikita-2571", fsdata != NULL);
38221+ if (IS_ERR(fsdata))
38222+ return PTR_ERR(fsdata);
38223+
38224+ /* add file descriptor to the readdir list hanging off the directory
38225+ * inode. This list is used to scan "readdirs-in-progress" while
38226+ * inserting or removing names in the directory. */
38227+ spin_lock_inode(inode);
38228+ if (list_empty_careful(&fsdata->dir.linkage))
38229+ list_add(&fsdata->dir.linkage, get_readdir_list(inode));
38230+ *pos = &fsdata->dir.readdir;
38231+ spin_unlock_inode(inode);
38232+
38233+ /* move @tap to the current position */
38234+ return dir_rewind(f, *pos, tap);
38235+}
38236+
38237+/* this is implementation of vfs's llseek method of struct file_operations for
38238+ typical directory
38239+ See comment before reiser4_readdir_common() for explanation.
38240+*/
38241+loff_t reiser4_llseek_dir_common(struct file * file, loff_t off, int origin)
38242+{
38243+ reiser4_context *ctx;
38244+ loff_t result;
38245+ struct inode *inode;
38246+
38247+ inode = file->f_dentry->d_inode;
38248+
38249+ ctx = reiser4_init_context(inode->i_sb);
38250+ if (IS_ERR(ctx))
38251+ return PTR_ERR(ctx);
38252+
38253+ mutex_lock(&inode->i_mutex);
38254+
38255+ /* update ->f_pos */
38256+ result = default_llseek(file, off, origin);
38257+ if (result >= 0) {
38258+ int ff;
38259+ coord_t coord;
38260+ lock_handle lh;
38261+ tap_t tap;
38262+ struct readdir_pos *pos;
38263+
38264+ coord_init_zero(&coord);
38265+ init_lh(&lh);
38266+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
38267+
38268+ ff = dir_readdir_init(file, &tap, &pos);
38269+ reiser4_detach_fsdata(file);
38270+ if (ff != 0)
38271+ result = (loff_t) ff;
38272+ reiser4_tap_done(&tap);
38273+ }
38274+ reiser4_detach_fsdata(file);
38275+ mutex_unlock(&inode->i_mutex);
38276+
38277+ reiser4_exit_context(ctx);
38278+ return result;
38279+}
38280+
38281+/* this is common implementation of vfs's readdir method of struct
38282+ file_operations
38283+
38284+ readdir problems:
38285+
38286+ readdir(2)/getdents(2) interface is based on implicit assumption that
38287+ readdir can be restarted from any particular point by supplying file system
38288+ with off_t-full of data. That is, file system fills ->d_off field in struct
38289+ dirent and later user passes ->d_off to the seekdir(3), which is, actually,
38290+ implemented by glibc as lseek(2) on directory.
38291+
38292+ Reiser4 cannot restart readdir from 64 bits of data, because the two last
38293+ components of the key of a directory entry are unknown, which gives 128 bits:
38294+ the locality and type fields in the key of a directory entry are always
38295+ known; to start readdir() from a given point the objectid and offset fields
38296+ have to be filled in.
38297+
38298+ Traditional UNIX API for scanning through directory
38299+ (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based on the
38300+ assumption that directory is structured very much like regular file, in
38301+ particular, it is implied that each name within given directory (directory
38302+ entry) can be uniquely identified by scalar offset and that such offset is
38303+ stable across the life-time of the name it identifies.
38304+
38305+ This is manifestly not so for reiser4. In reiser4 the only stable unique
38306+ identifier for a directory entry is its key, which doesn't fit into the
38307+ seekdir/telldir API.
38308+
38309+ solution:
38310+
38311+ Within each file descriptor participating in readdir-ing of directory
38312+ plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
38313+ the "current" directory entry that file descriptor looks at. It contains a
38314+ key of directory entry (plus some additional info to deal with non-unique
38315+ keys that we won't dwell on here) and a logical position of this
38316+ directory entry starting from the beginning of the directory, that is
38317+ ordinal number of this entry in the readdir order.
38318+
38319+ Obviously this logical position is not stable in the face of directory
38320+ modifications. To work around this, on each addition or removal of directory
38321+ entry all file descriptors for directory inode are scanned and their
38322+ readdir_pos are updated accordingly (adjust_dir_pos()).
38323+*/
38324+int reiser4_readdir_common(struct file *f /* directory file being read */,
38325+ void *dirent /* opaque data passed to us by VFS */,
38326+ filldir_t filld /* filler function passed to us
38327+ * by VFS */)
38328+{
38329+ reiser4_context *ctx;
38330+ int result;
38331+ struct inode *inode;
38332+ coord_t coord;
38333+ lock_handle lh;
38334+ tap_t tap;
38335+ struct readdir_pos *pos;
38336+
38337+ assert("nikita-1359", f != NULL);
38338+ inode = f->f_dentry->d_inode;
38339+ assert("nikita-1360", inode != NULL);
38340+
38341+ if (!S_ISDIR(inode->i_mode))
38342+ return RETERR(-ENOTDIR);
38343+
38344+ ctx = reiser4_init_context(inode->i_sb);
38345+ if (IS_ERR(ctx))
38346+ return PTR_ERR(ctx);
38347+
38348+ coord_init_zero(&coord);
38349+ init_lh(&lh);
38350+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
38351+
38352+ reiser4_readdir_readahead_init(inode, &tap);
38353+
38354+ repeat:
38355+ result = dir_readdir_init(f, &tap, &pos);
38356+ if (result == 0) {
38357+ result = reiser4_tap_load(&tap);
38358+ /* scan entries one by one feeding them to @filld */
38359+ while (result == 0) {
38360+ coord_t *coord;
38361+
38362+ coord = tap.coord;
38363+ assert("nikita-2572", coord_is_existing_unit(coord));
38364+ assert("nikita-3227", is_valid_dir_coord(inode, coord));
38365+
38366+ result = feed_entry(f, pos, &tap, filld, dirent);
38367+ if (result > 0) {
38368+ break;
38369+ } else if (result == 0) {
38370+ ++f->f_pos;
38371+ result = go_next_unit(&tap);
38372+ if (result == -E_NO_NEIGHBOR ||
38373+ result == -ENOENT) {
38374+ result = 0;
38375+ break;
38376+ } else if (result == 0) {
38377+ if (is_valid_dir_coord(inode, coord))
38378+ move_entry(pos, coord);
38379+ else
38380+ break;
38381+ }
38382+ } else if (result == -E_REPEAT) {
38383+ /* feed_entry() had to restart. */
38384+ ++f->f_pos;
38385+ reiser4_tap_relse(&tap);
38386+ goto repeat;
38387+ } else
38388+ warning("vs-1617",
38389+ "reiser4_readdir_common: unexpected error %d",
38390+ result);
38391+ }
38392+ reiser4_tap_relse(&tap);
38393+
38394+ if (result >= 0)
38395+ f->f_version = inode->i_version;
38396+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
38397+ result = 0;
38398+ reiser4_tap_done(&tap);
38399+ reiser4_detach_fsdata(f);
38400+
38401+ /* try to update directory's atime */
38402+ if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
38403+ BA_CAN_COMMIT) != 0)
38404+ warning("", "failed to update atime on readdir: %llu",
38405+ get_inode_oid(inode));
38406+ else
38407+ file_accessed(f);
38408+
38409+ context_set_commit_async(ctx);
38410+ reiser4_exit_context(ctx);
38411+
38412+ return (result <= 0) ? result : 0;
38413+}
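
The adjustment scheme described in the comment above reiser4_readdir_common()
is easy to see in miniature: each open directory file remembers a key plus an
ordinal position, and every entry insertion or removal walks the open cursors
and shifts the ordinals of those at or past the change (the real
adjust_dir_pos() additionally uses the stored key to handle a cursor sitting
on the removed entry itself). A minimal user-space sketch, with illustrative
names rather than the reiser4 API:

    #include <stddef.h>

    struct dir_cursor {
            unsigned long long entry_key;   /* stable key of "current" entry */
            unsigned long pos;              /* ordinal in readdir order */
            struct dir_cursor *next;        /* cursors open on this dir */
    };

    /* On insertion (delta = +1) or removal (delta = -1) of an entry at
     * ordinal position @where, shift every cursor standing at or past
     * that point so it keeps referring to the same entry. */
    static void adjust_cursors(struct dir_cursor *head, unsigned long where,
                               int delta)
    {
            struct dir_cursor *c;

            for (c = head; c != NULL; c = c->next)
                    if (c->pos >= where)
                            c->pos += delta;
    }
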
38414+
38415+/*
38416+ * Local variables:
38417+ * c-indentation-style: "K&R"
38418+ * mode-name: "LC"
38419+ * c-basic-offset: 8
38420+ * tab-width: 8
38421+ * fill-column: 79
38422+ * End:
38423+ */
38424diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file_plugin_common.c linux-2.6.23/fs/reiser4/plugin/file_plugin_common.c
38425--- linux-2.6.23.orig/fs/reiser4/plugin/file_plugin_common.c 1970-01-01 03:00:00.000000000 +0300
38426+++ linux-2.6.23/fs/reiser4/plugin/file_plugin_common.c 2007-12-04 22:59:05.806371984 +0300
38427@@ -0,0 +1,1007 @@
38428+/* Copyright 2005 by Hans Reiser, licensing governed by
38429+ reiser4/README */
38430+
38431+/* this file contains typical implementations for most of methods of
38432+ file plugin
38433+*/
38434+
38435+#include "../inode.h"
38436+#include "object.h"
38437+#include "../safe_link.h"
38438+
38439+#include <linux/quotaops.h>
38440+
38441+static int insert_new_sd(struct inode *inode);
38442+static int update_sd(struct inode *inode);
38443+
38444+/* this is common implementation of write_sd_by_inode method of file plugin
38445+ either insert stat data or update it
38446+ */
38447+int write_sd_by_inode_common(struct inode *inode /* object to save */ )
38448+{
38449+ int result;
38450+
38451+ assert("nikita-730", inode != NULL);
38452+
38453+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
38454+ /* object doesn't have stat-data yet */
38455+ result = insert_new_sd(inode);
38456+ else
38457+ result = update_sd(inode);
38458+ if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
38459+ /* Don't issue warnings about "name is too long" */
38460+ warning("nikita-2221", "Failed to save sd for %llu: %i",
38461+ (unsigned long long)get_inode_oid(inode), result);
38462+ return result;
38463+}
38464+
38465+/* this is common implementation of key_by_inode method of file plugin
38466+ */
38467+int
38468+key_by_inode_and_offset_common(struct inode *inode, loff_t off,
38469+ reiser4_key * key)
38470+{
38471+ reiser4_key_init(key);
38472+ set_key_locality(key, reiser4_inode_data(inode)->locality_id);
38473+ set_key_ordering(key, get_inode_ordering(inode));
38474+ set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */
38475+ set_key_type(key, KEY_BODY_MINOR);
38476+ set_key_offset(key, (__u64) off);
38477+ return 0;
38478+}
38479+
38480+/* this is common implementation of set_plug_in_inode method of file plugin
38481+ */
38482+int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
38483+ struct inode *parent /* parent object */ ,
38484+ reiser4_object_create_data * data /* creational
38485+ * data */ )
38486+{
38487+ __u64 mask;
38488+
38489+ object->i_mode = data->mode;
38490+ /* this should be plugin decision */
38491+ object->i_uid = current->fsuid;
38492+ object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
38493+
38494+ /* support for BSD style group-id assignment. See mount's manual page
38495+ description of bsdgroups ext2 mount options for more details */
38496+ if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
38497+ object->i_gid = parent->i_gid;
38498+ else if (parent->i_mode & S_ISGID) {
38499+ /* parent directory has the sgid bit set */
38500+ object->i_gid = parent->i_gid;
38501+ if (S_ISDIR(object->i_mode))
38502+ /* sgid is inherited by sub-directories */
38503+ object->i_mode |= S_ISGID;
38504+ } else
38505+ object->i_gid = current->fsgid;
38506+
38507+ /* this object doesn't have stat-data yet */
38508+ reiser4_inode_set_flag(object, REISER4_NO_SD);
38509+#if 0
38510+ /* this is now called after all inode plugins are initialized:
38511+ do_create_vfs_child after adjust_to_parent */
38512+ /* setup inode and file-operations for this inode */
38513+ setup_inode_ops(object, data);
38514+#endif
38515+ object->i_nlink = 0;
38516+ reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
38517+ mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
38518+ if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
38519+ mask |= (1 << LARGE_TIMES_STAT);
38520+
38521+ reiser4_inode_data(object)->extmask = mask;
38522+ return 0;
38523+}
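
The group-id assignment in set_plug_in_inode_common() reduces to a three-way
decision. A standalone sketch of the same logic (fsgid stands in for
current->fsgid; names are illustrative, not kernel code):

    #include <sys/types.h>
    #include <sys/stat.h>

    /* bsdgroups semantics: always inherit the parent's group. Otherwise
     * SysV semantics: inherit only when the parent has the setgid bit,
     * and propagate that bit to new sub-directories. */
    static gid_t choose_gid(int bsdgroups, mode_t parent_mode,
                            gid_t parent_gid, mode_t *new_mode, gid_t fsgid)
    {
            if (bsdgroups)
                    return parent_gid;
            if (parent_mode & S_ISGID) {
                    if (S_ISDIR(*new_mode))
                            *new_mode |= S_ISGID;
                    return parent_gid;
            }
            return fsgid;
    }
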
38524+
38525+/* this is common implementation of adjust_to_parent method of file plugin for
38526+ regular files
38527+ */
38528+int adjust_to_parent_common(struct inode *object /* new object */ ,
38529+ struct inode *parent /* parent directory */ ,
38530+ struct inode *root /* root directory */ )
38531+{
38532+ assert("nikita-2165", object != NULL);
38533+ if (parent == NULL)
38534+ parent = root;
38535+ assert("nikita-2069", parent != NULL);
38536+
38537+ /*
38538+ * inherit missing plugins from parent
38539+ */
38540+
38541+ grab_plugin_pset(object, parent, PSET_FILE);
38542+ grab_plugin_pset(object, parent, PSET_SD);
38543+ grab_plugin_pset(object, parent, PSET_FORMATTING);
38544+ grab_plugin_pset(object, parent, PSET_PERM);
38545+ return 0;
38546+}
38547+
38548+/* this is common implementation of adjust_to_parent method of file plugin for
38549+ typical directories
38550+ */
38551+int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
38552+ struct inode *parent /* parent directory */ ,
38553+ struct inode *root /* root directory */ )
38554+{
38555+ int result = 0;
38556+ pset_member memb;
38557+
38558+ assert("nikita-2166", object != NULL);
38559+ if (parent == NULL)
38560+ parent = root;
38561+ assert("nikita-2167", parent != NULL);
38562+
38563+ /*
38564+ * inherit missing plugins from parent
38565+ */
38566+ for (memb = 0; memb < PSET_LAST; ++memb) {
38567+ result = grab_plugin_pset(object, parent, memb);
38568+ if (result != 0)
38569+ break;
38570+ }
38571+ return result;
38572+}
38573+
38574+int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
38575+ struct inode *parent /* parent directory */,
38576+ struct inode *root /* root directory */)
38577+{
38578+ int result;
38579+ result = adjust_to_parent_common(object, parent, root);
38580+ if (result)
38581+ return result;
38582+ assert("edward-1416", parent != NULL);
38583+
38584+ grab_plugin_pset(object, parent, PSET_CLUSTER);
38585+ grab_plugin_pset(object, parent, PSET_CIPHER);
38586+ grab_plugin_pset(object, parent, PSET_DIGEST);
38587+ grab_plugin_pset(object, parent, PSET_COMPRESSION);
38588+ grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE);
38589+
38590+ return 0;
38591+}
38592+
38593+/* this is common implementation of create_object method of file plugin
38594+ */
38595+int reiser4_create_object_common(struct inode *object, struct inode *parent,
38596+ reiser4_object_create_data * data)
38597+{
38598+ reiser4_block_nr reserve;
38599+ assert("nikita-744", object != NULL);
38600+ assert("nikita-745", parent != NULL);
38601+ assert("nikita-747", data != NULL);
38602+ assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
38603+
38604+ reserve = estimate_create_common(object);
38605+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
38606+ return RETERR(-ENOSPC);
38607+ return write_sd_by_inode_common(object);
38608+}
38609+
38610+static int common_object_delete_no_reserve(struct inode *inode);
38611+
38612+/**
38613+ * reiser4_delete_object_common - delete_object of file_plugin
38614+ * @inode: inode to be deleted
38615+ *
38616+ * This is the common implementation of the delete_object method of
38617+ * file_plugin. It applies to objects whose deletion consists of removing
38618+ * two items - stat data and safe-link.
38619+ */
38620+int reiser4_delete_object_common(struct inode *inode)
38621+{
38622+ int result;
38623+
38624+ assert("nikita-1477", inode != NULL);
38625+ /* FIXME: if file body deletion failed (i/o error, for instance),
38626+ inode->i_size can be != 0 here */
38627+ assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
38628+ assert("nikita-3421", inode->i_nlink == 0);
38629+
38630+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
38631+ reiser4_block_nr reserve;
38632+
38633+ /* grab space which is needed to remove 2 items from the tree:
38634+ stat data and safe-link */
38635+ reserve = 2 *
38636+ estimate_one_item_removal(reiser4_tree_by_inode(inode));
38637+ if (reiser4_grab_space_force(reserve,
38638+ BA_RESERVED | BA_CAN_COMMIT))
38639+ return RETERR(-ENOSPC);
38640+ result = common_object_delete_no_reserve(inode);
38641+ } else
38642+ result = 0;
38643+ return result;
38644+}
38645+
38646+/**
38647+ * reiser4_delete_dir_common - delete_object of file_plugin
38648+ * @inode: inode to be deleted
38649+ *
38650+ * This is common implementation of delete_object method of file_plugin for
38651+ * typical directory. It calls done method of dir_plugin to remove "." and
38652+ * removes stat data and safe-link.
38653+ */
38654+int reiser4_delete_dir_common(struct inode *inode)
38655+{
38656+ int result;
38657+ dir_plugin *dplug;
38658+
38659+ assert("", (get_current_context() &&
38660+ get_current_context()->trans->atom == NULL));
38661+
38662+ dplug = inode_dir_plugin(inode);
38663+ assert("vs-1101", dplug && dplug->done);
38664+
38665+ /* kill cursors which might be attached to inode */
38666+ reiser4_kill_cursors(inode);
38667+
38668+ /* grab space enough for removing two items */
38669+ if (reiser4_grab_space
38670+ (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)),
38671+ BA_RESERVED | BA_CAN_COMMIT))
38672+ return RETERR(-ENOSPC);
38673+
38674+ result = dplug->done(inode);
38675+ if (!result)
38676+ result = common_object_delete_no_reserve(inode);
38677+ return result;
38678+}
38679+
38680+/* this is common implementation of add_link method of file plugin
38681+ */
38682+int reiser4_add_link_common(struct inode *object, struct inode *parent)
38683+{
38684+ /*
38685+ * increment ->i_nlink and update ->i_ctime
38686+ */
38687+
38688+ INODE_INC_FIELD(object, i_nlink);
38689+ object->i_ctime = CURRENT_TIME;
38690+ return 0;
38691+}
38692+
38693+/* this is common implementation of rem_link method of file plugin
38694+ */
38695+int reiser4_rem_link_common(struct inode *object, struct inode *parent)
38696+{
38697+ assert("nikita-2021", object != NULL);
38698+ assert("nikita-2163", object->i_nlink > 0);
38699+
38700+ /*
38701+ * decrement ->i_nlink and update ->i_ctime
38702+ */
38703+
38704+ INODE_DEC_FIELD(object, i_nlink);
38705+ object->i_ctime = CURRENT_TIME;
38706+ return 0;
38707+}
38708+
38709+/* this is common implementation of rem_link method of file plugin for typical
38710+ directory
38711+*/
38712+int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
38713+{
38714+ assert("nikita-20211", object != NULL);
38715+ assert("nikita-21631", object->i_nlink > 0);
38716+
38717+ /*
38718+ * decrement ->i_nlink and update ->i_ctime
38719+ */
38720+ INODE_DEC_FIELD(object, i_nlink);
38721+ if (object->i_nlink == 1)
38722+ INODE_DEC_FIELD(object, i_nlink);
38723+ object->i_ctime = CURRENT_TIME;
38724+ return 0;
38725+}
38726+
38727+/* this is common implementation of owns_item method of file plugin
38728+ compare objectids of keys in inode and coord */
38729+int owns_item_common(const struct inode *inode, /* object to check
38730+ * against */
38731+ const coord_t * coord /* coord to check */ )
38732+{
38733+ reiser4_key item_key;
38734+ reiser4_key file_key;
38735+
38736+ assert("nikita-760", inode != NULL);
38737+ assert("nikita-761", coord != NULL);
38738+
38739+ return coord_is_existing_item(coord) &&
38740+ (get_key_objectid(build_sd_key(inode, &file_key)) ==
38741+ get_key_objectid(item_key_by_coord(coord, &item_key)));
38742+}
38743+
38744+/* this is common implementation of owns_item method of file plugin
38745+ for typical directory
38746+*/
38747+int owns_item_common_dir(const struct inode *inode, /* object to check against */
38748+ const coord_t * coord /* coord of item to check */ )
38749+{
38750+ reiser4_key item_key;
38751+
38752+ assert("nikita-1335", inode != NULL);
38753+ assert("nikita-1334", coord != NULL);
38754+
38755+ if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
38756+ return get_key_locality(item_key_by_coord(coord, &item_key)) ==
38757+ get_inode_oid(inode);
38758+ else
38759+ return owns_item_common(inode, coord);
38760+}
38761+
38762+/* this is common implementation of can_add_link method of file plugin
38763+ checks whether yet another hard links to this object can be added
38764+*/
38765+int can_add_link_common(const struct inode *object /* object to check */ )
38766+{
38767+ assert("nikita-732", object != NULL);
38768+
38769+ /* inode->i_nlink is unsigned int, so just check for integer
38770+ overflow */
38771+ return object->i_nlink + 1 != 0;
38772+}
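
The test above relies on unsigned wrap-around: once i_nlink reaches the
maximal unsigned value, adding one yields zero, so "i_nlink + 1 != 0" rejects
a further link. A two-line demonstration:

    #include <assert.h>

    int main(void)
    {
            unsigned int nlink = ~0u;       /* link count at the maximum */

            assert(nlink + 1 == 0);         /* wraps; another link is refused */
            return 0;
    }
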
38773+
38774+/* this is common implementation of can_rem_link method of file plugin for
38775+ typical directory
38776+*/
38777+int can_rem_link_common_dir(const struct inode *inode)
38778+{
38779+ /* is_dir_empty() returns 0 if dir is empty */
38780+ return !is_dir_empty(inode);
38781+}
38782+
38783+/* this is common implementation of detach method of file plugin for typical
38784+ directory
38785+*/
38786+int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
38787+{
38788+ dir_plugin *dplug;
38789+
38790+ dplug = inode_dir_plugin(child);
38791+ assert("nikita-2883", dplug != NULL);
38792+ assert("nikita-2884", dplug->detach != NULL);
38793+ return dplug->detach(child, parent);
38794+}
38795+
38796+/* this is common implementation of bind method of file plugin for typical
38797+ directory
38798+*/
38799+int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
38800+{
38801+ dir_plugin *dplug;
38802+
38803+ dplug = inode_dir_plugin(child);
38804+ assert("nikita-2646", dplug != NULL);
38805+ return dplug->attach(child, parent);
38806+}
38807+
38808+static int process_truncate(struct inode *, __u64 size);
38809+
38810+/* this is common implementation of safelink method of file plugin
38811+ */
38812+int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
38813+{
38814+ int result;
38815+
38816+ assert("vs-1705", get_current_context()->trans->atom == NULL);
38817+ if (link == SAFE_UNLINK)
38818+ /* nothing to do. iput() in the caller (process_safelink) will
38819+ * finish with file */
38820+ result = 0;
38821+ else if (link == SAFE_TRUNCATE)
38822+ result = process_truncate(object, value);
38823+ else {
38824+ warning("nikita-3438", "Unrecognized safe-link type: %i", link);
38825+ result = RETERR(-EIO);
38826+ }
38827+ return result;
38828+}
38829+
38830+/* this is common implementation of estimate.create method of file plugin
38831+ can be used when object creation involves insertion of one item (usually stat
38832+ data) into tree
38833+*/
38834+reiser4_block_nr estimate_create_common(const struct inode * object)
38835+{
38836+ return estimate_one_insert_item(reiser4_tree_by_inode(object));
38837+}
38838+
38839+/* this is common implementation of estimate.create method of file plugin for
38840+ typical directory
38841+ can be used when directory creation involves insertion of two items (usually
38842+ stat data and item containing "." and "..") into tree
38843+*/
38844+reiser4_block_nr estimate_create_common_dir(const struct inode * object)
38845+{
38846+ return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object));
38847+}
38848+
38849+/* this is common implementation of estimate.update method of file plugin
38850+ can be used when stat data update does not do more than inserting a unit
38851+ into a stat data item which is probably true for most cases
38852+*/
38853+reiser4_block_nr estimate_update_common(const struct inode * inode)
38854+{
38855+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
38856+}
38857+
38858+/* this is common implementation of estimate.unlink method of file plugin
38859+ */
38860+reiser4_block_nr
38861+estimate_unlink_common(const struct inode * object UNUSED_ARG,
38862+ const struct inode * parent UNUSED_ARG)
38863+{
38864+ return 0;
38865+}
38866+
38867+/* this is common implementation of estimate.unlink method of file plugin for
38868+ typical directory
38869+*/
38870+reiser4_block_nr
38871+estimate_unlink_common_dir(const struct inode * object,
38872+ const struct inode * parent)
38873+{
38874+ dir_plugin *dplug;
38875+
38876+ dplug = inode_dir_plugin(object);
38877+ assert("nikita-2888", dplug != NULL);
38878+ assert("nikita-2887", dplug->estimate.unlink != NULL);
38879+ return dplug->estimate.unlink(object, parent);
38880+}
38881+
38882+char *wire_write_common(struct inode *inode, char *start)
38883+{
38884+ return build_inode_onwire(inode, start);
38885+}
38886+
38887+char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
38888+{
38889+ return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
38890+}
38891+
38892+struct dentry *wire_get_common(struct super_block *sb,
38893+ reiser4_object_on_wire * obj)
38894+{
38895+ struct inode *inode;
38896+ struct dentry *dentry;
38897+ reiser4_key key;
38898+
38899+ extract_key_from_id(&obj->u.std.key_id, &key);
38900+ inode = reiser4_iget(sb, &key, 1);
38901+ if (!IS_ERR(inode)) {
38902+ reiser4_iget_complete(inode);
38903+ dentry = d_alloc_anon(inode);
38904+ if (dentry == NULL) {
38905+ iput(inode);
38906+ dentry = ERR_PTR(-ENOMEM);
38907+ } else
38908+ dentry->d_op = &get_super_private(sb)->ops.dentry;
38909+ } else if (PTR_ERR(inode) == -ENOENT)
38910+ /*
38911+ * inode wasn't found at the key encoded in the file
38912+ * handle. Hence, file handle is stale.
38913+ */
38914+ dentry = ERR_PTR(RETERR(-ESTALE));
38915+ else
38916+ dentry = (void *)inode;
38917+ return dentry;
38918+}
38919+
38920+int wire_size_common(struct inode *inode)
38921+{
38922+ return inode_onwire_size(inode);
38923+}
38924+
38925+void wire_done_common(reiser4_object_on_wire * obj)
38926+{
38927+ /* nothing to do */
38928+}
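
Together the wire_* methods give reiser4 its NFS file-handle support:
wire_size/wire_write serialize enough key material to refind the inode later,
wire_read decodes it, and wire_get maps a vanished object to -ESTALE. A toy
round-trip, with an invented fixed-size handle format standing in for whatever
build_inode_onwire() actually emits:

    #include <stdint.h>
    #include <string.h>

    struct toy_handle {
            unsigned char buf[sizeof(uint64_t)];    /* invented format */
    };

    static void toy_encode(uint64_t oid, struct toy_handle *h)
    {
            memcpy(h->buf, &oid, sizeof(oid));      /* wire_write side */
    }

    static uint64_t toy_decode(const struct toy_handle *h)
    {
            uint64_t oid;

            memcpy(&oid, h->buf, sizeof(oid));      /* wire_read side */
            return oid;     /* wire_get would refetch the object or fail */
    }
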
38929+
38930+/* helper function to print errors */
38931+static void key_warning(const reiser4_key * key /* key to print */ ,
38932+ const struct inode *inode,
38933+ int code /* error code to print */ )
38934+{
38935+ assert("nikita-716", key != NULL);
38936+
38937+ if (code != -ENOMEM) {
38938+ warning("nikita-717", "Error for inode %llu (%i)",
38939+ (unsigned long long)get_key_objectid(key), code);
38940+ reiser4_print_key("for key", key);
38941+ }
38942+}
38943+
38944+/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
38945+#if REISER4_DEBUG
38946+static void
38947+check_inode_seal(const struct inode *inode,
38948+ const coord_t * coord, const reiser4_key * key)
38949+{
38950+ reiser4_key unit_key;
38951+
38952+ unit_key_by_coord(coord, &unit_key);
38953+ assert("nikita-2752",
38954+ WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
38955+ assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
38956+}
38957+
38958+static void check_sd_coord(coord_t * coord, const reiser4_key * key)
38959+{
38960+ reiser4_key ukey;
38961+
38962+ coord_clear_iplug(coord);
38963+ if (zload(coord->node))
38964+ return;
38965+
38966+ if (!coord_is_existing_unit(coord) ||
38967+ !item_plugin_by_coord(coord) ||
38968+ !keyeq(unit_key_by_coord(coord, &ukey), key) ||
38969+ (znode_get_level(coord->node) != LEAF_LEVEL) ||
38970+ !item_is_statdata(coord)) {
38971+ warning("nikita-1901", "Conspicuous seal");
38972+ reiser4_print_key("key", key);
38973+ print_coord("coord", coord, 1);
38974+ impossible("nikita-2877", "no way");
38975+ }
38976+ zrelse(coord->node);
38977+}
38978+
38979+#else
38980+#define check_inode_seal(inode, coord, key) noop
38981+#define check_sd_coord(coord, key) noop
38982+#endif
38983+
38984+/* insert new stat-data into tree. Called with inode state
38985+ locked. Return inode state locked. */
38986+static int insert_new_sd(struct inode *inode /* inode to create sd for */ )
38987+{
38988+ int result;
38989+ reiser4_key key;
38990+ coord_t coord;
38991+ reiser4_item_data data;
38992+ char *area;
38993+ reiser4_inode *ref;
38994+ lock_handle lh;
38995+ oid_t oid;
38996+
38997+ assert("nikita-723", inode != NULL);
38998+ assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD));
38999+
39000+ ref = reiser4_inode_data(inode);
39001+ spin_lock_inode(inode);
39002+
39003+ if (ref->plugin_mask != 0)
39004+ /* inode has non-standard plugins */
39005+ inode_set_extension(inode, PLUGIN_STAT);
39006+ /*
39007+ * prepare specification of new item to be inserted
39008+ */
39009+
39010+ data.iplug = inode_sd_plugin(inode);
39011+ data.length = data.iplug->s.sd.save_len(inode);
39012+ spin_unlock_inode(inode);
39013+
39014+ data.data = NULL;
39015+ data.user = 0;
39016+/* could be optimized for case where there is only one node format in
39017+ * use in the filesystem, probably there are lots of such
39018+ * places we could optimize for only one node layout.... -Hans */
39019+ if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()){
39020+ /* This is a silly check, but we don't know the actual node the
39021+ insertion will go into. */
39022+ return RETERR(-ENAMETOOLONG);
39023+ }
39024+ oid = oid_allocate(inode->i_sb);
39025+/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */
39026+ if (oid == ABSOLUTE_MAX_OID)
39027+ return RETERR(-EOVERFLOW);
39028+
39029+ set_inode_oid(inode, oid);
39030+
39031+ coord_init_zero(&coord);
39032+ init_lh(&lh);
39033+
39034+ result = insert_by_key(reiser4_tree_by_inode(inode),
39035+ build_sd_key(inode, &key), &data, &coord, &lh,
39036+ /* stat data lives on a leaf level */
39037+ LEAF_LEVEL, CBK_UNIQUE);
39038+
39039+ /* we don't want to re-check that somebody didn't insert
39040+ stat-data while we were doing io, because if somebody did,
39041+ insert_by_key() would have returned an error. */
39042+ /* but what _is_ possible is that the plugin for the inode's
39043+ stat-data, the list of non-standard plugins, or their state
39044+ could change during io, so that the stat-data wouldn't fit
39045+ into the sd item anymore. To avoid this race we keep the
39046+ inode_state lock. This lock has to be taken each time the
39047+ inode is accessed in a way that could change sd size:
39048+ changing plugins etc. */
39049+
39050+ if (result == IBK_INSERT_OK) {
39051+ coord_clear_iplug(&coord);
39052+ result = zload(coord.node);
39053+ if (result == 0) {
39054+ /* have we really inserted stat data? */
39055+ assert("nikita-725", item_is_statdata(&coord));
39056+
39057+ /* inode was just created. It is inserted into hash
39058+ table, but no directory entry was yet inserted into
39059+ parent. So, inode is inaccessible through
39060+ ->lookup(). All places that directly grab inode
39061+ from hash-table (like old knfsd), should check
39062+ IMMUTABLE flag that is set by common_create_child.
39063+ */
39064+ assert("nikita-3240", data.iplug != NULL);
39065+ assert("nikita-3241", data.iplug->s.sd.save != NULL);
39066+ area = item_body_by_coord(&coord);
39067+ result = data.iplug->s.sd.save(inode, &area);
39068+ znode_make_dirty(coord.node);
39069+ if (result == 0) {
39070+ /* object has stat-data now */
39071+ reiser4_inode_clr_flag(inode, REISER4_NO_SD);
39072+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
39073+ /* initialise stat-data seal */
39074+ reiser4_seal_init(&ref->sd_seal, &coord, &key);
39075+ ref->sd_coord = coord;
39076+ check_inode_seal(inode, &coord, &key);
39077+ } else if (result != -ENOMEM)
39078+ /*
39079+ * convert any other error code to -EIO to
39080+ * avoid confusing user level with unexpected
39081+ * errors.
39082+ */
39083+ result = RETERR(-EIO);
39084+ zrelse(coord.node);
39085+ }
39086+ }
39087+ done_lh(&lh);
39088+
39089+ if (result != 0)
39090+ key_warning(&key, inode, result);
39091+ else
39092+ oid_count_allocated();
39093+
39094+ return result;
39095+}
39096+
39097+/* find sd of inode in a tree, deal with errors */
39098+int lookup_sd(struct inode *inode /* inode to look sd for */ ,
39099+ znode_lock_mode lock_mode /* lock mode */ ,
39100+ coord_t * coord /* resulting coord */ ,
39101+ lock_handle * lh /* resulting lock handle */ ,
39102+ const reiser4_key * key /* resulting key */ ,
39103+ int silent)
39104+{
39105+ int result;
39106+ __u32 flags;
39107+
39108+ assert("nikita-1692", inode != NULL);
39109+ assert("nikita-1693", coord != NULL);
39110+ assert("nikita-1694", key != NULL);
39111+
39112+ /* look for the object's stat data in a tree.
39113+ This returns, in "node", a pointer to a locked znode and, in
39114+ "pos", the position of an item found in the node. Both are only
39115+ valid if coord_found is returned. */
39116+ flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
39117+ flags |= CBK_UNIQUE;
39118+ /*
39119+ * traverse tree to find stat data. We cannot use vroot here, because
39120+ * it only covers _body_ of the file, and stat data don't belong
39121+ * there.
39122+ */
39123+ result = coord_by_key(reiser4_tree_by_inode(inode),
39124+ key,
39125+ coord,
39126+ lh,
39127+ lock_mode,
39128+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
39129+ if (REISER4_DEBUG && result == 0)
39130+ check_sd_coord(coord, key);
39131+
39132+ if (result != 0 && !silent)
39133+ key_warning(key, inode, result);
39134+ return result;
39135+}
39136+
39137+static int
39138+locate_inode_sd(struct inode *inode,
39139+ reiser4_key * key, coord_t * coord, lock_handle * lh)
39140+{
39141+ reiser4_inode *state;
39142+ seal_t seal;
39143+ int result;
39144+
39145+ assert("nikita-3483", inode != NULL);
39146+
39147+ state = reiser4_inode_data(inode);
39148+ spin_lock_inode(inode);
39149+ *coord = state->sd_coord;
39150+ coord_clear_iplug(coord);
39151+ seal = state->sd_seal;
39152+ spin_unlock_inode(inode);
39153+
39154+ build_sd_key(inode, key);
39155+ if (reiser4_seal_is_set(&seal)) {
39156+ /* first, try to use seal */
39157+ result = reiser4_seal_validate(&seal,
39158+ coord,
39159+ key,
39160+ lh, ZNODE_WRITE_LOCK,
39161+ ZNODE_LOCK_LOPRI);
39162+ if (result == 0)
39163+ check_sd_coord(coord, key);
39164+ } else
39165+ result = -E_REPEAT;
39166+
39167+ if (result != 0) {
39168+ coord_init_zero(coord);
39169+ result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
39170+ }
39171+ return result;
39172+}
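
locate_inode_sd() is an instance of the validate-or-relookup pattern used
with seals throughout reiser4: optimistically revalidate a remembered
location, and only on failure pay for the full tree traversal. The same shape
with the tree reduced to a plain array and the seal to a cached index
(illustrative only):

    static int lookup(const int *arr, int n, int key, int *cached_idx)
    {
            int i;

            /* "seal" validation: does the remembered slot still match? */
            if (*cached_idx >= 0 && *cached_idx < n && arr[*cached_idx] == key)
                    return *cached_idx;

            /* seal failed: full search, then remember ("re-seal") */
            for (i = 0; i < n; i++) {
                    if (arr[i] == key) {
                            *cached_idx = i;
                            return i;
                    }
            }
            return -1;
    }
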
39173+
39174+#if REISER4_DEBUG
39175+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
39176+{
39177+ return (get_key_locality(k1) == get_key_locality(k2) &&
39178+ get_key_type(k1) == get_key_type(k2) &&
39179+ get_key_band(k1) == get_key_band(k2) &&
39180+ get_key_ordering(k1) == get_key_ordering(k2) &&
39181+ get_key_objectid(k1) == get_key_objectid(k2));
39182+}
39183+
39184+#include "../tree_walk.h"
39185+
39186+/* make some checks before and after stat-data resize operation */
39187+static int check_sd_resize(struct inode * inode, coord_t * coord,
39188+ int length, int progress /* 1 means after resize */)
39189+{
39190+ int ret = 0;
39191+ lock_handle left_lock;
39192+ coord_t left_coord;
39193+ reiser4_key left_key;
39194+ reiser4_key key;
39195+
39196+ if (inode_file_plugin(inode) !=
39197+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
39198+ return 0;
39199+ if (!length)
39200+ return 0;
39201+ if (coord->item_pos != 0)
39202+ return 0;
39203+
39204+ init_lh(&left_lock);
39205+ ret = reiser4_get_left_neighbor(&left_lock,
39206+ coord->node,
39207+ ZNODE_WRITE_LOCK,
39208+ GN_CAN_USE_UPPER_LEVELS);
39209+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
39210+ ret == -ENOENT || ret == -EINVAL
39211+ || ret == -E_DEADLOCK) {
39212+ ret = 0;
39213+ goto exit;
39214+ }
39215+ ret = zload(left_lock.node);
39216+ if (ret)
39217+ goto exit;
39218+ coord_init_last_unit(&left_coord, left_lock.node);
39219+ item_key_by_coord(&left_coord, &left_key);
39220+ item_key_by_coord(coord, &key);
39221+
39222+ if (all_but_offset_key_eq(&key, &left_key))
39223+ /* corruption occurred */
39224+ ret = 1;
39225+ zrelse(left_lock.node);
39226+ exit:
39227+ done_lh(&left_lock);
39228+ return ret;
39229+}
39230+#endif
39231+
39232+/* update stat-data at @coord */
39233+static int
39234+update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key,
39235+ lock_handle * lh)
39236+{
39237+ int result;
39238+ reiser4_item_data data;
39239+ char *area;
39240+ reiser4_inode *state;
39241+ znode *loaded;
39242+
39243+ state = reiser4_inode_data(inode);
39244+
39245+ coord_clear_iplug(coord);
39246+ result = zload(coord->node);
39247+ if (result != 0)
39248+ return result;
39249+ loaded = coord->node;
39250+
39251+ spin_lock_inode(inode);
39252+ assert("nikita-728", inode_sd_plugin(inode) != NULL);
39253+ data.iplug = inode_sd_plugin(inode);
39254+
39255+ /* if inode has non-standard plugins, add appropriate stat data
39256+ * extension */
39257+ if (state->extmask & (1 << PLUGIN_STAT)) {
39258+ if (state->plugin_mask == 0)
39259+ inode_clr_extension(inode, PLUGIN_STAT);
39260+ } else if (state->plugin_mask != 0)
39261+ inode_set_extension(inode, PLUGIN_STAT);
39262+
39263+ if (state->extmask & (1 << HEIR_STAT)) {
39264+ if (state->heir_mask == 0)
39265+ inode_clr_extension(inode, HEIR_STAT);
39266+ } else if (state->heir_mask != 0)
39267+ inode_set_extension(inode, HEIR_STAT);
39268+
39269+ /* data.length is how much space to add to (or remove
39270+ from if negative) sd */
39271+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
39272+ /* recalculate stat-data length */
39273+ data.length =
39274+ data.iplug->s.sd.save_len(inode) -
39275+ item_length_by_coord(coord);
39276+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
39277+ } else
39278+ data.length = 0;
39279+ spin_unlock_inode(inode);
39280+
39281+ /* if on-disk stat data is of different length than required
39282+ for this inode, resize it */
39283+
39284+ if (data.length != 0) {
39285+ data.data = NULL;
39286+ data.user = 0;
39287+
39288+ assert("edward-1441",
39289+ !check_sd_resize(inode, coord,
39290+ data.length, 0/* before resize */));
39291+
39292+ /* insertion code requires that the insertion point (coord) be
39293+ * between units. */
39294+ coord->between = AFTER_UNIT;
39295+ result = reiser4_resize_item(coord, &data, key, lh,
39296+ COPI_DONT_SHIFT_LEFT);
39297+ if (result != 0) {
39298+ key_warning(key, inode, result);
39299+ zrelse(loaded);
39300+ return result;
39301+ }
39302+ if (loaded != coord->node) {
39303+ /* reiser4_resize_item moved coord to another node.
39304+ Zload it */
39305+ zrelse(loaded);
39306+ coord_clear_iplug(coord);
39307+ result = zload(coord->node);
39308+ if (result != 0)
39309+ return result;
39310+ loaded = coord->node;
39311+ }
39312+ assert("edward-1442",
39313+ !check_sd_resize(inode, coord,
39314+ data.length, 1/* after resize */));
39315+ }
39316+ area = item_body_by_coord(coord);
39317+ spin_lock_inode(inode);
39318+ result = data.iplug->s.sd.save(inode, &area);
39319+ znode_make_dirty(coord->node);
39320+
39321+ /* re-initialise stat-data seal */
39322+
39323+ /*
39324+ * coord.between was possibly skewed from AT_UNIT when stat-data size
39325+ * was changed and new extensions were pasted into item.
39326+ */
39327+ coord->between = AT_UNIT;
39328+ reiser4_seal_init(&state->sd_seal, coord, key);
39329+ state->sd_coord = *coord;
39330+ spin_unlock_inode(inode);
39331+ check_inode_seal(inode, coord, key);
39332+ zrelse(loaded);
39333+ return result;
39334+}
39335+
39336+/* Update existing stat-data in a tree. Called with inode state locked. Return
39337+ inode state locked. */
39338+static int update_sd(struct inode *inode /* inode to update sd for */ )
39339+{
39340+ int result;
39341+ reiser4_key key;
39342+ coord_t coord;
39343+ lock_handle lh;
39344+
39345+ assert("nikita-726", inode != NULL);
39346+
39347+ /* no stat-data, nothing to update?! */
39348+ assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
39349+
39350+ init_lh(&lh);
39351+
39352+ result = locate_inode_sd(inode, &key, &coord, &lh);
39353+ if (result == 0)
39354+ result = update_sd_at(inode, &coord, &key, &lh);
39355+ done_lh(&lh);
39356+
39357+ return result;
39358+}
39359+
39360+/* helper for reiser4_delete_object_common and reiser4_delete_dir_common.
39361+ Remove object stat data. Space for that must be reserved by the caller
39362+ beforehand. */
39363+static int
39364+common_object_delete_no_reserve(struct inode *inode /* object to remove */ )
39365+{
39366+ int result;
39367+
39368+ assert("nikita-1477", inode != NULL);
39369+
39370+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
39371+ reiser4_key sd_key;
39372+
39373+ DQUOT_FREE_INODE(inode);
39374+ DQUOT_DROP(inode);
39375+
39376+ build_sd_key(inode, &sd_key);
39377+ result =
39378+ reiser4_cut_tree(reiser4_tree_by_inode(inode),
39379+ &sd_key, &sd_key, NULL, 0);
39380+ if (result == 0) {
39381+ reiser4_inode_set_flag(inode, REISER4_NO_SD);
39382+ result = oid_release(inode->i_sb, get_inode_oid(inode));
39383+ if (result == 0) {
39384+ oid_count_released();
39385+
39386+ result = safe_link_del(reiser4_tree_by_inode(inode),
39387+ get_inode_oid(inode),
39388+ SAFE_UNLINK);
39389+ }
39390+ }
39391+ } else
39392+ result = 0;
39393+ return result;
39394+}
39395+
39396+/* helper for safelink_common */
39397+static int process_truncate(struct inode *inode, __u64 size)
39398+{
39399+ int result;
39400+ struct iattr attr;
39401+ file_plugin *fplug;
39402+ reiser4_context *ctx;
39403+ struct dentry dentry;
39404+
39405+ assert("vs-21", is_in_reiser4_context());
39406+ ctx = reiser4_init_context(inode->i_sb);
39407+ assert("vs-22", !IS_ERR(ctx));
39408+
39409+ attr.ia_size = size;
39410+ attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
39411+ fplug = inode_file_plugin(inode);
39412+
39413+ mutex_lock(&inode->i_mutex);
39414+ assert("vs-1704", get_current_context()->trans->atom == NULL);
39415+ dentry.d_inode = inode;
39416+ result = inode->i_op->setattr(&dentry, &attr);
39417+ mutex_unlock(&inode->i_mutex);
39418+
39419+ context_set_commit_async(ctx);
39420+ reiser4_exit_context(ctx);
39421+
39422+ return result;
39423+}
39424+
39425+/*
39426+ Local variables:
39427+ c-indentation-style: "K&R"
39428+ mode-name: "LC"
39429+ c-basic-offset: 8
39430+ tab-width: 8
39431+ fill-column: 80
39432+ scroll-step: 1
39433+ End:
39434+*/
39435diff -urN linux-2.6.23.orig/fs/reiser4/plugin/hash.c linux-2.6.23/fs/reiser4/plugin/hash.c
39436--- linux-2.6.23.orig/fs/reiser4/plugin/hash.c 1970-01-01 03:00:00.000000000 +0300
39437+++ linux-2.6.23/fs/reiser4/plugin/hash.c 2007-12-04 16:49:30.000000000 +0300
39438@@ -0,0 +1,353 @@
39439+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
39440+ * reiser4/README */
39441+
39442+/* Hash functions */
39443+
39444+#include "../debug.h"
39445+#include "plugin_header.h"
39446+#include "plugin.h"
39447+#include "../super.h"
39448+#include "../inode.h"
39449+
39450+#include <linux/types.h>
39451+
39452+/* old rupasov (yura) hash */
39453+static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
39454+ int len /* @name's length */ )
39455+{
39456+ int i;
39457+ int j;
39458+ int pow;
39459+ __u64 a;
39460+ __u64 c;
39461+
39462+ assert("nikita-672", name != NULL);
39463+ assert("nikita-673", len >= 0);
39464+
39465+ for (pow = 1, i = 1; i < len; ++i)
39466+ pow = pow * 10;
39467+
39468+ if (len == 1)
39469+ a = name[0] - 48;
39470+ else
39471+ a = (name[0] - 48) * pow;
39472+
39473+ for (i = 1; i < len; ++i) {
39474+ c = name[i] - 48;
39475+ for (pow = 1, j = i; j < len - 1; ++j)
39476+ pow = pow * 10;
39477+ a = a + c * pow;
39478+ }
39479+ for (; i < 40; ++i) {
39480+ c = '0' - 48;
39481+ for (pow = 1, j = i; j < len - 1; ++j)
39482+ pow = pow * 10;
39483+ a = a + c * pow;
39484+ }
39485+
39486+ for (; i < 256; ++i) {
39487+ c = i;
39488+ for (pow = 1, j = i; j < len - 1; ++j)
39489+ pow = pow * 10;
39490+ a = a + c * pow;
39491+ }
39492+
39493+ a = a << 7;
39494+ return a;
39495+}
39496+
39497+/* r5 hash */
39498+static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
39499+ int len UNUSED_ARG /* @name's length */ )
39500+{
39501+ __u64 a = 0;
39502+
39503+ assert("nikita-674", name != NULL);
39504+ assert("nikita-675", len >= 0);
39505+
39506+ while (*name) {
39507+ a += *name << 4;
39508+ a += *name >> 4;
39509+ a *= 11;
39510+ name++;
39511+ }
39512+ return a;
39513+}
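
Note that hash_r5() ignores its len argument and walks to the NUL terminator.
A standalone copy of the same arithmetic that can be run against sample names
(user-space types; the hash is printed in hex):

    #include <stdio.h>

    static unsigned long long r5(const unsigned char *name)
    {
            unsigned long long a = 0;

            while (*name) {
                    a += *name << 4;
                    a += *name >> 4;
                    a *= 11;
                    name++;
            }
            return a;
    }

    int main(void)
    {
            printf("%llx\n", r5((const unsigned char *)"lost+found"));
            return 0;
    }
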
39514+
39515+/* Keyed 32-bit hash function using TEA in a Davis-Meyer function
39516+ H0 = Key
39517+ Hi = E Mi(Hi-1) + Hi-1
39518+
39519+ (see Applied Cryptography, 2nd edition, p448).
39520+
39521+ Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
39522+
39523+ Jeremy has agreed to the contents of reiserfs/README. -Hans
39524+
39525+ This code was blindly upgraded to __u64 by s/__u32/__u64/g.
39526+*/
39527+static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
39528+ int len /* @name's length */ )
39529+{
39530+ __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
39531+
39532+ __u64 h0 = k[0], h1 = k[1];
39533+ __u64 a, b, c, d;
39534+ __u64 pad;
39535+ int i;
39536+
39537+ assert("nikita-676", name != NULL);
39538+ assert("nikita-677", len >= 0);
39539+
39540+#define DELTA 0x9E3779B9u
39541+#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
39542+#define PARTROUNDS 6 /* 6 gets complete mixing */
39543+
39544+/* a, b, c, d - data; h0, h1 - accumulated hash */
39545+#define TEACORE(rounds) \
39546+ do { \
39547+ __u64 sum = 0; \
39548+ int n = rounds; \
39549+ __u64 b0, b1; \
39550+ \
39551+ b0 = h0; \
39552+ b1 = h1; \
39553+ \
39554+ do \
39555+ { \
39556+ sum += DELTA; \
39557+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
39558+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
39559+ } while(--n); \
39560+ \
39561+ h0 += b0; \
39562+ h1 += b1; \
39563+ } while(0)
39564+
39565+ pad = (__u64) len | ((__u64) len << 8);
39566+ pad |= pad << 16;
39567+
39568+ while (len >= 16) {
39569+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39570+ 16 | (__u64) name[3] << 24;
39571+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39572+ 16 | (__u64) name[7] << 24;
39573+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
39574+ 16 | (__u64) name[11] << 24;
39575+ d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
39576+ << 16 | (__u64) name[15] << 24;
39577+
39578+ TEACORE(PARTROUNDS);
39579+
39580+ len -= 16;
39581+ name += 16;
39582+ }
39583+
39584+ if (len >= 12) {
39585+ //assert(len < 16);
39586+ if (len >= 16)
39587+ *(int *)0 = 0;
39588+
39589+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39590+ 16 | (__u64) name[3] << 24;
39591+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39592+ 16 | (__u64) name[7] << 24;
39593+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
39594+ 16 | (__u64) name[11] << 24;
39595+
39596+ d = pad;
39597+ for (i = 12; i < len; i++) {
39598+ d <<= 8;
39599+ d |= name[i];
39600+ }
39601+ } else if (len >= 8) {
39602+ //assert(len < 12);
39603+ if (len >= 12)
39604+ *(int *)0 = 0;
39605+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39606+ 16 | (__u64) name[3] << 24;
39607+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39608+ 16 | (__u64) name[7] << 24;
39609+
39610+ c = d = pad;
39611+ for (i = 8; i < len; i++) {
39612+ c <<= 8;
39613+ c |= name[i];
39614+ }
39615+ } else if (len >= 4) {
39616+ //assert(len < 8);
39617+ if (len >= 8)
39618+ *(int *)0 = 0;
39619+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39620+ 16 | (__u64) name[3] << 24;
39621+
39622+ b = c = d = pad;
39623+ for (i = 4; i < len; i++) {
39624+ b <<= 8;
39625+ b |= name[i];
39626+ }
39627+ } else {
39628+ //assert(len < 4);
39629+ if (len >= 4)
39630+ *(int *)0 = 0;
39631+ a = b = c = d = pad;
39632+ for (i = 0; i < len; i++) {
39633+ a <<= 8;
39634+ a |= name[i];
39635+ }
39636+ }
39637+
39638+ TEACORE(FULLROUNDS);
39639+
39640+/* return 0;*/
39641+ return h0 ^ h1;
39642+
39643+}
39644+
39645+/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
39646+
39647+ See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
39648+
39649+ Excerpts:
39650+
39651+ FNV hashes are designed to be fast while maintaining a low collision
39652+ rate.
39653+
39654+ [This version also seems to preserve lexicographical order locally.]
39655+
39656+ FNV hash algorithms and source code have been released into the public
39657+ domain.
39658+
39659+*/
39660+static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
39661+ int len UNUSED_ARG /* @name's length */ )
39662+{
39663+ unsigned long long a = 0xcbf29ce484222325ull;
39664+ const unsigned long long fnv_64_prime = 0x100000001b3ull;
39665+
39666+ assert("nikita-678", name != NULL);
39667+ assert("nikita-679", len >= 0);
39668+
39669+ /* FNV-1 hash each octet in the buffer */
39670+ for (; *name; ++name) {
39671+ /* multiply by the 64 bit FNV magic prime mod 2^64 */
39672+ a *= fnv_64_prime;
39673+ /* xor the bottom with the current octet */
39674+ a ^= (unsigned long long)(*name);
39675+ }
39676+ /* return our new hash value */
39677+ return a;
39678+}
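
For reference, the FNV-1 recurrence used above (multiply by the prime, then
xor in the octet; doing the xor first would make it FNV-1a) as a
self-contained program with the same constants:

    #include <stdio.h>

    int main(void)
    {
            const char *name = "example";
            unsigned long long a = 0xcbf29ce484222325ull;   /* offset basis */
            const unsigned long long fnv_64_prime = 0x100000001b3ull;

            for (; *name; ++name) {
                    a *= fnv_64_prime;      /* multiply first... */
                    a ^= (unsigned long long)(unsigned char)*name; /* ...xor */
            }
            printf("%016llx\n", a);
            return 0;
    }
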
39679+
39680+/* degenerate hash function used to simplify testing of non-unique key
39681+ handling */
39682+static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
39683+ int len UNUSED_ARG /* @name's length */ )
39684+{
39685+ return 0xc0c0c0c010101010ull;
39686+}
39687+
39688+static int change_hash(struct inode *inode,
39689+ reiser4_plugin * plugin,
39690+ pset_member memb)
39691+{
39692+ int result;
39693+
39694+ assert("nikita-3503", inode != NULL);
39695+ assert("nikita-3504", plugin != NULL);
39696+
39697+ assert("nikita-3505", is_reiser4_inode(inode));
39698+ assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
39699+
39700+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
39701+ return RETERR(-EINVAL);
39702+
39703+ result = 0;
39704+ if (inode_hash_plugin(inode) == NULL ||
39705+ inode_hash_plugin(inode)->h.id != plugin->h.id) {
39706+ if (is_dir_empty(inode) == 0)
39707+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
39708+ PSET_HASH, plugin);
39709+ else
39710+ result = RETERR(-ENOTEMPTY);
39711+
39712+ }
39713+ return result;
39714+}
39715+
39716+static reiser4_plugin_ops hash_plugin_ops = {
39717+ .init = NULL,
39718+ .load = NULL,
39719+ .save_len = NULL,
39720+ .save = NULL,
39721+ .change = change_hash
39722+};
39723+
39724+/* hash plugins */
39725+hash_plugin hash_plugins[LAST_HASH_ID] = {
39726+ [RUPASOV_HASH_ID] = {
39727+ .h = {
39728+ .type_id = REISER4_HASH_PLUGIN_TYPE,
39729+ .id = RUPASOV_HASH_ID,
39730+ .pops = &hash_plugin_ops,
39731+ .label = "rupasov",
39732+ .desc = "Original Yura's hash",
39733+ .linkage = {NULL, NULL}
39734+ },
39735+ .hash = hash_rupasov
39736+ },
39737+ [R5_HASH_ID] = {
39738+ .h = {
39739+ .type_id = REISER4_HASH_PLUGIN_TYPE,
39740+ .id = R5_HASH_ID,
39741+ .pops = &hash_plugin_ops,
39742+ .label = "r5",
39743+ .desc = "r5 hash",
39744+ .linkage = {NULL, NULL}
39745+ },
39746+ .hash = hash_r5
39747+ },
39748+ [TEA_HASH_ID] = {
39749+ .h = {
39750+ .type_id = REISER4_HASH_PLUGIN_TYPE,
39751+ .id = TEA_HASH_ID,
39752+ .pops = &hash_plugin_ops,
39753+ .label = "tea",
39754+ .desc = "tea hash",
39755+ .linkage = {NULL, NULL}
39756+ },
39757+ .hash = hash_tea
39758+ },
39759+ [FNV1_HASH_ID] = {
39760+ .h = {
39761+ .type_id = REISER4_HASH_PLUGIN_TYPE,
39762+ .id = FNV1_HASH_ID,
39763+ .pops = &hash_plugin_ops,
39764+ .label = "fnv1",
39765+ .desc = "fnv1 hash",
39766+ .linkage = {NULL, NULL}
39767+ },
39768+ .hash = hash_fnv1
39769+ },
39770+ [DEGENERATE_HASH_ID] = {
39771+ .h = {
39772+ .type_id = REISER4_HASH_PLUGIN_TYPE,
39773+ .id = DEGENERATE_HASH_ID,
39774+ .pops = &hash_plugin_ops,
39775+ .label = "degenerate hash",
39776+ .desc = "Degenerate hash: only for testing",
39777+ .linkage = {NULL, NULL}
39778+ },
39779+ .hash = hash_deg
39780+ }
39781+};
39782+
39783+/* Make Linus happy.
39784+ Local variables:
39785+ c-indentation-style: "K&R"
39786+ mode-name: "LC"
39787+ c-basic-offset: 8
39788+ tab-width: 8
39789+ fill-column: 120
39790+ End:
39791+*/
39792diff -urN linux-2.6.23.orig/fs/reiser4/plugin/inode_ops.c linux-2.6.23/fs/reiser4/plugin/inode_ops.c
39793--- linux-2.6.23.orig/fs/reiser4/plugin/inode_ops.c 1970-01-01 03:00:00.000000000 +0300
39794+++ linux-2.6.23/fs/reiser4/plugin/inode_ops.c 2007-12-04 16:49:30.000000000 +0300
39795@@ -0,0 +1,897 @@
39796+/*
39797+ * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
39798+ */
39799+
39800+/*
39801+ * this file contains typical implementations for most of methods of struct
39802+ * inode_operations
39803+ */
39804+
39805+#include "../inode.h"
39806+#include "../safe_link.h"
39807+
39808+#include <linux/quotaops.h>
39809+#include <linux/namei.h>
39810+
39811+static int create_vfs_object(struct inode *parent, struct dentry *dentry,
39812+ reiser4_object_create_data *data);
39813+
39814+/**
39815+ * reiser4_create_common - create of inode operations
39816+ * @parent: inode of parent directory
39817+ * @dentry: dentry of new object to create
39818+ * @mode: the permissions to use
39819+ * @nameidata:
39820+ *
39821+ * This is common implementation of vfs's create method of struct
39822+ * inode_operations.
39823+ * Creates regular file using file plugin from parent directory plugin set.
39824+ */
39825+int reiser4_create_common(struct inode *parent, struct dentry *dentry,
39826+ int mode, struct nameidata *nameidata)
39827+{
39828+ reiser4_object_create_data data;
39829+ file_plugin *fplug;
39830+
39831+ memset(&data, 0, sizeof data);
39832+ data.mode = S_IFREG | mode;
39833+ fplug = child_create_plugin(parent) ? : inode_create_plugin(parent);
39834+ if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) {
39835+ warning("vpf-1900", "'%s' is not a regular file plugin.",
39836+ fplug->h.label);
39837+ return RETERR(-EIO);
39838+ }
39839+ data.id = fplug->h.id;
39840+ return create_vfs_object(parent, dentry, &data);
39841+}
39842+
39843+int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
39844+void check_light_weight(struct inode *inode, struct inode *parent);
39845+
39846+/**
39847+ * reiser4_lookup_common - lookup of inode operations
39848+ * @parent: inode of directory to lookup into
39849+ * @dentry: name to look for
39850+ * @nameidata:
39851+ *
39852+ * This is common implementation of vfs's lookup method of struct
39853+ * inode_operations.
39854+ */
39855+struct dentry *reiser4_lookup_common(struct inode *parent,
39856+ struct dentry *dentry,
39857+ struct nameidata *nameidata)
39858+{
39859+ reiser4_context *ctx;
39860+ int result;
39861+ struct dentry *new;
39862+ struct inode *inode;
39863+ reiser4_dir_entry_desc entry;
39864+
39865+ ctx = reiser4_init_context(parent->i_sb);
39866+ if (IS_ERR(ctx))
39867+ return (struct dentry *)ctx;
39868+
39869+ /* set up operations on dentry. */
39870+ dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
39871+
39872+ result = reiser4_lookup_name(parent, dentry, &entry.key);
39873+ if (result) {
39874+ context_set_commit_async(ctx);
39875+ reiser4_exit_context(ctx);
39876+ if (result == -ENOENT) {
39877+ /* object not found */
39878+ if (!IS_DEADDIR(parent))
39879+ d_add(dentry, NULL);
39880+ return NULL;
39881+ }
39882+ return ERR_PTR(result);
39883+ }
39884+
39885+ inode = reiser4_iget(parent->i_sb, &entry.key, 0);
39886+ if (IS_ERR(inode)) {
39887+ context_set_commit_async(ctx);
39888+ reiser4_exit_context(ctx);
39889+ return ERR_PTR(PTR_ERR(inode));
39890+ }
39891+
39892+ /* success */
39893+ check_light_weight(inode, parent);
39894+ new = d_splice_alias(inode, dentry);
39895+ reiser4_iget_complete(inode);
39896+
39897+ /* prevent balance_dirty_pages() from being called: we don't want to
39898+ * do this under directory i_mutex. */
39899+ context_set_commit_async(ctx);
39900+ reiser4_exit_context(ctx);
39901+ return new;
39902+}
39903+
39904+static reiser4_block_nr common_estimate_link(struct inode *parent,
39905+ struct inode *object);
39906+int reiser4_update_dir(struct inode *);
39907+
39908+/**
39909+ * reiser4_link_common - link of inode operations
39910+ * @existing: dentry of object which is to get new name
39911+ * @parent: directory where new name is to be created
39912+ * @newname: new name
39913+ *
39914+ * This is common implementation of vfs's link method of struct
39915+ * inode_operations.
39916+ */
39917+int reiser4_link_common(struct dentry *existing, struct inode *parent,
39918+ struct dentry *newname)
39919+{
39920+ reiser4_context *ctx;
39921+ int result;
39922+ struct inode *object;
39923+ dir_plugin *parent_dplug;
39924+ reiser4_dir_entry_desc entry;
39925+ reiser4_object_create_data data;
39926+ reiser4_block_nr reserve;
39927+
39928+ ctx = reiser4_init_context(parent->i_sb);
39929+ if (IS_ERR(ctx))
39930+ return PTR_ERR(ctx);
39931+
39932+ assert("nikita-1431", existing != NULL);
39933+ assert("nikita-1432", parent != NULL);
39934+ assert("nikita-1433", newname != NULL);
39935+
39936+ object = existing->d_inode;
39937+ assert("nikita-1434", object != NULL);
39938+
39939+ /* check for race with create_object() */
39940+ if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) {
39941+ context_set_commit_async(ctx);
39942+ reiser4_exit_context(ctx);
39943+ return RETERR(-E_REPEAT);
39944+ }
39945+
39946+ parent_dplug = inode_dir_plugin(parent);
39947+
39948+ memset(&entry, 0, sizeof entry);
39949+ entry.obj = object;
39950+
39951+ data.mode = object->i_mode;
39952+ data.id = inode_file_plugin(object)->h.id;
39953+
39954+ reserve = common_estimate_link(parent, existing->d_inode);
39955+ if ((__s64) reserve < 0) {
39956+ context_set_commit_async(ctx);
39957+ reiser4_exit_context(ctx);
39958+ return reserve;
39959+ }
39960+
39961+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
39962+ context_set_commit_async(ctx);
39963+ reiser4_exit_context(ctx);
39964+ return RETERR(-ENOSPC);
39965+ }
39966+
39967+ /*
39968+ * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
39969+ * means that link(2) can race against unlink(2) or rename(2), and
39970+ * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
39971+ *
39972+ * For such inode we have to undo special processing done in
39973+ * reiser4_unlink() viz. creation of safe-link.
39974+ */
39975+ if (unlikely(object->i_nlink == 0)) {
39976+ result = safe_link_del(reiser4_tree_by_inode(object),
39977+ get_inode_oid(object), SAFE_UNLINK);
39978+ if (result != 0) {
39979+ context_set_commit_async(ctx);
39980+ reiser4_exit_context(ctx);
39981+ return result;
39982+ }
39983+ }
39984+
39985+ /* increment nlink of @existing and update its stat data */
39986+ result = reiser4_add_nlink(object, parent, 1);
39987+ if (result == 0) {
39988+ /* add entry to the parent */
39989+ result =
39990+ parent_dplug->add_entry(parent, newname, &data, &entry);
39991+ if (result != 0) {
39992+ /* failed to add entry to the parent, decrement nlink
39993+ of @existing */
39994+ reiser4_del_nlink(object, parent, 1);
39995+ /*
39996+ * now, if that failed, we have a file with too big
39997+ * nlink---space leak, much better than directory
39998+ * entry pointing to nowhere
39999+ */
40000+ }
40001+ }
40002+ if (result == 0) {
40003+ atomic_inc(&object->i_count);
40004+ /*
40005+ * Upon successful completion, link() shall mark for update
40006+ * the st_ctime field of the file. Also, the st_ctime and
40007+ * st_mtime fields of the directory that contains the new
40008+ * entry shall be marked for update. --SUS
40009+ */
40010+ result = reiser4_update_dir(parent);
40011+ }
40012+ if (result == 0)
40013+ d_instantiate(newname, existing->d_inode);
40014+
40015+ context_set_commit_async(ctx);
40016+ reiser4_exit_context(ctx);
40017+ return result;
40018+}
40019+
40020+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
40021+
40022+/**
40023+ * reiser4_unlink_common - unlink of inode operations
40024+ * @parent: inode of directory to remove name from
40025+ * @victim: name to be removed
40026+ *
40027+ * This is common implementation of vfs's unlink method of struct
40028+ * inode_operations.
40029+ */
40030+int reiser4_unlink_common(struct inode *parent, struct dentry *victim)
40031+{
40032+ reiser4_context *ctx;
40033+ int result;
40034+ struct inode *object;
40035+ file_plugin *fplug;
40036+
40037+ ctx = reiser4_init_context(parent->i_sb);
40038+ if (IS_ERR(ctx))
40039+ return PTR_ERR(ctx);
40040+
40041+ object = victim->d_inode;
40042+ fplug = inode_file_plugin(object);
40043+ assert("nikita-2882", fplug->detach != NULL);
40044+
40045+ result = unlink_check_and_grab(parent, victim);
40046+ if (result != 0) {
40047+ context_set_commit_async(ctx);
40048+ reiser4_exit_context(ctx);
40049+ return result;
40050+ }
40051+
40052+ result = fplug->detach(object, parent);
40053+ if (result == 0) {
40054+ dir_plugin *parent_dplug;
40055+ reiser4_dir_entry_desc entry;
40056+
40057+ parent_dplug = inode_dir_plugin(parent);
40058+ memset(&entry, 0, sizeof entry);
40059+
40060+ /* first, delete directory entry */
40061+ result = parent_dplug->rem_entry(parent, victim, &entry);
40062+ if (result == 0) {
40063+ /*
40064+ * if name was removed successfully, we _have_ to
40065+ * return 0 from this function, because the upper level
40066+ * caller (vfs_{rmdir,unlink}) expects this.
40067+ *
40068+ * now that directory entry is removed, update
40069+ * stat-data
40070+ */
40071+ reiser4_del_nlink(object, parent, 1);
40072+ /*
40073+ * Upon successful completion, unlink() shall mark for
40074+ * update the st_ctime and st_mtime fields of the
40075+ * parent directory. Also, if the file's link count is
40076+ * not 0, the st_ctime field of the file shall be
40077+ * marked for update. --SUS
40078+ */
40079+ reiser4_update_dir(parent);
40080+ /* add safe-link for this file */
40081+ if (object->i_nlink == 0)
40082+ safe_link_add(object, SAFE_UNLINK);
40083+ }
40084+ }
40085+
40086+ if (unlikely(result != 0)) {
40087+ if (result != -ENOMEM)
40088+ warning("nikita-3398", "Cannot unlink %llu (%i)",
40089+ (unsigned long long)get_inode_oid(object),
40090+ result);
40091+ /* if operation failed commit pending inode modifications to
40092+ * the stat-data */
40093+ reiser4_update_sd(object);
40094+ reiser4_update_sd(parent);
40095+ }
40096+
40097+ reiser4_release_reserved(object->i_sb);
40098+
40099+ /* @object's i_ctime was updated by the ->rem_link() method. */
40100+
40101+ /* @victim can be already removed from the disk by this time. Inode is
40102+ then marked so that iput() wouldn't try to remove stat data. But
40103+ inode itself is still there.
40104+ */
40105+
40106+ /*
40107+ * we cannot release directory semaphore here, because name has
40108+ * already been deleted, but dentry (@victim) still exists. Prevent
40109+ * balance_dirty_pages() from being called on exiting this context: we
40110+ * don't want to do this under directory i_mutex.
40111+ */
40112+ context_set_commit_async(ctx);
40113+ reiser4_exit_context(ctx);
40114+ return result;
40115+}
40116+
40117+/**
40118+ * reiser4_symlink_common - symlink of inode operations
40119+ * @parent: inode of parent directory
40120+ * @dentry: dentry of object to be created
40121+ * @linkname: string symlink is to contain
40122+ *
40123+ * This is common implementation of vfs's symlink method of struct
40124+ * inode_operations.
40125+ * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID.
40126+ */
40127+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
40128+ const char *linkname)
40129+{
40130+ reiser4_object_create_data data;
40131+
40132+ memset(&data, 0, sizeof data);
40133+ data.name = linkname;
40134+ data.id = SYMLINK_FILE_PLUGIN_ID;
40135+ data.mode = S_IFLNK | S_IRWXUGO;
40136+ return create_vfs_object(parent, dentry, &data);
40137+}
40138+
40139+/**
40140+ * reiser4_mkdir_common - mkdir of inode operations
40141+ * @parent: inode of parent directory
40142+ * @dentry: dentry of object to be created
40143+ * @mode: the permissions to use
40144+ *
40145+ * This is common implementation of vfs's mkdir method of struct
40146+ * inode_operations.
40147+ * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID.
40148+ */
40149+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
40150+{
40151+ reiser4_object_create_data data;
40152+
40153+ memset(&data, 0, sizeof data);
40154+ data.mode = S_IFDIR | mode;
40155+ data.id = DIRECTORY_FILE_PLUGIN_ID;
40156+ return create_vfs_object(parent, dentry, &data);
40157+}
40158+
40159+/**
40160+ * reiser4_mknod_common - mknod of inode operations
40161+ * @parent: inode of parent directory
40162+ * @dentry: dentry of object to be created
40163+ * @mode: the permissions to use and file type
40164+ * @rdev: minor and major of new device file
40165+ *
40166+ * This is common implementation of vfs's mknod method of struct
40167+ * inode_operations.
40168+ * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID.
40169+ */
40170+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
40171+ int mode, dev_t rdev)
40172+{
40173+ reiser4_object_create_data data;
40174+
40175+ memset(&data, 0, sizeof data);
40176+ data.mode = mode;
40177+ data.rdev = rdev;
40178+ data.id = SPECIAL_FILE_PLUGIN_ID;
40179+ return create_vfs_object(parent, dentry, &data);
40180+}
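The three wrappers above share one pattern: zero a reiser4_object_create_data, fill in a plugin id plus mode, and hand off to create_vfs_object(). A minimal user-space sketch of that dispatch pattern (the names below are illustrative stand-ins, not the kernel API):

#include <stdio.h>
#include <string.h>

enum plugin_id { SYMLINK_PLUGIN, DIRECTORY_PLUGIN, SPECIAL_PLUGIN };

struct create_data {               /* models reiser4_object_create_data */
	enum plugin_id id;
	unsigned mode;
	const char *name;          /* symlink target, when applicable */
};

static int create_object(const struct create_data *d)
{
	/* the real code would look up the file plugin by d->id here */
	printf("create: plugin=%d mode=%o\n", d->id, d->mode);
	return 0;
}

static int mkdir_sketch(unsigned mode)
{
	struct create_data d;

	memset(&d, 0, sizeof d);   /* zero all optional fields first */
	d.id = DIRECTORY_PLUGIN;
	d.mode = 040000 | mode;    /* S_IFDIR | mode */
	return create_object(&d);
}

int main(void)
{
	return mkdir_sketch(0755);
}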
40181+
40182+/*
40183+ * implementation of vfs's rename method of struct inode_operations for a typical
40184+ * directory is in inode_ops_rename.c
40185+ */
40186+
40187+/**
40188+ * reiser4_follow_link_common - follow_link of inode operations
40189+ * @dentry: dentry of symlink
40190+ * @nd: nameidata to store the symlink content in
40191+ *
40192+ * This is a common implementation of vfs's follow_link method of struct
40193+ * inode_operations.
40194+ * Assumes that inode's i_private points to the content of symbolic link.
40195+ */
40196+void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd)
40197+{
40198+ assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
40199+
40200+ if (!dentry->d_inode->i_private
40201+ || !reiser4_inode_get_flag(dentry->d_inode,
40202+ REISER4_GENERIC_PTR_USED))
40203+ return ERR_PTR(RETERR(-EINVAL));
40204+ nd_set_link(nd, dentry->d_inode->i_private);
40205+ return NULL;
40206+}
40207+
40208+/**
40209+ * reiser4_permission_common - permission of inode operations
40210+ * @inode: inode to check permissions for
40211+ * @mask: mode bits to check permissions for
40212+ * @nameidata:
40213+ *
40214+ * Uses generic function to check for rwx permissions.
40215+ */
40216+int reiser4_permission_common(struct inode *inode, int mask,
40217+ struct nameidata *nameidata)
40218+{
40219+ return generic_permission(inode, mask, NULL);
40220+}
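reiser4_permission_common() leaves the actual rwx decision to generic_permission(). The following is a rough standalone model of the owner/group/other check such a helper performs (heavily simplified: no ACLs or capabilities, and uid/gid matching is reduced to two flags):

#include <stdio.h>

/* pick the rwx triple that applies to the caller and require every
 * bit in @mask to be present; -13 stands in for -EACCES */
static int rwx_allowed(unsigned mode, int is_owner, int is_group, unsigned mask)
{
	unsigned bits = mode & 0777;

	if (is_owner)
		bits >>= 6;
	else if (is_group)
		bits >>= 3;
	return (bits & 7 & mask) == mask ? 0 : -13;
}

int main(void)
{
	/* mode 0640: group members may read but not write */
	printf("group read:  %d\n", rwx_allowed(0640, 0, 1, 4));
	printf("group write: %d\n", rwx_allowed(0640, 0, 1, 2));
	return 0;
}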
40221+
40222+static int setattr_reserve(reiser4_tree *);
40223+
40224+/* this is a common implementation of vfs's setattr method of struct
40225+ inode_operations
40226+*/
40227+int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr)
40228+{
40229+ reiser4_context *ctx;
40230+ struct inode *inode;
40231+ int result;
40232+
40233+ inode = dentry->d_inode;
40234+ result = inode_change_ok(inode, attr);
40235+ if (result)
40236+ return result;
40237+
40238+ ctx = reiser4_init_context(inode->i_sb);
40239+ if (IS_ERR(ctx))
40240+ return PTR_ERR(ctx);
40241+
40242+ assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
40243+
40244+ /*
40245+ * grab disk space and call standard inode_setattr().
40246+ */
40247+ result = setattr_reserve(reiser4_tree_by_inode(inode));
40248+ if (!result) {
40249+ if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
40250+ || (attr->ia_valid & ATTR_GID
40251+ && attr->ia_gid != inode->i_gid)) {
40252+ result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
40253+ if (result) {
40254+ context_set_commit_async(ctx);
40255+ reiser4_exit_context(ctx);
40256+ return result;
40257+ }
40258+ }
40259+ result = inode_setattr(inode, attr);
40260+ if (!result)
40261+ reiser4_update_sd(inode);
40262+ }
40263+
40264+ context_set_commit_async(ctx);
40265+ reiser4_exit_context(ctx);
40266+ return result;
40267+}
40268+
40269+/* this is a common implementation of vfs's getattr method of struct
40270+ inode_operations
40271+*/
40272+int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG,
40273+ struct dentry *dentry, struct kstat *stat)
40274+{
40275+ struct inode *obj;
40276+
40277+ assert("nikita-2298", dentry != NULL);
40278+ assert("nikita-2299", stat != NULL);
40279+ assert("nikita-2300", dentry->d_inode != NULL);
40280+
40281+ obj = dentry->d_inode;
40282+
40283+ stat->dev = obj->i_sb->s_dev;
40284+ stat->ino = oid_to_uino(get_inode_oid(obj));
40285+ stat->mode = obj->i_mode;
40286+ /* don't confuse userland with huge nlink. This is not entirely
40287+	 * correct, because nlink_t is not necessarily a signed 16-bit type. */
40288+ stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
40289+ stat->uid = obj->i_uid;
40290+ stat->gid = obj->i_gid;
40291+ stat->rdev = obj->i_rdev;
40292+ stat->atime = obj->i_atime;
40293+ stat->mtime = obj->i_mtime;
40294+ stat->ctime = obj->i_ctime;
40295+ stat->size = obj->i_size;
40296+ stat->blocks =
40297+ (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
40298+ /* "preferred" blocksize for efficient file system I/O */
40299+ stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
40300+
40301+ return 0;
40302+}
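Two details above deserve a worked example. The nlink value is clamped to 0x7fff so that userland with a small signed nlink_t never sees an overflowing count, and st_blocks is the byte total rounded up to 512-byte units by the add-then-shift idiom. A standalone sketch of the rounding (VFS_BLKSIZE is assumed to be 512 and VFS_BLKSIZE_BITS 9, consistent with each other):

#include <stdio.h>

#define VFS_BLKSIZE      512ULL
#define VFS_BLKSIZE_BITS 9

/* round a byte count up to whole 512-byte blocks, exactly as the
 * stat->blocks line above does */
static unsigned long long bytes_to_blocks(unsigned long long bytes)
{
	return (bytes + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
}

int main(void)
{
	printf("%llu\n", bytes_to_blocks(0));    /* 0 */
	printf("%llu\n", bytes_to_blocks(1));    /* 1 */
	printf("%llu\n", bytes_to_blocks(512));  /* 1 */
	printf("%llu\n", bytes_to_blocks(513));  /* 2 */
	return 0;
}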
40303+
40304+/* Estimate the maximum amount of nodes which might be allocated or changed on
40305+ typical new object creation. Typical creation consists of calling create
40306+ method of file plugin, adding directory entry to parent and update parent
40307+ directory's stat data.
40308+*/
40309+static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, /* parent object */
40310+ struct inode *object
40311+ /* object */ )
40312+{
40313+ assert("vpf-309", parent != NULL);
40314+ assert("vpf-307", object != NULL);
40315+
40316+ return
40317+ /* object creation estimation */
40318+ inode_file_plugin(object)->estimate.create(object) +
40319+ /* stat data of parent directory estimation */
40320+ inode_file_plugin(parent)->estimate.update(parent) +
40321+ /* adding entry estimation */
40322+ inode_dir_plugin(parent)->estimate.add_entry(parent) +
40323+ /* to undo in the case of failure */
40324+ inode_dir_plugin(parent)->estimate.rem_entry(parent);
40325+}
40326+
40327+/* Create child in directory.
40328+
40329+ . get object's plugin
40330+ . get fresh inode
40331+ . initialize inode
40332+ . add object's stat-data
40333+ . initialize object's directory
40334+ . add entry to the parent
40335+ . instantiate dentry
40336+
40337+*/
40338+static int do_create_vfs_child(reiser4_object_create_data * data, /* parameters of new
40339+ object */
40340+ struct inode **retobj)
40341+{
40342+ int result;
40343+
40344+	struct dentry *dentry;	/* new name */
40345+	struct inode *parent;	/* parent object */
40346+
40347+ dir_plugin *par_dir; /* directory plugin on the parent */
40348+ dir_plugin *obj_dir; /* directory plugin on the new object */
40349+ file_plugin *obj_plug; /* object plugin on the new object */
40350+ struct inode *object; /* new object */
40351+ reiser4_block_nr reserve;
40352+
40353+ reiser4_dir_entry_desc entry; /* new directory entry */
40354+
40355+ assert("nikita-1420", data != NULL);
40356+ parent = data->parent;
40357+ dentry = data->dentry;
40358+
40359+ assert("nikita-1418", parent != NULL);
40360+ assert("nikita-1419", dentry != NULL);
40361+
40362+ /* check, that name is acceptable for parent */
40363+ par_dir = inode_dir_plugin(parent);
40364+ if (par_dir->is_name_acceptable &&
40365+ !par_dir->is_name_acceptable(parent,
40366+ dentry->d_name.name,
40367+ (int)dentry->d_name.len))
40368+ return RETERR(-ENAMETOOLONG);
40369+
40370+ result = 0;
40371+ obj_plug = file_plugin_by_id((int)data->id);
40372+ if (obj_plug == NULL) {
40373+ warning("nikita-430", "Cannot find plugin %i", data->id);
40374+ return RETERR(-ENOENT);
40375+ }
40376+ object = new_inode(parent->i_sb);
40377+ if (object == NULL)
40378+ return RETERR(-ENOMEM);
40379+ /* we'll update i_nlink below */
40380+ object->i_nlink = 0;
40381+	/* new_inode() initializes i_ino to an "arbitrary" value. Reset it to
40382+	 * 0 to simplify error handling: if some error occurs before i_ino is
40383+	 * initialized with an oid, i_ino is thus already set to a
40384+	 * distinguished value. */
40385+ object->i_ino = 0;
40386+
40387+ /* So that on error iput will be called. */
40388+ *retobj = object;
40389+
40390+ if (DQUOT_ALLOC_INODE(object)) {
40391+ DQUOT_DROP(object);
40392+ object->i_flags |= S_NOQUOTA;
40393+ return RETERR(-EDQUOT);
40394+ }
40395+
40396+ memset(&entry, 0, sizeof entry);
40397+ entry.obj = object;
40398+
40399+ set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE,
40400+ file_plugin_to_plugin(obj_plug));
40401+ result = obj_plug->set_plug_in_inode(object, parent, data);
40402+ if (result) {
40403+ warning("nikita-431", "Cannot install plugin %i on %llx",
40404+ data->id, (unsigned long long)get_inode_oid(object));
40405+ DQUOT_FREE_INODE(object);
40406+ object->i_flags |= S_NOQUOTA;
40407+ return result;
40408+ }
40409+
40410+ /* reget plugin after installation */
40411+ obj_plug = inode_file_plugin(object);
40412+
40413+ if (obj_plug->create_object == NULL) {
40414+ DQUOT_FREE_INODE(object);
40415+ object->i_flags |= S_NOQUOTA;
40416+ return RETERR(-EPERM);
40417+ }
40418+
40419+	/* if any of the hash, tail, sd or permission plugins for the newly
40420+	   created object are not set yet, set them here, inheriting them from
40421+	   the parent directory
40422+	 */
40423+ assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
40424+ result = obj_plug->adjust_to_parent(object,
40425+ parent,
40426+ object->i_sb->s_root->d_inode);
40427+ if (result == 0)
40428+ result = finish_pset(object);
40429+ if (result != 0) {
40430+ warning("nikita-432", "Cannot inherit from %llx to %llx",
40431+ (unsigned long long)get_inode_oid(parent),
40432+ (unsigned long long)get_inode_oid(object));
40433+ DQUOT_FREE_INODE(object);
40434+ object->i_flags |= S_NOQUOTA;
40435+ return result;
40436+ }
40437+
40438+ /* setup inode and file-operations for this inode */
40439+ setup_inode_ops(object, data);
40440+
40441+ /* call file plugin's method to initialize plugin specific part of
40442+ * inode */
40443+ if (obj_plug->init_inode_data)
40444+ obj_plug->init_inode_data(object, data, 1 /*create */ );
40445+
40446+ /* obtain directory plugin (if any) for new object. */
40447+ obj_dir = inode_dir_plugin(object);
40448+ if (obj_dir != NULL && obj_dir->init == NULL) {
40449+ DQUOT_FREE_INODE(object);
40450+ object->i_flags |= S_NOQUOTA;
40451+ return RETERR(-EPERM);
40452+ }
40453+
40454+ reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
40455+
40456+ reserve = estimate_create_vfs_object(parent, object);
40457+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
40458+ DQUOT_FREE_INODE(object);
40459+ object->i_flags |= S_NOQUOTA;
40460+ return RETERR(-ENOSPC);
40461+ }
40462+
40463+	/* mark inode `immutable'. We disable changes to the file being
40464+	   created until a valid directory entry for it is inserted. Otherwise,
40465+	   if the file were expanded and insertion of the directory entry
40466+	   failed, we would have to remove the file, but we only allotted
40467+	   enough space in the transaction to remove an _empty_ file. The 3.x
40468+	   code used to remove stat data in a different transaction, thus
40469+	   possibly leaking disk space on crash. All of this only matters if it
40470+	   is possible to access a file without a name, for example, by inode
40471+	   number */
40472+ reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
40473+
40474+	/* create an empty object; this includes allocation of a new objectid.
40475+	   For directories this implies creation of dot and dotdot */
40476+ assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
40477+
40478+ /* mark inode as `loaded'. From this point onward
40479+ reiser4_delete_inode() will try to remove its stat-data. */
40480+ reiser4_inode_set_flag(object, REISER4_LOADED);
40481+
40482+ result = obj_plug->create_object(object, parent, data);
40483+ if (result != 0) {
40484+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
40485+ if (result != -ENAMETOOLONG && result != -ENOMEM)
40486+ warning("nikita-2219",
40487+ "Failed to create sd for %llu",
40488+ (unsigned long long)get_inode_oid(object));
40489+ DQUOT_FREE_INODE(object);
40490+ object->i_flags |= S_NOQUOTA;
40491+ return result;
40492+ }
40493+
40494+ if (obj_dir != NULL)
40495+ result = obj_dir->init(object, parent, data);
40496+ if (result == 0) {
40497+ assert("nikita-434", !reiser4_inode_get_flag(object,
40498+ REISER4_NO_SD));
40499+ /* insert inode into VFS hash table */
40500+ insert_inode_hash(object);
40501+ /* create entry */
40502+ result = par_dir->add_entry(parent, dentry, data, &entry);
40503+ if (result == 0) {
40504+ result = reiser4_add_nlink(object, parent, 0);
40505+ /* If O_CREAT is set and the file did not previously
40506+ exist, upon successful completion, open() shall
40507+ mark for update the st_atime, st_ctime, and
40508+ st_mtime fields of the file and the st_ctime and
40509+ st_mtime fields of the parent directory. --SUS
40510+ */
40511+ /* @object times are already updated by
40512+ reiser4_add_nlink() */
40513+ if (result == 0)
40514+ reiser4_update_dir(parent);
40515+ if (result != 0)
40516+ /* cleanup failure to add nlink */
40517+ par_dir->rem_entry(parent, dentry, &entry);
40518+ }
40519+ if (result != 0)
40520+ /* cleanup failure to add entry */
40521+ obj_plug->detach(object, parent);
40522+ } else if (result != -ENOMEM)
40523+ warning("nikita-2219", "Failed to initialize dir for %llu: %i",
40524+ (unsigned long long)get_inode_oid(object), result);
40525+
40526+ /*
40527+ * update stat-data, committing all pending modifications to the inode
40528+ * fields.
40529+ */
40530+ reiser4_update_sd(object);
40531+ if (result != 0) {
40532+ DQUOT_FREE_INODE(object);
40533+ object->i_flags |= S_NOQUOTA;
40534+ /* if everything was ok (result == 0), parent stat-data is
40535+		 * already updated above (reiser4_update_dir()) */
40536+ reiser4_update_sd(parent);
40537+ /* failure to create entry, remove object */
40538+ obj_plug->delete_object(object);
40539+ }
40540+
40541+ /* file has name now, clear immutable flag */
40542+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
40543+
40544+ /* on error, iput() will call ->delete_inode(). We should keep track
40545+ of the existence of stat-data for this inode and avoid attempt to
40546+ remove it in reiser4_delete_inode(). This is accomplished through
40547+ REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
40548+ */
40549+ return result;
40550+}
40551+
40552+/* this is a helper for the common implementations of reiser4_mkdir, reiser4_create,
40553+ reiser4_mknod and reiser4_symlink
40554+*/
40555+static int
40556+create_vfs_object(struct inode *parent,
40557+ struct dentry *dentry, reiser4_object_create_data * data)
40558+{
40559+ reiser4_context *ctx;
40560+ int result;
40561+ struct inode *child;
40562+
40563+ ctx = reiser4_init_context(parent->i_sb);
40564+ if (IS_ERR(ctx))
40565+ return PTR_ERR(ctx);
40566+ context_set_commit_async(ctx);
40567+
40568+ data->parent = parent;
40569+ data->dentry = dentry;
40570+ child = NULL;
40571+ result = do_create_vfs_child(data, &child);
40572+ if (unlikely(result != 0)) {
40573+ if (child != NULL) {
40574+ reiser4_make_bad_inode(child);
40575+ iput(child);
40576+ }
40577+ } else
40578+ d_instantiate(dentry, child);
40579+
40580+ reiser4_exit_context(ctx);
40581+ return result;
40582+}
40583+
40584+/* helper for link_common. Estimate disk space necessary to add a link
40585+ from @parent to @object
40586+*/
40587+static reiser4_block_nr common_estimate_link(struct inode *parent, /* parent directory */
40588+ struct inode *object
40589+					     /* object to which the new link is being created */
40590+ )
40591+{
40592+ reiser4_block_nr res = 0;
40593+ file_plugin *fplug;
40594+ dir_plugin *dplug;
40595+
40596+ assert("vpf-317", object != NULL);
40597+ assert("vpf-318", parent != NULL);
40598+
40599+ fplug = inode_file_plugin(object);
40600+ dplug = inode_dir_plugin(parent);
40601+ /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */
40602+ /* reiser4_add_nlink(object) */
40603+ res += fplug->estimate.update(object);
40604+ /* add_entry(parent) */
40605+ res += dplug->estimate.add_entry(parent);
40606+ /* reiser4_del_nlink(object) */
40607+ res += fplug->estimate.update(object);
40608+ /* update_dir(parent) */
40609+ res += inode_file_plugin(parent)->estimate.update(parent);
40610+ /* safe-link */
40611+ res += estimate_one_item_removal(reiser4_tree_by_inode(object));
40612+
40613+ return res;
40614+}
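On the FIXME above: spelling out fplug->estimate.update(object) twice, rather than multiplying by two, keeps each term paired with the call it pays for: the add_nlink on the forward path, and (apparently) the del_nlink needed if the entry insertion has to be undone. A toy model of the reservation arithmetic with placeholder per-call costs (the real values come from the plugins):

#include <stdio.h>

int main(void)
{
	/* placeholder costs in blocks; the plugins supply the real ones */
	unsigned upd_obj = 1, add_entry = 2, upd_parent = 1, safe_link = 1;

	unsigned reserve = upd_obj      /* reiser4_add_nlink(object) */
	                 + add_entry    /* add_entry(parent) */
	                 + upd_obj      /* reiser4_del_nlink(object), undo */
	                 + upd_parent   /* update_dir(parent) */
	                 + safe_link;   /* safe-link item removal */

	printf("blocks to grab: %u\n", reserve);   /* 6 */
	return 0;
}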
40615+
40616+/* Estimate disk space necessary to remove a link between @parent and
40617+ @object.
40618+*/
40619+static reiser4_block_nr estimate_unlink(struct inode *parent, /* parent directory */
40620+ struct inode *object
40621+					/* object whose link is being removed */
40622+ )
40623+{
40624+ reiser4_block_nr res = 0;
40625+ file_plugin *fplug;
40626+ dir_plugin *dplug;
40627+
40628+ assert("vpf-317", object != NULL);
40629+ assert("vpf-318", parent != NULL);
40630+
40631+ fplug = inode_file_plugin(object);
40632+ dplug = inode_dir_plugin(parent);
40633+
40634+ /* rem_entry(parent) */
40635+ res += dplug->estimate.rem_entry(parent);
40636+ /* reiser4_del_nlink(object) */
40637+ res += fplug->estimate.update(object);
40638+ /* update_dir(parent) */
40639+ res += inode_file_plugin(parent)->estimate.update(parent);
40640+ /* fplug->unlink */
40641+ res += fplug->estimate.unlink(object, parent);
40642+ /* safe-link */
40643+ res += estimate_one_insert_item(reiser4_tree_by_inode(object));
40644+
40645+ return res;
40646+}
40647+
40648+/* helper for reiser4_unlink_common. Estimate and grab space for unlink. */
40649+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
40650+{
40651+ file_plugin *fplug;
40652+ struct inode *child;
40653+ int result;
40654+
40655+ result = 0;
40656+ child = victim->d_inode;
40657+ fplug = inode_file_plugin(child);
40658+
40659+ /* check for race with create_object() */
40660+ if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE))
40661+ return RETERR(-E_REPEAT);
40662+ /* object being deleted should have stat data */
40663+ assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
40664+
40665+ /* ask object plugin */
40666+ if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
40667+ return RETERR(-ENOTEMPTY);
40668+
40669+ result = (int)estimate_unlink(parent, child);
40670+ if (result < 0)
40671+ return result;
40672+
40673+ return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
40674+}
40675+
40676+/* helper for reiser4_setattr_common */
40677+static int setattr_reserve(reiser4_tree * tree)
40678+{
40679+ assert("vs-1096", is_grab_enabled(get_current_context()));
40680+ return reiser4_grab_space(estimate_one_insert_into_item(tree),
40681+ BA_CAN_COMMIT);
40682+}
40683+
40684+/* helper function. Standards require that for many file-system operations
40685+   on success the ctime and mtime of the parent directory are to be updated. */
40686+int reiser4_update_dir(struct inode *dir)
40687+{
40688+ assert("nikita-2525", dir != NULL);
40689+
40690+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
40691+ return reiser4_update_sd(dir);
40692+}
40693diff -urN linux-2.6.23.orig/fs/reiser4/plugin/inode_ops_rename.c linux-2.6.23/fs/reiser4/plugin/inode_ops_rename.c
40694--- linux-2.6.23.orig/fs/reiser4/plugin/inode_ops_rename.c 1970-01-01 03:00:00.000000000 +0300
40695+++ linux-2.6.23/fs/reiser4/plugin/inode_ops_rename.c 2007-12-04 16:49:30.000000000 +0300
40696@@ -0,0 +1,912 @@
40697+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
40698+ * reiser4/README */
40699+
40700+#include "../inode.h"
40701+#include "../safe_link.h"
40702+
40703+static const char *possible_leak = "Possible disk space leak.";
40704+
40705+/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
40706+
40707+ Helper function called from hashed_rename() */
40708+static int replace_name(struct inode *to_inode, /* inode where @from_coord is
40709+ * to be re-targeted at */
40710+ struct inode *from_dir, /* directory where @from_coord
40711+ * lives */
40712+ struct inode *from_inode, /* inode @from_coord
40713+						  * originally points to */
40714+ coord_t * from_coord, /* where directory entry is in
40715+ * the tree */
40716+ lock_handle * from_lh /* lock handle on @from_coord */ )
40717+{
40718+ item_plugin *from_item;
40719+ int result;
40720+ znode *node;
40721+
40722+ coord_clear_iplug(from_coord);
40723+ node = from_coord->node;
40724+ result = zload(node);
40725+ if (result != 0)
40726+ return result;
40727+ from_item = item_plugin_by_coord(from_coord);
40728+ if (plugin_of_group(item_plugin_by_coord(from_coord),
40729+ DIR_ENTRY_ITEM_TYPE))
40730+ {
40731+ reiser4_key to_key;
40732+
40733+ build_sd_key(to_inode, &to_key);
40734+
40735+ /* everything is found and prepared to change directory entry
40736+ at @from_coord to point to @to_inode.
40737+
40738+ @to_inode is just about to get new name, so bump its link
40739+ counter.
40740+
40741+ */
40742+ result = reiser4_add_nlink(to_inode, from_dir, 0);
40743+ if (result != 0) {
40744+ /* Don't issue warning: this may be plain -EMLINK */
40745+ zrelse(node);
40746+ return result;
40747+ }
40748+
40749+ result =
40750+ from_item->s.dir.update_key(from_coord, &to_key, from_lh);
40751+ if (result != 0) {
40752+ reiser4_del_nlink(to_inode, from_dir, 0);
40753+ zrelse(node);
40754+ return result;
40755+ }
40756+
40757+ /* @from_inode just lost its name, he-he.
40758+
40759+ If @from_inode was directory, it contained dotdot pointing
40760+ to @from_dir. @from_dir i_nlink will be decreased when
40761+ iput() will be called on @from_inode.
40762+
40763+ If file-system is not ADG (hard-links are
40764+ supported on directories), iput(from_inode) will not remove
40765+		   @from_inode, and thus the above is incorrect, but hard-links on
40766+ directories are problematic in many other respects.
40767+ */
40768+ result = reiser4_del_nlink(from_inode, from_dir, 0);
40769+ if (result != 0) {
40770+ warning("nikita-2330",
40771+ "Cannot remove link from source: %i. %s",
40772+ result, possible_leak);
40773+ }
40774+ /* Has to return success, because entry is already
40775+ * modified. */
40776+ result = 0;
40777+
40778+		/* NOTE-NIKITA consider calling a plugin method instead of
40779+ accessing inode fields directly. */
40780+ from_dir->i_mtime = CURRENT_TIME;
40781+ } else {
40782+ warning("nikita-2326", "Unexpected item type");
40783+ result = RETERR(-EIO);
40784+ }
40785+ zrelse(node);
40786+ return result;
40787+}
40788+
40789+/* add new entry pointing to @inode into @dir at @coord, locked by @lh
40790+
40791+ Helper function used by hashed_rename(). */
40792+static int add_name(struct inode *inode, /* inode where @coord is to be
40793+ * re-targeted at */
40794+ struct inode *dir, /* directory where @coord lives */
40795+ struct dentry *name, /* new name */
40796+ coord_t * coord, /* where directory entry is in the tree */
40797+ lock_handle * lh, /* lock handle on @coord */
40798+ int is_dir /* true, if @inode is directory */ )
40799+{
40800+ int result;
40801+ reiser4_dir_entry_desc entry;
40802+
40803+ assert("nikita-2333", lh->node == coord->node);
40804+ assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
40805+
40806+ memset(&entry, 0, sizeof entry);
40807+ entry.obj = inode;
40808+ /* build key of directory entry description */
40809+ inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
40810+
40811+	/* ext2 does this in a different order: it first inserts the new
40812+	   entry, then increases the directory nlink. We don't want to do
40813+	   this, because reiser4_add_nlink() calls the ->add_link() plugin
40814+	   method, which can fail for whatever reason, leaving us with
40815+	   cleanup problems.
40816+ */
40817+ /* @inode is getting new name */
40818+ reiser4_add_nlink(inode, dir, 0);
40819+ /* create @new_name in @new_dir pointing to
40820+ @old_inode */
40821+ result = WITH_COORD(coord,
40822+ inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
40823+ coord,
40824+ lh,
40825+ name,
40826+ &entry));
40827+ if (result != 0) {
40828+ int result2;
40829+ result2 = reiser4_del_nlink(inode, dir, 0);
40830+ if (result2 != 0) {
40831+ warning("nikita-2327",
40832+ "Cannot drop link on %lli %i. %s",
40833+ (unsigned long long)get_inode_oid(inode),
40834+ result2, possible_leak);
40835+ }
40836+ } else
40837+ INODE_INC_FIELD(dir, i_size);
40838+ return result;
40839+}
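The ordering argument in the comment above (take the link first, insert the entry, drop the link again if the insert fails) is easy to model in standalone C; insert_entry() below is a stand-in for the s.dir.add_entry() call:

#include <stdio.h>

static int insert_entry(int fail)
{
	return fail ? -5 : 0;       /* -5 stands in for -EIO */
}

/* no directory entry can ever point at an object whose link count was
 * not raised first; a failed insert is rolled back */
static int add_name_sketch(int *nlink, int fail)
{
	int result;

	(*nlink)++;                 /* reiser4_add_nlink() */
	result = insert_entry(fail);
	if (result != 0)
		(*nlink)--;         /* reiser4_del_nlink() undoes it */
	return result;
}

int main(void)
{
	int nlink = 1;

	add_name_sketch(&nlink, 0);
	printf("after success: %d\n", nlink);   /* 2 */
	add_name_sketch(&nlink, 1);
	printf("after failure: %d\n", nlink);   /* still 2 */
	return 0;
}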
40840+
40841+static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */
40842+ struct dentry *old_name, /* old name */
40843+ struct inode *new_dir, /* directory where @new is located */
40844+ struct dentry *new_name /* new name */ )
40845+{
40846+ reiser4_block_nr res1, res2;
40847+ dir_plugin *p_parent_old, *p_parent_new;
40848+ file_plugin *p_child_old, *p_child_new;
40849+
40850+ assert("vpf-311", old_dir != NULL);
40851+ assert("vpf-312", new_dir != NULL);
40852+ assert("vpf-313", old_name != NULL);
40853+ assert("vpf-314", new_name != NULL);
40854+
40855+ p_parent_old = inode_dir_plugin(old_dir);
40856+ p_parent_new = inode_dir_plugin(new_dir);
40857+ p_child_old = inode_file_plugin(old_name->d_inode);
40858+ if (new_name->d_inode)
40859+ p_child_new = inode_file_plugin(new_name->d_inode);
40860+ else
40861+ p_child_new = NULL;
40862+
40863+ /* find_entry - can insert one leaf. */
40864+ res1 = res2 = 1;
40865+
40866+ /* replace_name */
40867+ {
40868+ /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */
40869+ res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
40870+ /* update key */
40871+ res1 += 1;
40872+ /* reiser4_del_nlink(p_child_new) */
40873+ if (p_child_new)
40874+ res1 += p_child_new->estimate.update(new_name->d_inode);
40875+ }
40876+
40877+ /* else add_name */
40878+ {
40879+ /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */
40880+ res2 +=
40881+ 2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
40882+ /* reiser4_add_nlink(p_parent_old) */
40883+ res2 += p_child_old->estimate.update(old_name->d_inode);
40884+ /* add_entry(p_parent_new) */
40885+ res2 += p_parent_new->estimate.add_entry(new_dir);
40886+ /* reiser4_del_nlink(p_parent_old) */
40887+ res2 += p_child_old->estimate.update(old_name->d_inode);
40888+ }
40889+
40890+ res1 = res1 < res2 ? res2 : res1;
40891+
40892+ /* reiser4_write_sd(p_parent_new) */
40893+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
40894+
40895+ /* reiser4_write_sd(p_child_new) */
40896+ if (p_child_new)
40897+ res1 += p_child_new->estimate.update(new_name->d_inode);
40898+
40899+ /* hashed_rem_entry(p_parent_old) */
40900+ res1 += p_parent_old->estimate.rem_entry(old_dir);
40901+
40902+ /* reiser4_del_nlink(p_child_old) */
40903+ res1 += p_child_old->estimate.update(old_name->d_inode);
40904+
40905+ /* replace_name */
40906+ {
40907+ /* reiser4_add_nlink(p_parent_dir_new) */
40908+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
40909+ /* update_key */
40910+ res1 += 1;
40911+ /* reiser4_del_nlink(p_parent_new) */
40912+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
40913+ /* reiser4_del_nlink(p_parent_old) */
40914+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
40915+ }
40916+
40917+ /* reiser4_write_sd(p_parent_old) */
40918+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
40919+
40920+ /* reiser4_write_sd(p_child_old) */
40921+ res1 += p_child_old->estimate.update(old_name->d_inode);
40922+
40923+ return res1;
40924+}
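Because a rename runs either the replace_name() path or the add_name() path, never both, the estimate keeps only the larger of res1 and res2 for the exclusive part and then adds the costs common to both paths on top. A toy model with placeholder block counts:

#include <stdio.h>

static unsigned max_u(unsigned a, unsigned b)
{
	return a > b ? a : b;
}

int main(void)
{
	/* invented numbers standing in for the plugin estimates */
	unsigned replace_path = 5;   /* res1: target name exists */
	unsigned add_path = 7;       /* res2: target name is new */
	unsigned common = 6;         /* stat-data updates, rem_entry, ... */

	printf("reserve: %u\n", max_u(replace_path, add_path) + common);
	return 0;
}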
40925+
40926+static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory where @old is located */
40927+ struct dentry *old_name, /* old name */
40928+ struct inode *new_dir, /* directory where @new is located */
40929+ struct dentry *new_name
40930+ /* new name */ )
40931+{
40932+ reiser4_block_nr reserve;
40933+
40934+ reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
40935+
40936+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
40937+ return RETERR(-ENOSPC);
40938+
40939+ return 0;
40940+}
40941+
40942+/* check whether @old_inode and @new_inode can be moved within file system
40943+ * tree. This singles out attempts to rename pseudo-files, for example. */
40944+static int can_rename(struct inode *old_dir, struct inode *old_inode,
40945+ struct inode *new_dir, struct inode *new_inode)
40946+{
40947+ file_plugin *fplug;
40948+ dir_plugin *dplug;
40949+
40950+ assert("nikita-3370", old_inode != NULL);
40951+
40952+ dplug = inode_dir_plugin(new_dir);
40953+ fplug = inode_file_plugin(old_inode);
40954+
40955+ if (dplug == NULL)
40956+ return RETERR(-ENOTDIR);
40957+ else if (new_dir->i_op->create == NULL)
40958+ return RETERR(-EPERM);
40959+ else if (!fplug->can_add_link(old_inode))
40960+ return RETERR(-EMLINK);
40961+ else if (new_inode != NULL) {
40962+ fplug = inode_file_plugin(new_inode);
40963+ if (fplug->can_rem_link != NULL &&
40964+ !fplug->can_rem_link(new_inode))
40965+ return RETERR(-EBUSY);
40966+ }
40967+ return 0;
40968+}
40969+
40970+int reiser4_find_entry(struct inode *, struct dentry *, lock_handle *,
40971+ znode_lock_mode, reiser4_dir_entry_desc *);
40972+int reiser4_update_dir(struct inode *);
40973+
40974+/* this is a common implementation of vfs's rename method of struct
40975+ inode_operations
40976+ See comments in the body.
40977+
40978+   It is arguable that this function can be made generic, so that it
40979+   would be applicable to any kind of directory plugin that deals with
40980+   directories composed of directory entries. The only obstacle
40981+   here is that we don't have any data type to represent a directory
40982+   entry. This should be reconsidered when more than one
40983+   directory plugin is implemented.
40984+*/
40985+int reiser4_rename_common(struct inode *old_dir /* directory where @old
40986+ * is located */ ,
40987+ struct dentry *old_name /* old name */ ,
40988+ struct inode *new_dir /* directory where @new
40989+ * is located */ ,
40990+ struct dentry *new_name /* new name */ )
40991+{
40992+ /* From `The Open Group Base Specifications Issue 6'
40993+
40994+ If either the old or new argument names a symbolic link, rename()
40995+ shall operate on the symbolic link itself, and shall not resolve
40996+ the last component of the argument. If the old argument and the new
40997+ argument resolve to the same existing file, rename() shall return
40998+ successfully and perform no other action.
40999+
41000+ [this is done by VFS: vfs_rename()]
41001+
41002+ If the old argument points to the pathname of a file that is not a
41003+ directory, the new argument shall not point to the pathname of a
41004+ directory.
41005+
41006+ [checked by VFS: vfs_rename->may_delete()]
41007+
41008+ If the link named by the new argument exists, it shall
41009+ be removed and old renamed to new. In this case, a link named new
41010+ shall remain visible to other processes throughout the renaming
41011+ operation and refer either to the file referred to by new or old
41012+ before the operation began.
41013+
41014+ [we should assure this]
41015+
41016+ Write access permission is required for
41017+ both the directory containing old and the directory containing new.
41018+
41019+ [checked by VFS: vfs_rename->may_delete(), may_create()]
41020+
41021+ If the old argument points to the pathname of a directory, the new
41022+ argument shall not point to the pathname of a file that is not a
41023+ directory.
41024+
41025+ [checked by VFS: vfs_rename->may_delete()]
41026+
41027+ If the directory named by the new argument exists, it
41028+ shall be removed and old renamed to new. In this case, a link named
41029+ new shall exist throughout the renaming operation and shall refer
41030+ either to the directory referred to by new or old before the
41031+ operation began.
41032+
41033+ [we should assure this]
41034+
41035+ If new names an existing directory, it shall be
41036+ required to be an empty directory.
41037+
41038+ [we should check this]
41039+
41040+ If the old argument points to a pathname of a symbolic link, the
41041+ symbolic link shall be renamed. If the new argument points to a
41042+ pathname of a symbolic link, the symbolic link shall be removed.
41043+
41044+ The new pathname shall not contain a path prefix that names
41045+ old. Write access permission is required for the directory
41046+ containing old and the directory containing new. If the old
41047+ argument points to the pathname of a directory, write access
41048+ permission may be required for the directory named by old, and, if
41049+ it exists, the directory named by new.
41050+
41051+ [checked by VFS: vfs_rename(), vfs_rename_dir()]
41052+
41053+ If the link named by the new argument exists and the file's link
41054+ count becomes 0 when it is removed and no process has the file
41055+ open, the space occupied by the file shall be freed and the file
41056+ shall no longer be accessible. If one or more processes have the
41057+ file open when the last link is removed, the link shall be removed
41058+ before rename() returns, but the removal of the file contents shall
41059+ be postponed until all references to the file are closed.
41060+
41061+ [iput() handles this, but we can do this manually, a la
41062+ reiser4_unlink()]
41063+
41064+ Upon successful completion, rename() shall mark for update the
41065+ st_ctime and st_mtime fields of the parent directory of each file.
41066+
41067+ [N/A]
41068+
41069+ */
41070+ reiser4_context *ctx;
41071+ int result;
41072+ int is_dir; /* is @old_name directory */
41073+
41074+ struct inode *old_inode;
41075+ struct inode *new_inode;
41076+ coord_t *new_coord;
41077+
41078+ struct reiser4_dentry_fsdata *new_fsdata;
41079+ dir_plugin *dplug;
41080+ file_plugin *fplug;
41081+
41082+ reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
41083+ lock_handle *new_lh, *dotdot_lh;
41084+ struct dentry *dotdot_name;
41085+ struct reiser4_dentry_fsdata *dataonstack;
41086+
41087+ ctx = reiser4_init_context(old_dir->i_sb);
41088+ if (IS_ERR(ctx))
41089+ return PTR_ERR(ctx);
41090+
41091+ old_entry = kzalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
41092+ sizeof(*dotdot_name) + sizeof(*dataonstack),
41093+ reiser4_ctx_gfp_mask_get());
41094+ if (!old_entry) {
41095+ context_set_commit_async(ctx);
41096+ reiser4_exit_context(ctx);
41097+ return RETERR(-ENOMEM);
41098+ }
41099+
41100+ new_entry = old_entry + 1;
41101+ dotdot_entry = old_entry + 2;
41102+ new_lh = (lock_handle *)(old_entry + 3);
41103+ dotdot_lh = new_lh + 1;
41104+ dotdot_name = (struct dentry *)(new_lh + 2);
41105+ dataonstack = (struct reiser4_dentry_fsdata *)(dotdot_name + 1);
41106+
41107+ assert("nikita-2318", old_dir != NULL);
41108+ assert("nikita-2319", new_dir != NULL);
41109+ assert("nikita-2320", old_name != NULL);
41110+ assert("nikita-2321", new_name != NULL);
41111+
41112+ old_inode = old_name->d_inode;
41113+ new_inode = new_name->d_inode;
41114+
41115+ dplug = inode_dir_plugin(old_dir);
41116+ fplug = NULL;
41117+
41118+ new_fsdata = reiser4_get_dentry_fsdata(new_name);
41119+ if (IS_ERR(new_fsdata)) {
41120+ kfree(old_entry);
41121+ context_set_commit_async(ctx);
41122+ reiser4_exit_context(ctx);
41123+ return PTR_ERR(new_fsdata);
41124+ }
41125+
41126+ new_coord = &new_fsdata->dec.entry_coord;
41127+ coord_clear_iplug(new_coord);
41128+
41129+ is_dir = S_ISDIR(old_inode->i_mode);
41130+
41131+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
41132+
41133+	/* if the target is an existing directory and it's not empty, return error.
41134+
41135+	   This check is done up front, because is_dir_empty() requires a
41136+	   tree traversal and has to be done before locks are taken.
41137+ */
41138+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
41139+ kfree(old_entry);
41140+ context_set_commit_async(ctx);
41141+ reiser4_exit_context(ctx);
41142+ return RETERR(-ENOTEMPTY);
41143+ }
41144+
41145+ result = can_rename(old_dir, old_inode, new_dir, new_inode);
41146+ if (result != 0) {
41147+ kfree(old_entry);
41148+ context_set_commit_async(ctx);
41149+ reiser4_exit_context(ctx);
41150+ return result;
41151+ }
41152+
41153+ result = hashed_rename_estimate_and_grab(old_dir, old_name,
41154+ new_dir, new_name);
41155+ if (result != 0) {
41156+ kfree(old_entry);
41157+ context_set_commit_async(ctx);
41158+ reiser4_exit_context(ctx);
41159+ return result;
41160+ }
41161+
41162+ init_lh(new_lh);
41163+
41164+ /* find entry for @new_name */
41165+ result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK,
41166+ new_entry);
41167+
41168+ if (IS_CBKERR(result)) {
41169+ done_lh(new_lh);
41170+ kfree(old_entry);
41171+ context_set_commit_async(ctx);
41172+ reiser4_exit_context(ctx);
41173+ return result;
41174+ }
41175+
41176+ reiser4_seal_done(&new_fsdata->dec.entry_seal);
41177+
41178+ /* add or replace name for @old_inode as @new_name */
41179+ if (new_inode != NULL) {
41180+ /* target (@new_name) exists. */
41181+ /* Not clear what to do with objects that are
41182+ both directories and files at the same time. */
41183+ if (result == CBK_COORD_FOUND) {
41184+ result = replace_name(old_inode,
41185+ new_dir,
41186+ new_inode, new_coord, new_lh);
41187+ if (result == 0)
41188+ fplug = inode_file_plugin(new_inode);
41189+ } else if (result == CBK_COORD_NOTFOUND) {
41190+ /* VFS told us that @new_name is bound to existing
41191+ inode, but we failed to find directory entry. */
41192+ warning("nikita-2324", "Target not found");
41193+ result = RETERR(-ENOENT);
41194+ }
41195+ } else {
41196+		/* target (@new_name) doesn't exist. */
41197+ if (result == CBK_COORD_NOTFOUND)
41198+ result = add_name(old_inode,
41199+ new_dir,
41200+ new_name, new_coord, new_lh, is_dir);
41201+ else if (result == CBK_COORD_FOUND) {
41202+ /* VFS told us that @new_name is "negative" dentry,
41203+ but we found directory entry. */
41204+ warning("nikita-2331", "Target found unexpectedly");
41205+ result = RETERR(-EIO);
41206+ }
41207+ }
41208+
41209+ assert("nikita-3462", ergo(result == 0,
41210+ old_inode->i_nlink >= 2 + !!is_dir));
41211+
41212+ /* We are done with all modifications to the @new_dir, release lock on
41213+ node. */
41214+ done_lh(new_lh);
41215+
41216+ if (fplug != NULL) {
41217+ /* detach @new_inode from name-space */
41218+ result = fplug->detach(new_inode, new_dir);
41219+ if (result != 0)
41220+ warning("nikita-2330", "Cannot detach %lli: %i. %s",
41221+ (unsigned long long)get_inode_oid(new_inode),
41222+ result, possible_leak);
41223+ }
41224+
41225+ if (new_inode != NULL)
41226+ reiser4_update_sd(new_inode);
41227+
41228+ if (result == 0) {
41229+ old_entry->obj = old_inode;
41230+
41231+ dplug->build_entry_key(old_dir,
41232+ &old_name->d_name, &old_entry->key);
41233+
41234+ /* At this stage new name was introduced for
41235+ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
41236+ counters were updated.
41237+
41238+ We want to remove @old_name now. If @old_inode wasn't
41239+ directory this is simple.
41240+ */
41241+ result = dplug->rem_entry(old_dir, old_name, old_entry);
41242+ if (result != 0 && result != -ENOMEM) {
41243+ warning("nikita-2335",
41244+ "Cannot remove old name: %i", result);
41245+ } else {
41246+ result = reiser4_del_nlink(old_inode, old_dir, 0);
41247+ if (result != 0 && result != -ENOMEM) {
41248+ warning("nikita-2337",
41249+ "Cannot drop link on old: %i", result);
41250+ }
41251+ }
41252+
41253+ if (result == 0 && is_dir) {
41254+ /* @old_inode is directory. We also have to update
41255+ dotdot entry. */
41256+ coord_t *dotdot_coord;
41257+
41258+			memset(dataonstack, 0, sizeof *dataonstack);
41259+			memset(dotdot_entry, 0, sizeof *dotdot_entry);
41260+			dotdot_entry->obj = old_dir;
41261+			memset(dotdot_name, 0, sizeof *dotdot_name);
41262+ dotdot_name->d_name.name = "..";
41263+ dotdot_name->d_name.len = 2;
41264+ /*
41265+ * allocate ->d_fsdata on the stack to avoid using
41266+ * reiser4_get_dentry_fsdata(). Locking is not needed,
41267+ * because dentry is private to the current thread.
41268+ */
41269+ dotdot_name->d_fsdata = dataonstack;
41270+ init_lh(dotdot_lh);
41271+
41272+ dotdot_coord = &dataonstack->dec.entry_coord;
41273+ coord_clear_iplug(dotdot_coord);
41274+
41275+ result = reiser4_find_entry(old_inode, dotdot_name,
41276+ dotdot_lh, ZNODE_WRITE_LOCK,
41277+ dotdot_entry);
41278+ if (result == 0) {
41279+ /* replace_name() decreases i_nlink on
41280+ * @old_dir */
41281+ result = replace_name(new_dir,
41282+ old_inode,
41283+ old_dir,
41284+ dotdot_coord, dotdot_lh);
41285+ } else
41286+ result = RETERR(-EIO);
41287+ done_lh(dotdot_lh);
41288+ }
41289+ }
41290+ reiser4_update_dir(new_dir);
41291+ reiser4_update_dir(old_dir);
41292+ reiser4_update_sd(old_inode);
41293+ if (result == 0) {
41294+ file_plugin *fplug;
41295+
41296+ if (new_inode != NULL) {
41297+ /* add safe-link for target file (in case we removed
41298+			 * the last reference to the poor fellow) */
41299+ fplug = inode_file_plugin(new_inode);
41300+ if (new_inode->i_nlink == 0)
41301+ result = safe_link_add(new_inode, SAFE_UNLINK);
41302+ }
41303+ }
41304+ kfree(old_entry);
41305+ context_set_commit_async(ctx);
41306+ reiser4_exit_context(ctx);
41307+ return result;
41308+}
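One implementation detail above is worth isolating: a single kzalloc() provides three dir-entry descriptors, two lock handles, a dentry and an fsdata, and pointer arithmetic carves the pieces out, so every exit path frees everything with one kfree(old_entry). A user-space sketch of the carving pattern (the struct types are invented; the kernel code relies on the carved-out types having compatible alignment):

#include <stdio.h>
#include <stdlib.h>

struct entry { int key; };
struct lockh { int locked; };

int main(void)
{
	struct entry *old_e;
	struct lockh *lh;

	/* one zeroed buffer for three entries and two lock handles */
	old_e = calloc(1, 3 * sizeof(*old_e) + 2 * sizeof(*lh));
	if (old_e == NULL)
		return 1;
	lh = (struct lockh *)(old_e + 3);   /* carve past the entries */

	old_e[1].key = 42;                  /* plays "new_entry" */
	lh[0].locked = 1;                   /* plays "new_lh" */
	printf("%d %d\n", old_e[1].key, lh[0].locked);

	free(old_e);                        /* single point of release */
	return 0;
}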
41309+
41310+#if 0
41311+int reiser4_rename_common(struct inode *old_dir /* directory where @old
41312+ * is located */ ,
41313+ struct dentry *old_name /* old name */ ,
41314+ struct inode *new_dir /* directory where @new
41315+ * is located */ ,
41316+ struct dentry *new_name /* new name */ )
41317+{
41318+ /* From `The Open Group Base Specifications Issue 6'
41319+
41320+ If either the old or new argument names a symbolic link, rename()
41321+ shall operate on the symbolic link itself, and shall not resolve
41322+ the last component of the argument. If the old argument and the new
41323+ argument resolve to the same existing file, rename() shall return
41324+ successfully and perform no other action.
41325+
41326+ [this is done by VFS: vfs_rename()]
41327+
41328+ If the old argument points to the pathname of a file that is not a
41329+ directory, the new argument shall not point to the pathname of a
41330+ directory.
41331+
41332+ [checked by VFS: vfs_rename->may_delete()]
41333+
41334+ If the link named by the new argument exists, it shall
41335+ be removed and old renamed to new. In this case, a link named new
41336+ shall remain visible to other processes throughout the renaming
41337+ operation and refer either to the file referred to by new or old
41338+ before the operation began.
41339+
41340+ [we should assure this]
41341+
41342+ Write access permission is required for
41343+ both the directory containing old and the directory containing new.
41344+
41345+ [checked by VFS: vfs_rename->may_delete(), may_create()]
41346+
41347+ If the old argument points to the pathname of a directory, the new
41348+ argument shall not point to the pathname of a file that is not a
41349+ directory.
41350+
41351+ [checked by VFS: vfs_rename->may_delete()]
41352+
41353+ If the directory named by the new argument exists, it
41354+ shall be removed and old renamed to new. In this case, a link named
41355+ new shall exist throughout the renaming operation and shall refer
41356+ either to the directory referred to by new or old before the
41357+ operation began.
41358+
41359+ [we should assure this]
41360+
41361+ If new names an existing directory, it shall be
41362+ required to be an empty directory.
41363+
41364+ [we should check this]
41365+
41366+ If the old argument points to a pathname of a symbolic link, the
41367+ symbolic link shall be renamed. If the new argument points to a
41368+ pathname of a symbolic link, the symbolic link shall be removed.
41369+
41370+ The new pathname shall not contain a path prefix that names
41371+ old. Write access permission is required for the directory
41372+ containing old and the directory containing new. If the old
41373+ argument points to the pathname of a directory, write access
41374+ permission may be required for the directory named by old, and, if
41375+ it exists, the directory named by new.
41376+
41377+ [checked by VFS: vfs_rename(), vfs_rename_dir()]
41378+
41379+ If the link named by the new argument exists and the file's link
41380+ count becomes 0 when it is removed and no process has the file
41381+ open, the space occupied by the file shall be freed and the file
41382+ shall no longer be accessible. If one or more processes have the
41383+ file open when the last link is removed, the link shall be removed
41384+ before rename() returns, but the removal of the file contents shall
41385+ be postponed until all references to the file are closed.
41386+
41387+ [iput() handles this, but we can do this manually, a la
41388+ reiser4_unlink()]
41389+
41390+ Upon successful completion, rename() shall mark for update the
41391+ st_ctime and st_mtime fields of the parent directory of each file.
41392+
41393+ [N/A]
41394+
41395+ */
41396+ reiser4_context *ctx;
41397+ int result;
41398+ int is_dir; /* is @old_name directory */
41399+ struct inode *old_inode;
41400+ struct inode *new_inode;
41401+ reiser4_dir_entry_desc old_entry;
41402+ reiser4_dir_entry_desc new_entry;
41403+ coord_t *new_coord;
41404+ struct reiser4_dentry_fsdata *new_fsdata;
41405+ lock_handle new_lh;
41406+ dir_plugin *dplug;
41407+ file_plugin *fplug;
41408+
41409+ ctx = reiser4_init_context(old_dir->i_sb);
41410+ if (IS_ERR(ctx))
41411+ return PTR_ERR(ctx);
41412+
41413+ assert("nikita-2318", old_dir != NULL);
41414+ assert("nikita-2319", new_dir != NULL);
41415+ assert("nikita-2320", old_name != NULL);
41416+ assert("nikita-2321", new_name != NULL);
41417+
41418+ old_inode = old_name->d_inode;
41419+ new_inode = new_name->d_inode;
41420+
41421+ dplug = inode_dir_plugin(old_dir);
41422+ fplug = NULL;
41423+
41424+ new_fsdata = reiser4_get_dentry_fsdata(new_name);
41425+ if (IS_ERR(new_fsdata)) {
41426+ result = PTR_ERR(new_fsdata);
41427+ goto exit;
41428+ }
41429+
41430+ new_coord = &new_fsdata->dec.entry_coord;
41431+ coord_clear_iplug(new_coord);
41432+
41433+ is_dir = S_ISDIR(old_inode->i_mode);
41434+
41435+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
41436+
41437+	/* if the target is an existing directory and it's not empty, return error.
41438+
41439+	   This check is done up front, because is_dir_empty() requires a
41440+	   tree traversal and has to be done before locks are taken.
41441+ */
41442+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
41443+ return RETERR(-ENOTEMPTY);
41444+
41445+ result = can_rename(old_dir, old_inode, new_dir, new_inode);
41446+ if (result != 0)
41447+ goto exit;
41448+
41449+ result = hashed_rename_estimate_and_grab(old_dir, old_name,
41450+ new_dir, new_name);
41451+ if (result != 0)
41452+ goto exit;
41453+
41454+ init_lh(&new_lh);
41455+
41456+ /* find entry for @new_name */
41457+ result = reiser4_find_entry(new_dir, new_name, &new_lh,
41458+ ZNODE_WRITE_LOCK, &new_entry);
41459+
41460+ if (IS_CBKERR(result)) {
41461+ done_lh(&new_lh);
41462+ goto exit;
41463+ }
41464+
41465+ reiser4_seal_done(&new_fsdata->dec.entry_seal);
41466+
41467+ /* add or replace name for @old_inode as @new_name */
41468+ if (new_inode != NULL) {
41469+ /* target (@new_name) exists. */
41470+ /* Not clear what to do with objects that are
41471+ both directories and files at the same time. */
41472+ if (result == CBK_COORD_FOUND) {
41473+ result = replace_name(old_inode,
41474+ new_dir,
41475+ new_inode, new_coord, &new_lh);
41476+ if (result == 0)
41477+ fplug = inode_file_plugin(new_inode);
41478+ } else if (result == CBK_COORD_NOTFOUND) {
41479+ /* VFS told us that @new_name is bound to existing
41480+ inode, but we failed to find directory entry. */
41481+ warning("nikita-2324", "Target not found");
41482+ result = RETERR(-ENOENT);
41483+ }
41484+ } else {
41485+		/* target (@new_name) doesn't exist. */
41486+ if (result == CBK_COORD_NOTFOUND)
41487+ result = add_name(old_inode,
41488+ new_dir,
41489+ new_name, new_coord, &new_lh, is_dir);
41490+ else if (result == CBK_COORD_FOUND) {
41491+ /* VFS told us that @new_name is "negative" dentry,
41492+ but we found directory entry. */
41493+ warning("nikita-2331", "Target found unexpectedly");
41494+ result = RETERR(-EIO);
41495+ }
41496+ }
41497+
41498+ assert("nikita-3462", ergo(result == 0,
41499+ old_inode->i_nlink >= 2 + !!is_dir));
41500+
41501+ /* We are done with all modifications to the @new_dir, release lock on
41502+ node. */
41503+ done_lh(&new_lh);
41504+
41505+ if (fplug != NULL) {
41506+ /* detach @new_inode from name-space */
41507+ result = fplug->detach(new_inode, new_dir);
41508+ if (result != 0)
41509+ warning("nikita-2330", "Cannot detach %lli: %i. %s",
41510+ (unsigned long long)get_inode_oid(new_inode),
41511+ result, possible_leak);
41512+ }
41513+
41514+ if (new_inode != NULL)
41515+ reiser4_update_sd(new_inode);
41516+
41517+ if (result == 0) {
41518+ memset(&old_entry, 0, sizeof old_entry);
41519+ old_entry.obj = old_inode;
41520+
41521+ dplug->build_entry_key(old_dir,
41522+ &old_name->d_name, &old_entry.key);
41523+
41524+ /* At this stage new name was introduced for
41525+ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
41526+ counters were updated.
41527+
41528+ We want to remove @old_name now. If @old_inode wasn't
41529+ directory this is simple.
41530+ */
41531+ result = dplug->rem_entry(old_dir, old_name, &old_entry);
41532+ /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
41533+ if (result != 0 && result != -ENOMEM) {
41534+ warning("nikita-2335",
41535+ "Cannot remove old name: %i", result);
41536+ } else {
41537+ result = reiser4_del_nlink(old_inode, old_dir, 0);
41538+ if (result != 0 && result != -ENOMEM) {
41539+ warning("nikita-2337",
41540+ "Cannot drop link on old: %i", result);
41541+ }
41542+ }
41543+
41544+ if (result == 0 && is_dir) {
41545+ /* @old_inode is directory. We also have to update
41546+ dotdot entry. */
41547+ coord_t *dotdot_coord;
41548+ lock_handle dotdot_lh;
41549+ struct dentry dotdot_name;
41550+ reiser4_dir_entry_desc dotdot_entry;
41551+ struct reiser4_dentry_fsdata dataonstack;
41552+ struct reiser4_dentry_fsdata *fsdata;
41553+
41554+ memset(&dataonstack, 0, sizeof dataonstack);
41555+ memset(&dotdot_entry, 0, sizeof dotdot_entry);
41556+ dotdot_entry.obj = old_dir;
41557+ memset(&dotdot_name, 0, sizeof dotdot_name);
41558+ dotdot_name.d_name.name = "..";
41559+ dotdot_name.d_name.len = 2;
41560+ /*
41561+ * allocate ->d_fsdata on the stack to avoid using
41562+ * reiser4_get_dentry_fsdata(). Locking is not needed,
41563+ * because dentry is private to the current thread.
41564+ */
41565+ dotdot_name.d_fsdata = &dataonstack;
41566+ init_lh(&dotdot_lh);
41567+
41568+ fsdata = &dataonstack;
41569+ dotdot_coord = &fsdata->dec.entry_coord;
41570+ coord_clear_iplug(dotdot_coord);
41571+
41572+ result = reiser4_find_entry(old_inode,
41573+ &dotdot_name,
41574+ &dotdot_lh,
41575+ ZNODE_WRITE_LOCK,
41576+ &dotdot_entry);
41577+ if (result == 0) {
41578+ /* replace_name() decreases i_nlink on
41579+ * @old_dir */
41580+ result = replace_name(new_dir,
41581+ old_inode,
41582+ old_dir,
41583+ dotdot_coord, &dotdot_lh);
41584+ } else
41585+ result = RETERR(-EIO);
41586+ done_lh(&dotdot_lh);
41587+ }
41588+ }
41589+ reiser4_update_dir(new_dir);
41590+ reiser4_update_dir(old_dir);
41591+ reiser4_update_sd(old_inode);
41592+ if (result == 0) {
41593+ file_plugin *fplug;
41594+
41595+ if (new_inode != NULL) {
41596+ /* add safe-link for target file (in case we removed
41597+			 * the last reference to the poor fellow) */
41598+ fplug = inode_file_plugin(new_inode);
41599+ if (new_inode->i_nlink == 0)
41600+ result = safe_link_add(new_inode, SAFE_UNLINK);
41601+ }
41602+ }
41603+ exit:
41604+ context_set_commit_async(ctx);
41605+ reiser4_exit_context(ctx);
41606+ return result;
41607+}
41608+#endif
41609diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/acl.h linux-2.6.23/fs/reiser4/plugin/item/acl.h
41610--- linux-2.6.23.orig/fs/reiser4/plugin/item/acl.h 1970-01-01 03:00:00.000000000 +0300
41611+++ linux-2.6.23/fs/reiser4/plugin/item/acl.h 2007-12-04 16:49:30.000000000 +0300
41612@@ -0,0 +1,66 @@
41613+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
41614+
41615+/* Directory entry. */
41616+
41617+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
41618+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
41619+
41620+#include "../../forward.h"
41621+#include "../../dformat.h"
41622+#include "../../kassign.h"
41623+#include "../../key.h"
41624+
41625+#include <linux/fs.h>
41626+#include <linux/dcache.h> /* for struct dentry */
41627+
41628+typedef struct directory_entry_format {
41629+ /* key of object stat-data. It's not necessary to store whole
41630+ key here, because it's always key of stat-data, so minor
41631+ packing locality and offset can be omitted here. But this
41632+ relies on particular key allocation scheme for stat-data, so,
41633+ for extensibility sake, whole key can be stored here.
41634+
41635+ We store key as array of bytes, because we don't want 8-byte
41636+ alignment of dir entries.
41637+ */
41638+ obj_key_id id;
41639+ /* file name. Null terminated string. */
41640+ d8 name[0];
41641+} directory_entry_format;
41642+
41643+void print_de(const char *prefix, coord_t * coord);
41644+int extract_key_de(const coord_t * coord, reiser4_key * key);
41645+int update_key_de(const coord_t * coord, const reiser4_key * key,
41646+ lock_handle * lh);
41647+char *extract_name_de(const coord_t * coord, char *buf);
41648+unsigned extract_file_type_de(const coord_t * coord);
41649+int add_entry_de(struct inode *dir, coord_t * coord,
41650+ lock_handle * lh, const struct dentry *name,
41651+ reiser4_dir_entry_desc * entry);
41652+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
41653+ lock_handle * lh, reiser4_dir_entry_desc * entry);
41654+int max_name_len_de(const struct inode *dir);
41655+
41656+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
41657+
41658+char *extract_dent_name(const coord_t * coord,
41659+ directory_entry_format * dent, char *buf);
41660+
41661+#if REISER4_LARGE_KEY
41662+#define DE_NAME_BUF_LEN (24)
41663+#else
41664+#define DE_NAME_BUF_LEN (16)
41665+#endif
41666+
41667+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
41668+#endif
41669+
41670+/* Make Linus happy.
41671+ Local variables:
41672+ c-indentation-style: "K&R"
41673+ mode-name: "LC"
41674+ c-basic-offset: 8
41675+ tab-width: 8
41676+ fill-column: 120
41677+ End:
41678+*/
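The d8 name[0] member above is the old GNU zero-length-array idiom for a variable-length trailer; C99 spells it name[]. A standalone illustration of allocating and filling such an entry (the two-byte id field is only a stand-in for the packed obj_key_id, which is larger in reality):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* models directory_entry_format: a fixed header followed by an
 * unaligned, NUL-terminated file name */
struct dirent_fmt {
	unsigned char id[2];    /* stand-in for the stat-data key id */
	char name[];            /* C99 flexible array member */
};

int main(void)
{
	const char *name = "README";
	struct dirent_fmt *de = malloc(sizeof(*de) + strlen(name) + 1);

	if (de == NULL)
		return 1;
	memcpy(de->id, "\x01\x02", sizeof(de->id));
	strcpy(de->name, name);
	printf("'%s' takes %zu bytes\n",
	       de->name, sizeof(*de) + strlen(de->name) + 1);
	free(de);
	return 0;
}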
41679diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/blackbox.c linux-2.6.23/fs/reiser4/plugin/item/blackbox.c
41680--- linux-2.6.23.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 03:00:00.000000000 +0300
41681+++ linux-2.6.23/fs/reiser4/plugin/item/blackbox.c 2007-12-04 16:49:30.000000000 +0300
41682@@ -0,0 +1,142 @@
41683+/* Copyright 2003 by Hans Reiser, licensing governed by
41684+ * reiser4/README */
41685+
41686+/* Black box item implementation */
41687+
41688+#include "../../forward.h"
41689+#include "../../debug.h"
41690+#include "../../dformat.h"
41691+#include "../../kassign.h"
41692+#include "../../coord.h"
41693+#include "../../tree.h"
41694+#include "../../lock.h"
41695+
41696+#include "blackbox.h"
41697+#include "item.h"
41698+#include "../plugin.h"
41699+
41700+int
41701+store_black_box(reiser4_tree * tree,
41702+ const reiser4_key * key, void *data, int length)
41703+{
41704+ int result;
41705+ reiser4_item_data idata;
41706+ coord_t coord;
41707+ lock_handle lh;
41708+
41709+ memset(&idata, 0, sizeof idata);
41710+
41711+ idata.data = data;
41712+ idata.user = 0;
41713+ idata.length = length;
41714+ idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
41715+
41716+ init_lh(&lh);
41717+ result = insert_by_key(tree, key,
41718+ &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
41719+
41720+ assert("nikita-3413",
41721+ ergo(result == 0,
41722+ WITH_COORD(&coord,
41723+ item_length_by_coord(&coord) == length)));
41724+
41725+ done_lh(&lh);
41726+ return result;
41727+}
41728+
41729+int
41730+load_black_box(reiser4_tree * tree,
41731+ reiser4_key * key, void *data, int length, int exact)
41732+{
41733+ int result;
41734+ coord_t coord;
41735+ lock_handle lh;
41736+
41737+ init_lh(&lh);
41738+ result = coord_by_key(tree, key,
41739+ &coord, &lh, ZNODE_READ_LOCK,
41740+ exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
41741+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
41742+
41743+ if (result == 0) {
41744+ int ilen;
41745+
41746+ result = zload(coord.node);
41747+ if (result == 0) {
41748+ ilen = item_length_by_coord(&coord);
41749+ if (ilen <= length) {
41750+ memcpy(data, item_body_by_coord(&coord), ilen);
41751+ unit_key_by_coord(&coord, key);
41752+ } else if (exact) {
41753+ /*
41754+ * item is larger than buffer provided by the
41755+ * user. Only issue a warning if @exact is
41756+ * set. If @exact is false, we are iterating
41757+ * over all safe-links and here we are reaching
41758+ * the end of the iteration.
41759+ */
41760+ warning("nikita-3415",
41761+ "Wrong black box length: %i > %i",
41762+ ilen, length);
41763+ result = RETERR(-EIO);
41764+ }
41765+ zrelse(coord.node);
41766+ }
41767+ }
41768+
41769+ done_lh(&lh);
41770+ return result;
41771+
41772+}
41773+
41774+int
41775+update_black_box(reiser4_tree * tree,
41776+ const reiser4_key * key, void *data, int length)
41777+{
41778+ int result;
41779+ coord_t coord;
41780+ lock_handle lh;
41781+
41782+ init_lh(&lh);
41783+ result = coord_by_key(tree, key,
41784+ &coord, &lh, ZNODE_READ_LOCK,
41785+ FIND_EXACT,
41786+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
41787+ if (result == 0) {
41788+ int ilen;
41789+
41790+ result = zload(coord.node);
41791+ if (result == 0) {
41792+ ilen = item_length_by_coord(&coord);
41793+ if (length <= ilen) {
41794+ memcpy(item_body_by_coord(&coord), data,
41795+ length);
41796+ } else {
41797+ warning("nikita-3437",
41798+ "Wrong black box length: %i < %i",
41799+ ilen, length);
41800+ result = RETERR(-EIO);
41801+ }
41802+ zrelse(coord.node);
41803+ }
41804+ }
41805+
41806+ done_lh(&lh);
41807+ return result;
41808+
41809+}
41810+
41811+int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
41812+{
41813+ return reiser4_cut_tree(tree, key, key, NULL, 1);
41814+}
41815+
41816+/* Make Linus happy.
41817+ Local variables:
41818+ c-indentation-style: "K&R"
41819+ mode-name: "LC"
41820+ c-basic-offset: 8
41821+ tab-width: 8
41822+ fill-column: 120
41823+ End:
41824+*/
41825diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/blackbox.h linux-2.6.23/fs/reiser4/plugin/item/blackbox.h
41826--- linux-2.6.23.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 03:00:00.000000000 +0300
41827+++ linux-2.6.23/fs/reiser4/plugin/item/blackbox.h 2007-12-04 16:49:30.000000000 +0300
41828@@ -0,0 +1,33 @@
41829+/* Copyright 2003 by Hans Reiser, licensing governed by
41830+ * reiser4/README */
41831+
41832+/* "Black box" entry to fixed-width contain user supplied data */
41833+
41834+#if !defined( __FS_REISER4_BLACK_BOX_H__ )
41835+#define __FS_REISER4_BLACK_BOX_H__
41836+
41837+#include "../../forward.h"
41838+#include "../../dformat.h"
41839+#include "../../kassign.h"
41840+#include "../../key.h"
41841+
41842+extern int store_black_box(reiser4_tree * tree,
41843+ const reiser4_key * key, void *data, int length);
41844+extern int load_black_box(reiser4_tree * tree,
41845+ reiser4_key * key, void *data, int length, int exact);
41846+extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
41847+extern int update_black_box(reiser4_tree * tree,
41848+ const reiser4_key * key, void *data, int length);
41849+
41850+/* __FS_REISER4_BLACK_BOX_H__ */
41851+#endif
41852+
41853+/* Make Linus happy.
41854+ Local variables:
41855+ c-indentation-style: "K&R"
41856+ mode-name: "LC"
41857+ c-basic-offset: 8
41858+ tab-width: 8
41859+ fill-column: 120
41860+ End:
41861+*/
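
For orientation, a sketch of the intended calling sequence for this API. It is illustrative only: it assumes kernel context with a valid reiser4_tree and a key already built by the caller, uses only the four functions declared above, and abbreviates error handling (my_record and blackbox_roundtrip are made-up names).

	/* hypothetical caller, for illustration only */
	struct my_record { __u64 a; __u64 b; };	/* fixed-width payload */

	static int blackbox_roundtrip(reiser4_tree *tree, const reiser4_key *key)
	{
		struct my_record out = { .a = 1, .b = 2 };
		struct my_record in;
		reiser4_key found = *key;
		int ret;

		ret = store_black_box(tree, key, &out, sizeof out);
		if (ret)
			return ret;
		/* exact lookup: the stored item must fit the buffer,
		   otherwise load_black_box() warns and returns -EIO */
		ret = load_black_box(tree, &found, &in, sizeof in, 1);
		if (ret)
			return ret;
		/* overwrite in place; data longer than the item is -EIO */
		out.b = 3;
		ret = update_black_box(tree, key, &out, sizeof out);
		if (ret)
			return ret;
		return kill_black_box(tree, key);	/* remove the item */
	}
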
41862diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/cde.c linux-2.6.23/fs/reiser4/plugin/item/cde.c
41863--- linux-2.6.23.orig/fs/reiser4/plugin/item/cde.c 1970-01-01 03:00:00.000000000 +0300
41864+++ linux-2.6.23/fs/reiser4/plugin/item/cde.c 2007-12-04 16:49:30.000000000 +0300
41865@@ -0,0 +1,1008 @@
41866+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
41867+
41868+/* Directory entry implementation */
41869+
41870+/* DESCRIPTION:
41871+
41872+ This is "compound" directory item plugin implementation. This directory
41873+ item type is compound (as opposed to the "simple directory item" in
41874+ fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
41875+ entries.
41876+
41877+ The reason behind this decision is disk space efficiency: all directory
41878+ entries inside the same directory have an identical fragment in their
41879+ keys. This, of course, depends on the key assignment policy. In our default
41880+ key assignment policy, all directory entries have the same locality, which
41881+ is equal to the object id of their directory.
41882+
41883+ Composing a directory item out of several directory entries for the same
41884+ directory allows us to store said key fragment only once. That is, this is
41885+ some ad hoc form of key compression (stem compression) that is implemented
41886+ here, because general key compression is not supposed to be implemented in
41887+ v4.0.
41888+
41889+ Another decision that was made regarding all directory item plugins is
41890+ that they will store entry keys unaligned. This is for the sake of disk
41891+ space efficiency again.
41892+
41893+ It should be noted that storing keys unaligned increases CPU consumption,
41894+ at least on some architectures.
41895+
41896+ Internal on-disk structure of the compound directory item is the following:
41897+
41898+ HEADER cde_item_format. Here the number of entries is stored.
41899+ ENTRY_HEADER_0 cde_unit_header. Here part of the entry key and
41900+ ENTRY_HEADER_1 the offset of the entry body are stored.
41901+ ENTRY_HEADER_2 (basically the two last parts of the key)
41902+ ...
41903+ ENTRY_HEADER_N
41904+ ENTRY_BODY_0 directory_entry_format. Here part of the stat data
41905+ ENTRY_BODY_1 key and the NUL-terminated name are stored.
41906+ ENTRY_BODY_2 (part of the stat data key in the
41907+ sense that since all SDs have
41908+ zero offset, the offset is not
41909+ stored on disk).
41910+ ...
41911+ ENTRY_BODY_N
41912+
41913+ When it comes to balancing, each directory entry in a compound directory
41914+ item is a unit, that is, something that can be cut from one item and pasted
41915+ into another item of the same type. Handling of unit cut and paste is the
41916+ major reason for the complexity of the code below.
41917+
41918+*/
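
To make the layout above concrete, here is a small user-space model of the item. It is a sketch: it assumes a little-endian host, replaces d16/de_id and the unaligned accessors with plain packed integers, and uses made-up sizes and offsets; the real types are declared in cde.h further down.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* simplified stand-ins for cde_unit_header / cde_item_format */
	struct m_unit_header { uint64_t hash[2]; uint16_t offset; }
		__attribute__((packed));
	struct m_item_format { uint16_t num_of_entries;
			       struct m_unit_header entry[]; }
		__attribute__((packed));

	/* body address of entry @idx, as offset_of()/entry_at() compute it */
	static char *m_entry_at(char *item, int idx)
	{
		struct m_item_format *f = (struct m_item_format *)item;
		return item + f->entry[idx].offset;
	}

	int main(void)
	{
		char item[128];
		struct m_item_format *f = (struct m_item_format *)item;
		uint16_t body0 = sizeof *f + 2 * sizeof(struct m_unit_header);

		f->num_of_entries = 2;
		/* bodies are laid out after the header array, back to back */
		f->entry[0].offset = body0;
		f->entry[1].offset = body0 + 16;  /* first body: 16 bytes, say */
		strcpy(m_entry_at(item, 0), "body-0");
		strcpy(m_entry_at(item, 1), "body-1");
		printf("%s %s\n", m_entry_at(item, 0), m_entry_at(item, 1));
		return 0;
	}
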
41919+
41920+#include "../../forward.h"
41921+#include "../../debug.h"
41922+#include "../../dformat.h"
41923+#include "../../kassign.h"
41924+#include "../../key.h"
41925+#include "../../coord.h"
41926+#include "sde.h"
41927+#include "cde.h"
41928+#include "item.h"
41929+#include "../node/node.h"
41930+#include "../plugin.h"
41931+#include "../../znode.h"
41932+#include "../../carry.h"
41933+#include "../../tree.h"
41934+#include "../../inode.h"
41935+
41936+#include <linux/fs.h> /* for struct inode */
41937+#include <linux/dcache.h> /* for struct dentry */
41938+#include <linux/quotaops.h>
41939+
41940+#if 0
41941+#define CHECKME(coord) \
41942+({ \
41943+ const char *message; \
41944+ coord_t dup; \
41945+ \
41946+ coord_dup_nocheck(&dup, (coord)); \
41947+ dup.unit_pos = 0; \
41948+ assert("nikita-2871", cde_check(&dup, &message) == 0); \
41949+})
41950+#else
41951+#define CHECKME(coord) noop
41952+#endif
41953+
41954+/* return body of compound directory item at @coord */
41955+static inline cde_item_format *formatted_at(const coord_t * coord)
41956+{
41957+ assert("nikita-1282", coord != NULL);
41958+ return item_body_by_coord(coord);
41959+}
41960+
41961+/* return entry header at @coord */
41962+static inline cde_unit_header *header_at(const coord_t *
41963+ coord /* coord of item */ ,
41964+ int idx /* index of unit */ )
41965+{
41966+ assert("nikita-1283", coord != NULL);
41967+ return &formatted_at(coord)->entry[idx];
41968+}
41969+
41970+/* return number of units in compound directory item at @coord */
41971+static int units(const coord_t * coord /* coord of item */ )
41972+{
41973+ return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
41974+}
41975+
41976+/* return offset of the body of @idx-th entry in @coord */
41977+static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
41978+ int idx /* index of unit */ )
41979+{
41980+ if (idx < units(coord))
41981+ return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
41982+ else if (idx == units(coord))
41983+ return item_length_by_coord(coord);
41984+ else
41985+ impossible("nikita-1308", "Wrong idx");
41986+ return 0;
41987+}
41988+
41989+/* set offset of the body of @idx-th entry in @coord */
41990+static void set_offset(const coord_t * coord /* coord of item */ ,
41991+ int idx /* index of unit */ ,
41992+ unsigned int offset /* new offset */ )
41993+{
41994+ put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
41995+}
41996+
41997+static void adj_offset(const coord_t * coord /* coord of item */ ,
41998+ int idx /* index of unit */ ,
41999+ int delta /* offset change */ )
42000+{
42001+ d16 *doffset;
42002+ __u16 offset;
42003+
42004+ doffset = &header_at(coord, idx)->offset;
42005+ offset = le16_to_cpu(get_unaligned(doffset));
42006+ offset += delta;
42007+ put_unaligned(cpu_to_le16((__u16) offset), doffset);
42008+}
42009+
42010+/* return pointer to @offset-th byte from the beginning of @coord */
42011+static char *address(const coord_t * coord /* coord of item */ ,
42012+ int offset)
42013+{
42014+ return ((char *)item_body_by_coord(coord)) + offset;
42015+}
42016+
42017+/* return pointer to the body of @idx-th entry in @coord */
42018+static directory_entry_format *entry_at(const coord_t * coord /* coord of
42019+ * item */ ,
42020+ int idx /* index of unit */ )
42021+{
42022+ return (directory_entry_format *) address(coord,
42023+ (int)offset_of(coord, idx));
42024+}
42025+
42026+/* return index of the unit referenced by @coord */
42027+static int idx_of(const coord_t * coord /* coord of item */ )
42028+{
42029+ assert("nikita-1285", coord != NULL);
42030+ return coord->unit_pos;
42031+}
42032+
42033+/* find position where entry with @entry_key would be inserted into @coord */
42034+static int find(const coord_t * coord /* coord of item */ ,
42035+ const reiser4_key * entry_key /* key to look for */ ,
42036+ cmp_t * last /* result of last comparison */ )
42037+{
42038+ int entries;
42039+
42040+ int left;
42041+ int right;
42042+
42043+ cde_unit_header *header;
42044+
42045+ assert("nikita-1295", coord != NULL);
42046+ assert("nikita-1296", entry_key != NULL);
42047+ assert("nikita-1297", last != NULL);
42048+
42049+ entries = units(coord);
42050+ left = 0;
42051+ right = entries - 1;
42052+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
42053+ int median;
42054+
42055+ median = (left + right) >> 1;
42056+
42057+ header = header_at(coord, median);
42058+ *last = de_id_key_cmp(&header->hash, entry_key);
42059+ switch (*last) {
42060+ case LESS_THAN:
42061+ left = median;
42062+ break;
42063+ case GREATER_THAN:
42064+ right = median;
42065+ break;
42066+ case EQUAL_TO:{
42067+ do {
42068+ median--;
42069+ header--;
42070+ } while (median >= 0 &&
42071+ de_id_key_cmp(&header->hash,
42072+ entry_key) == EQUAL_TO);
42073+ return median + 1;
42074+ }
42075+ }
42076+ }
42077+ header = header_at(coord, left);
42078+ for (; left < entries; ++left, ++header) {
42079+ prefetch(header + 1);
42080+ *last = de_id_key_cmp(&header->hash, entry_key);
42081+ if (*last != LESS_THAN)
42082+ break;
42083+ }
42084+ if (left < entries)
42085+ return left;
42086+ else
42087+ return RETERR(-ENOENT);
42088+
42089+}
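
The loop above is a bounded binary search that drops to a linear scan once the window shrinks below REISER4_SEQ_SEARCH_BREAK, and on an exact hit walks left so that the leftmost of several equal hashes is returned. A standalone model of the same control flow over a sorted int array (illustrative only: the real function compares de_id hashes and also reports the last comparison via @last):

	#include <stdio.h>

	#define SEQ_SEARCH_BREAK 3	/* stands in for REISER4_SEQ_SEARCH_BREAK */

	/* index of the leftmost element >= key, or -1 if all are smaller */
	static int model_find(const int *v, int n, int key)
	{
		int left = 0, right = n - 1;

		while (right - left >= SEQ_SEARCH_BREAK) {
			int m = (left + right) >> 1;

			if (v[m] < key)
				left = m;
			else if (v[m] > key)
				right = m;
			else {
				/* exact hit: roll back to leftmost duplicate */
				while (m >= 0 && v[m] == key)
					m--;
				return m + 1;
			}
		}
		for (; left < n; ++left)	/* sequential tail, as above */
			if (v[left] >= key)
				return left;
		return -1;			/* models RETERR(-ENOENT) */
	}

	int main(void)
	{
		int v[] = { 1, 3, 3, 3, 7, 9, 11, 15 };

		printf("%d %d\n", model_find(v, 8, 3), model_find(v, 8, 4));
		/* prints "1 4": leftmost 3, then first element >= 4 */
		return 0;
	}
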
42090+
42091+/* expand @coord so as to accommodate insertion of @no new entries starting
42092+ from @pos, with total body size @size. */
42093+static int expand_item(const coord_t * coord /* coord of item */ ,
42094+ int pos /* unit position */ , int no /* number of new
42095+ * units*/ ,
42096+ int size /* total size of new units' data */ ,
42097+ unsigned int data_size /* free space already reserved
42098+ * in the item for insertion */ )
42099+{
42100+ int entries;
42101+ cde_unit_header *header;
42102+ char *dent;
42103+ int i;
42104+
42105+ assert("nikita-1310", coord != NULL);
42106+ assert("nikita-1311", pos >= 0);
42107+ assert("nikita-1312", no > 0);
42108+ assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
42109+ assert("nikita-1343",
42110+ item_length_by_coord(coord) >=
42111+ (int)(size + data_size + no * sizeof *header));
42112+
42113+ entries = units(coord);
42114+
42115+ if (pos == entries)
42116+ dent = address(coord, size);
42117+ else
42118+ dent = (char *)entry_at(coord, pos);
42119+	/* place where the new header will go */
42120+ header = header_at(coord, pos);
42121+ /* free space for new entry headers */
42122+ memmove(header + no, header,
42123+ (unsigned)(address(coord, size) - (char *)header));
42124+	/* if adding to the end, initialise the first new header */
42125+ if (pos == entries) {
42126+ set_offset(coord, pos, (unsigned)size);
42127+ }
42128+
42129+ /* adjust entry pointer and size */
42130+ dent = dent + no * sizeof *header;
42131+ size += no * sizeof *header;
42132+ /* free space for new entries */
42133+ memmove(dent + data_size, dent,
42134+ (unsigned)(address(coord, size) - dent));
42135+
42136+ /* increase counter */
42137+ entries += no;
42138+ put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
42139+
42140+ /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
42141+ bytes. */
42142+ for (i = 0; i <= pos; ++i)
42143+ adj_offset(coord, i, no * sizeof *header);
42144+ /* [ pos + no ... +\infty ) entries were shifted by ( no *
42145+ sizeof *header + data_size ) bytes */
42146+ for (i = pos + no; i < entries; ++i)
42147+ adj_offset(coord, i, no * sizeof *header + data_size);
42148+ return 0;
42149+}
42150+
42151+/* insert new @entry into item */
42152+static int expand(const coord_t * coord /* coord of item */ ,
42153+ struct cde_entry * entry /* entry to insert */ ,
42154+ int len /* length of @entry data */ ,
42155+ int *pos /* position to insert */ ,
42156+ reiser4_dir_entry_desc * dir_entry /* parameters for new
42157+ * entry */ )
42158+{
42159+ cmp_t cmp_res;
42160+ int datasize;
42161+
42162+ *pos = find(coord, &dir_entry->key, &cmp_res);
42163+ if (*pos < 0)
42164+ *pos = units(coord);
42165+
42166+ datasize = sizeof(directory_entry_format);
42167+ if (is_longname(entry->name->name, entry->name->len))
42168+ datasize += entry->name->len + 1;
42169+
42170+ expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
42171+ datasize);
42172+ return 0;
42173+}
42174+
42175+/* paste body of @entry into item */
42176+static int paste_entry(const coord_t * coord /* coord of item */ ,
42177+ struct cde_entry * entry /* new entry */ ,
42178+ int pos /* position to insert */ ,
42179+ reiser4_dir_entry_desc * dir_entry /* parameters for
42180+ * new entry */ )
42181+{
42182+ cde_unit_header *header;
42183+ directory_entry_format *dent;
42184+ const char *name;
42185+ int len;
42186+
42187+ header = header_at(coord, pos);
42188+ dent = entry_at(coord, pos);
42189+
42190+ build_de_id_by_key(&dir_entry->key, &header->hash);
42191+ build_inode_key_id(entry->obj, &dent->id);
42192+ /* AUDIT unsafe strcpy() operation! It should be replaced with
42193+ much less CPU hungry
42194+ memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len );
42195+
42196+	   More importantly, there should be a way to figure out the
42197+	   amount of space in dent -> name and to check that we are
42198+	   not going to overwrite more than we are supposed to */
42199+ name = entry->name->name;
42200+ len = entry->name->len;
42201+ if (is_longname(name, len)) {
42202+ strcpy((unsigned char *)dent->name, name);
42203+ put_unaligned(0, &dent->name[len]);
42204+ }
42205+ return 0;
42206+}
42207+
42208+/* estimate how much space is necessary in the item to insert/paste the set
42209+ of entries described in @data. */
42210+int estimate_cde(const coord_t * coord /* coord of item */ ,
42211+ const reiser4_item_data * data /* parameters for new item */ )
42212+{
42213+ struct cde_entry_data *e;
42214+ int result;
42215+ int i;
42216+
42217+ e = (struct cde_entry_data *) data->data;
42218+
42219+ assert("nikita-1288", e != NULL);
42220+ assert("nikita-1289", e->num_of_entries >= 0);
42221+
42222+ if (coord == NULL)
42223+ /* insert */
42224+ result = sizeof(cde_item_format);
42225+ else
42226+ /* paste */
42227+ result = 0;
42228+
42229+ result += e->num_of_entries *
42230+ (sizeof(cde_unit_header) + sizeof(directory_entry_format));
42231+ for (i = 0; i < e->num_of_entries; ++i) {
42232+ const char *name;
42233+ int len;
42234+
42235+ name = e->entry[i].name->name;
42236+ len = e->entry[i].name->len;
42237+ assert("nikita-2054", strlen(name) == len);
42238+ if (is_longname(name, len))
42239+ result += len + 1;
42240+ }
42241+ ((reiser4_item_data *) data)->length = result;
42242+ return result;
42243+}
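
As a worked example of the estimate (symbolic, since the header sizes depend on the key format): inserting a brand-new item carrying one entry whose name has len == 20, and which is_longname() classifies as long, costs sizeof(cde_item_format) + sizeof(cde_unit_header) + sizeof(directory_entry_format) + 20 + 1 bytes. Pasting the same entry into an existing item saves the sizeof(cde_item_format) term. A short name contributes no name bytes at all, because, as paste_entry() above shows, short names are not stored in the entry body (they are recoverable from the entry key).
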
42244+
42245+/* ->nr_units() method for this item plugin. */
42246+pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
42247+{
42248+ return units(coord);
42249+}
42250+
42251+/* ->unit_key() method for this item plugin. */
42252+reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
42253+ reiser4_key * key /* resulting key */ )
42254+{
42255+ assert("nikita-1452", coord != NULL);
42256+ assert("nikita-1345", idx_of(coord) < units(coord));
42257+ assert("nikita-1346", key != NULL);
42258+
42259+ item_key_by_coord(coord, key);
42260+ extract_key_from_de_id(extract_dir_id_from_key(key),
42261+ &header_at(coord, idx_of(coord))->hash, key);
42262+ return key;
42263+}
42264+
42265+/* mergeable_cde(): implementation of ->mergeable() item method.
42266+
42267+ Two directory items are mergeable iff they are from the same
42268+ directory. That simple.
42269+
42270+*/
42271+int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
42272+ const coord_t * p2 /* coord of second item */ )
42273+{
42274+ reiser4_key k1;
42275+ reiser4_key k2;
42276+
42277+ assert("nikita-1339", p1 != NULL);
42278+ assert("nikita-1340", p2 != NULL);
42279+
42280+ return
42281+ (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
42282+ (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
42283+ extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
42284+
42285+}
42286+
42287+/* ->max_key_inside() method for this item plugin. */
42288+reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
42289+ reiser4_key * result /* resulting key */ )
42290+{
42291+ assert("nikita-1342", coord != NULL);
42292+
42293+ item_key_by_coord(coord, result);
42294+ set_key_ordering(result, get_key_ordering(reiser4_max_key()));
42295+ set_key_fulloid(result, get_key_fulloid(reiser4_max_key()));
42296+ set_key_offset(result, get_key_offset(reiser4_max_key()));
42297+ return result;
42298+}
42299+
42300+/* @data contains data which is to be put into the tree */
42301+int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
42302+ const reiser4_key * key /* key to check */ ,
42303+ const reiser4_item_data * data /* parameters of new
42304+ * item/unit being
42305+ * created */ )
42306+{
42307+ reiser4_key item_key;
42308+
42309+ /* FIXME-VS: do not rely on anything but iplug field of @data. Only
42310+ data->iplug is initialized */
42311+ assert("vs-457", data && data->iplug);
42312+/* assert( "vs-553", data -> user == 0 );*/
42313+ item_key_by_coord(coord, &item_key);
42314+
42315+ return (item_plugin_by_coord(coord) == data->iplug) &&
42316+ (extract_dir_id_from_key(&item_key) ==
42317+ extract_dir_id_from_key(key));
42318+}
42319+
42320+#if REISER4_DEBUG
42321+/* cde_check ->check() method for compressed directory items
42322+
42323+ used for debugging; every item plugin should implement here the most
42324+ complete consistency check of the item that its author can
42325+ construct
42326+*/
42327+int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
42328+ const char **error /* where to store error message */)
42329+{
42330+ int i;
42331+ int result;
42332+ char *item_start;
42333+ char *item_end;
42334+ reiser4_key key;
42335+
42336+ coord_t c;
42337+
42338+ assert("nikita-1357", coord != NULL);
42339+ assert("nikita-1358", error != NULL);
42340+
42341+ if (!ergo(coord->item_pos != 0,
42342+ is_dot_key(item_key_by_coord(coord, &key)))) {
42343+ *error = "CDE doesn't start with dot";
42344+ return -1;
42345+ }
42346+ item_start = item_body_by_coord(coord);
42347+ item_end = item_start + item_length_by_coord(coord);
42348+
42349+ coord_dup(&c, coord);
42350+ result = 0;
42351+ for (i = 0; i < units(coord); ++i) {
42352+ directory_entry_format *entry;
42353+
42354+ if ((char *)(header_at(coord, i) + 1) >
42355+ item_end - units(coord) * sizeof *entry) {
42356+ *error = "CDE header is out of bounds";
42357+ result = -1;
42358+ break;
42359+ }
42360+ entry = entry_at(coord, i);
42361+ if ((char *)entry < item_start + sizeof(cde_item_format)) {
42362+ *error = "CDE header is too low";
42363+ result = -1;
42364+ break;
42365+ }
42366+ if ((char *)(entry + 1) > item_end) {
42367+ *error = "CDE header is too high";
42368+ result = -1;
42369+ break;
42370+ }
42371+ }
42372+
42373+ return result;
42374+}
42375+#endif
42376+
42377+/* ->init() method for this item plugin. */
42378+int init_cde(coord_t * coord /* coord of item */ ,
42379+ coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */
42380+ UNUSED_ARG)
42381+{
42382+ put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
42383+ return 0;
42384+}
42385+
42386+/* ->lookup() method for this item plugin. */
42387+lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
42388+ lookup_bias bias /* search bias */ ,
42389+ coord_t * coord /* coord of item to lookup in */ )
42390+{
42391+ cmp_t last_comp;
42392+ int pos;
42393+
42394+ reiser4_key utmost_key;
42395+
42396+ assert("nikita-1293", coord != NULL);
42397+ assert("nikita-1294", key != NULL);
42398+
42399+ CHECKME(coord);
42400+
42401+ if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
42402+ coord->unit_pos = 0;
42403+ coord->between = BEFORE_UNIT;
42404+ return CBK_COORD_NOTFOUND;
42405+ }
42406+ pos = find(coord, key, &last_comp);
42407+ if (pos >= 0) {
42408+ coord->unit_pos = (int)pos;
42409+ switch (last_comp) {
42410+ case EQUAL_TO:
42411+ coord->between = AT_UNIT;
42412+ return CBK_COORD_FOUND;
42413+ case GREATER_THAN:
42414+ coord->between = BEFORE_UNIT;
42415+ return RETERR(-ENOENT);
42416+ case LESS_THAN:
42417+ default:
42418+ impossible("nikita-1298", "Broken find");
42419+ return RETERR(-EIO);
42420+ }
42421+ } else {
42422+ coord->unit_pos = units(coord) - 1;
42423+ coord->between = AFTER_UNIT;
42424+ return (bias ==
42425+ FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
42426+ CBK_COORD_NOTFOUND;
42427+ }
42428+}
42429+
42430+/* ->paste() method for this item plugin. */
42431+int paste_cde(coord_t * coord /* coord of item */ ,
42432+ reiser4_item_data * data /* parameters of new unit being
42433+ * inserted */ ,
42434+ carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
42435+{
42436+ struct cde_entry_data *e;
42437+ int result;
42438+ int i;
42439+
42440+ CHECKME(coord);
42441+ e = (struct cde_entry_data *) data->data;
42442+
42443+ result = 0;
42444+ for (i = 0; i < e->num_of_entries; ++i) {
42445+ int pos;
42446+ int phantom_size;
42447+
42448+ phantom_size = data->length;
42449+ if (units(coord) == 0)
42450+ phantom_size -= sizeof(cde_item_format);
42451+
42452+ result =
42453+ expand(coord, e->entry + i, phantom_size, &pos, data->arg);
42454+ if (result != 0)
42455+ break;
42456+ result = paste_entry(coord, e->entry + i, pos, data->arg);
42457+ if (result != 0)
42458+ break;
42459+ }
42460+ CHECKME(coord);
42461+ return result;
42462+}
42463+
42464+/* amount of space occupied by the item header and by all entries up to and
42465+ including @idx, both headers and bodies. */
42466+static unsigned int part_size(const coord_t * coord /* coord of item */ ,
42467+ int idx /* index of unit */ )
42468+{
42469+ assert("nikita-1299", coord != NULL);
42470+ assert("nikita-1300", idx < (int)units(coord));
42471+
42472+ return sizeof(cde_item_format) +
42473+ (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
42474+ idx + 1) -
42475+ offset_of(coord, 0);
42476+}
42477+
42478+/* how many units of @source, but not more than @want, can be merged with
42479+ the item in @target node. If pend == append, we try to append the last
42480+ item of @target with the first units of @source. If pend == prepend, we
42481+ try to "prepend" the first item in @target with the last units of
42482+ @source. @target node has @free_space bytes of free space. The total size
42483+ of those units is returned via @size */
42484+int can_shift_cde(unsigned free_space /* free space in item */ ,
42485+ coord_t * coord /* coord of source item */ ,
42486+ znode * target /* target node */ ,
42487+ shift_direction pend /* shift direction */ ,
42488+ unsigned *size /* resulting number of shifted bytes */ ,
42489+ unsigned want /* maximal number of bytes to shift */ )
42490+{
42491+ int shift;
42492+
42493+ CHECKME(coord);
42494+ if (want == 0) {
42495+ *size = 0;
42496+ return 0;
42497+ }
42498+
42499+ /* pend == SHIFT_LEFT <==> shifting to the left */
42500+ if (pend == SHIFT_LEFT) {
42501+ for (shift = min((int)want - 1, units(coord)); shift >= 0;
42502+ --shift) {
42503+ *size = part_size(coord, shift);
42504+ if (target != NULL)
42505+ *size -= sizeof(cde_item_format);
42506+ if (*size <= free_space)
42507+ break;
42508+ }
42509+ shift = shift + 1;
42510+ } else {
42511+ int total_size;
42512+
42513+ assert("nikita-1301", pend == SHIFT_RIGHT);
42514+
42515+ total_size = item_length_by_coord(coord);
42516+ for (shift = units(coord) - want - 1; shift < units(coord) - 1;
42517+ ++shift) {
42518+ *size = total_size - part_size(coord, shift);
42519+ if (target == NULL)
42520+ *size += sizeof(cde_item_format);
42521+ if (*size <= free_space)
42522+ break;
42523+ }
42524+ shift = units(coord) - shift - 1;
42525+ }
42526+ if (shift == 0)
42527+ *size = 0;
42528+ CHECKME(coord);
42529+ return shift;
42530+}
42531+
42532+/* ->copy_units() method for this item plugin. */
42533+void copy_units_cde(coord_t * target /* coord of target item */ ,
42534+ coord_t * source /* coord of source item */ ,
42535+ unsigned from /* starting unit */ ,
42536+ unsigned count /* how many units to copy */ ,
42537+ shift_direction where_is_free_space /* shift direction */ ,
42538+ unsigned free_space /* free space in item */ )
42539+{
42540+ char *header_from;
42541+ char *header_to;
42542+
42543+ char *entry_from;
42544+ char *entry_to;
42545+
42546+ int pos_in_target;
42547+ int data_size;
42548+ int data_delta;
42549+ int i;
42550+
42551+ assert("nikita-1303", target != NULL);
42552+ assert("nikita-1304", source != NULL);
42553+ assert("nikita-1305", (int)from < units(source));
42554+ assert("nikita-1307", (int)(from + count) <= units(source));
42555+
42556+ if (where_is_free_space == SHIFT_LEFT) {
42557+ assert("nikita-1453", from == 0);
42558+ pos_in_target = units(target);
42559+ } else {
42560+ assert("nikita-1309", (int)(from + count) == units(source));
42561+ pos_in_target = 0;
42562+ memmove(item_body_by_coord(target),
42563+ (char *)item_body_by_coord(target) + free_space,
42564+ item_length_by_coord(target) - free_space);
42565+ }
42566+
42567+ CHECKME(target);
42568+ CHECKME(source);
42569+
42570+ /* expand @target */
42571+ data_size =
42572+ offset_of(source, (int)(from + count)) - offset_of(source,
42573+ (int)from);
42574+
42575+ if (units(target) == 0)
42576+ free_space -= sizeof(cde_item_format);
42577+
42578+ expand_item(target, pos_in_target, (int)count,
42579+ (int)(item_length_by_coord(target) - free_space),
42580+ (unsigned)data_size);
42581+
42582+ /* copy first @count units of @source into @target */
42583+ data_delta =
42584+ offset_of(target, pos_in_target) - offset_of(source, (int)from);
42585+
42586+ /* copy entries */
42587+ entry_from = (char *)entry_at(source, (int)from);
42588+ entry_to = (char *)entry_at(source, (int)(from + count));
42589+ memmove(entry_at(target, pos_in_target), entry_from,
42590+ (unsigned)(entry_to - entry_from));
42591+
42592+ /* copy headers */
42593+ header_from = (char *)header_at(source, (int)from);
42594+ header_to = (char *)header_at(source, (int)(from + count));
42595+ memmove(header_at(target, pos_in_target), header_from,
42596+ (unsigned)(header_to - header_from));
42597+
42598+ /* update offsets */
42599+ for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
42600+ adj_offset(target, i, data_delta);
42601+ CHECKME(target);
42602+ CHECKME(source);
42603+}
42604+
42605+/* ->cut_units() method for this item plugin. */
42606+int cut_units_cde(coord_t * coord /* coord of item */ ,
42607+ pos_in_node_t from /* start unit pos */ ,
42608+ pos_in_node_t to /* stop unit pos */ ,
42609+ struct carry_cut_data *cdata UNUSED_ARG,
42610+ reiser4_key * smallest_removed, reiser4_key * new_first)
42611+{
42612+ char *header_from;
42613+ char *header_to;
42614+
42615+ char *entry_from;
42616+ char *entry_to;
42617+
42618+ int size;
42619+ int entry_delta;
42620+ int header_delta;
42621+ int i;
42622+
42623+ unsigned count;
42624+
42625+ CHECKME(coord);
42626+
42627+ count = to - from + 1;
42628+
42629+ assert("nikita-1454", coord != NULL);
42630+ assert("nikita-1455", (int)(from + count) <= units(coord));
42631+
42632+ if (smallest_removed)
42633+ unit_key_by_coord(coord, smallest_removed);
42634+
42635+ if (new_first) {
42636+ coord_t next;
42637+
42638+		/* something, but not everything, is cut from the item head */
42639+ assert("vs-1527", from == 0);
42640+ assert("vs-1528", to < units(coord) - 1);
42641+
42642+ coord_dup(&next, coord);
42643+ next.unit_pos++;
42644+ unit_key_by_coord(&next, new_first);
42645+ }
42646+
42647+ size = item_length_by_coord(coord);
42648+ if (count == (unsigned)units(coord)) {
42649+ return size;
42650+ }
42651+
42652+ header_from = (char *)header_at(coord, (int)from);
42653+ header_to = (char *)header_at(coord, (int)(from + count));
42654+
42655+ entry_from = (char *)entry_at(coord, (int)from);
42656+ entry_to = (char *)entry_at(coord, (int)(from + count));
42657+
42658+ /* move headers */
42659+ memmove(header_from, header_to,
42660+ (unsigned)(address(coord, size) - header_to));
42661+
42662+ header_delta = header_to - header_from;
42663+
42664+ entry_from -= header_delta;
42665+ entry_to -= header_delta;
42666+ size -= header_delta;
42667+
42668+ /* copy entries */
42669+ memmove(entry_from, entry_to,
42670+ (unsigned)(address(coord, size) - entry_to));
42671+
42672+ entry_delta = entry_to - entry_from;
42673+ size -= entry_delta;
42674+
42675+ /* update offsets */
42676+
42677+ for (i = 0; i < (int)from; ++i)
42678+ adj_offset(coord, i, -header_delta);
42679+
42680+ for (i = from; i < units(coord) - (int)count; ++i)
42681+ adj_offset(coord, i, -header_delta - entry_delta);
42682+
42683+ put_unaligned(cpu_to_le16((__u16) units(coord) - count),
42684+ &formatted_at(coord)->num_of_entries);
42685+
42686+ if (from == 0) {
42687+		/* entries were removed from the head - move the rest right */
42688+ memmove((char *)item_body_by_coord(coord) +
42689+ header_delta + entry_delta, item_body_by_coord(coord),
42690+ (unsigned)size);
42691+ if (REISER4_DEBUG)
42692+ memset(item_body_by_coord(coord), 0,
42693+ (unsigned)header_delta + entry_delta);
42694+ } else {
42695+ /* freed space is already at the end of item */
42696+ if (REISER4_DEBUG)
42697+ memset((char *)item_body_by_coord(coord) + size, 0,
42698+ (unsigned)header_delta + entry_delta);
42699+ }
42700+
42701+ return header_delta + entry_delta;
42702+}
42703+
42704+int kill_units_cde(coord_t * coord /* coord of item */ ,
42705+ pos_in_node_t from /* start unit pos */ ,
42706+ pos_in_node_t to /* stop unit pos */ ,
42707+ struct carry_kill_data *kdata UNUSED_ARG,
42708+ reiser4_key * smallest_removed, reiser4_key * new_first)
42709+{
42710+ return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
42711+}
42712+
42713+/* ->s.dir.extract_key() method for this item plugin. */
42714+int extract_key_cde(const coord_t * coord /* coord of item */ ,
42715+ reiser4_key * key /* resulting key */ )
42716+{
42717+ directory_entry_format *dent;
42718+
42719+ assert("nikita-1155", coord != NULL);
42720+ assert("nikita-1156", key != NULL);
42721+
42722+ dent = entry_at(coord, idx_of(coord));
42723+ return extract_key_from_id(&dent->id, key);
42724+}
42725+
42726+int
42727+update_key_cde(const coord_t * coord, const reiser4_key * key,
42728+ lock_handle * lh UNUSED_ARG)
42729+{
42730+ directory_entry_format *dent;
42731+ obj_key_id obj_id;
42732+ int result;
42733+
42734+ assert("nikita-2344", coord != NULL);
42735+ assert("nikita-2345", key != NULL);
42736+
42737+ dent = entry_at(coord, idx_of(coord));
42738+ result = build_obj_key_id(key, &obj_id);
42739+ if (result == 0) {
42740+ dent->id = obj_id;
42741+ znode_make_dirty(coord->node);
42742+ }
42743+ return 0;
42744+}
42745+
42746+/* ->s.dir.extract_name() method for this item plugin. */
42747+char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
42748+{
42749+ directory_entry_format *dent;
42750+
42751+ assert("nikita-1157", coord != NULL);
42752+
42753+ dent = entry_at(coord, idx_of(coord));
42754+ return extract_dent_name(coord, dent, buf);
42755+}
42756+
42757+static int cde_bytes(int pasting, const reiser4_item_data * data)
42758+{
42759+ int result;
42760+
42761+ result = data->length;
42762+ if (!pasting)
42763+ result -= sizeof(cde_item_format);
42764+ return result;
42765+}
42766+
42767+/* ->s.dir.add_entry() method for this item plugin */
42768+int add_entry_cde(struct inode *dir /* directory object */ ,
42769+ coord_t * coord /* coord of item */ ,
42770+ lock_handle * lh /* lock handle for insertion */ ,
42771+ const struct dentry *name /* name to insert */ ,
42772+ reiser4_dir_entry_desc * dir_entry /* parameters of new
42773+ * directory entry */ )
42774+{
42775+ reiser4_item_data data;
42776+ struct cde_entry entry;
42777+ struct cde_entry_data edata;
42778+ int result;
42779+
42780+ assert("nikita-1656", coord->node == lh->node);
42781+ assert("nikita-1657", znode_is_write_locked(coord->node));
42782+
42783+ edata.num_of_entries = 1;
42784+ edata.entry = &entry;
42785+
42786+ entry.dir = dir;
42787+ entry.obj = dir_entry->obj;
42788+ entry.name = &name->d_name;
42789+
42790+ data.data = (char *)&edata;
42791+ data.user = 0; /* &edata is not user space */
42792+ data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
42793+ data.arg = dir_entry;
42794+ assert("nikita-1302", data.iplug != NULL);
42795+
42796+ result = is_dot_key(&dir_entry->key);
42797+ data.length = estimate_cde(result ? coord : NULL, &data);
42798+
42799+ /* NOTE-NIKITA quota plugin? */
42800+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
42801+ return RETERR(-EDQUOT);
42802+
42803+ if (result)
42804+ result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
42805+ else
42806+ result = reiser4_resize_item(coord, &data, &dir_entry->key,
42807+ lh, 0);
42808+ return result;
42809+}
42810+
42811+/* ->s.dir.rem_entry() */
42812+int rem_entry_cde(struct inode *dir /* directory of item */ ,
42813+ const struct qstr *name, coord_t * coord /* coord of item */ ,
42814+ lock_handle * lh UNUSED_ARG /* lock handle for
42815+ * removal */ ,
42816+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
42817+ * directory entry
42818+ * being removed */ )
42819+{
42820+ coord_t shadow;
42821+ int result;
42822+ int length;
42823+ ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
42824+
42825+ assert("nikita-2870", strlen(name->name) == name->len);
42826+ assert("nikita-2869",
42827+ !strcmp(name->name, extract_name_cde(coord, buf)));
42828+
42829+ length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
42830+ if (is_longname(name->name, name->len))
42831+ length += name->len + 1;
42832+
42833+ if (inode_get_bytes(dir) < length) {
42834+ warning("nikita-2628", "Dir is broke: %llu: %llu",
42835+ (unsigned long long)get_inode_oid(dir),
42836+ inode_get_bytes(dir));
42837+
42838+ return RETERR(-EIO);
42839+ }
42840+
42841+	/* cut_node() is supposed to take pointers to _different_
42842+	   coords, because it will modify them without regard for
42843+	   possible aliasing. To work around this, create a temporary
42844+	   copy of @coord.
42845+	 */
42846+ coord_dup(&shadow, coord);
42847+ result =
42848+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
42849+ if (result == 0) {
42850+ /* NOTE-NIKITA quota plugin? */
42851+ DQUOT_FREE_SPACE_NODIRTY(dir, length);
42852+ }
42853+ return result;
42854+}
42855+
42856+/* ->s.dir.max_name_len() method for this item plugin */
42857+int max_name_len_cde(const struct inode *dir /* directory */ )
42858+{
42859+ return
42860+ reiser4_tree_by_inode(dir)->nplug->max_item_size() -
42861+ sizeof(directory_entry_format) - sizeof(cde_item_format) -
42862+ sizeof(cde_unit_header) - 2;
42863+}
42864+
42865+/* Make Linus happy.
42866+ Local variables:
42867+ c-indentation-style: "K&R"
42868+ mode-name: "LC"
42869+ c-basic-offset: 8
42870+ tab-width: 8
42871+ fill-column: 120
42872+ End:
42873+*/
42874diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/cde.h linux-2.6.23/fs/reiser4/plugin/item/cde.h
42875--- linux-2.6.23.orig/fs/reiser4/plugin/item/cde.h 1970-01-01 03:00:00.000000000 +0300
42876+++ linux-2.6.23/fs/reiser4/plugin/item/cde.h 2007-12-04 16:49:30.000000000 +0300
42877@@ -0,0 +1,87 @@
42878+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
42879+
42880+/* Compound directory item. See cde.c for description. */
42881+
42882+#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
42883+#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
42884+
42885+#include "../../forward.h"
42886+#include "../../kassign.h"
42887+#include "../../dformat.h"
42888+
42889+#include <linux/fs.h> /* for struct inode */
42890+#include <linux/dcache.h> /* for struct dentry, etc */
42891+
42892+typedef struct cde_unit_header {
42893+ de_id hash;
42894+ d16 offset;
42895+} cde_unit_header;
42896+
42897+typedef struct cde_item_format {
42898+ d16 num_of_entries;
42899+ cde_unit_header entry[0];
42900+} cde_item_format;
42901+
42902+struct cde_entry {
42903+ const struct inode *dir;
42904+ const struct inode *obj;
42905+ const struct qstr *name;
42906+};
42907+
42908+struct cde_entry_data {
42909+ int num_of_entries;
42910+ struct cde_entry *entry;
42911+};
42912+
42913+/* plugin->item.b.* */
42914+reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
42915+int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
42916+ const reiser4_item_data *);
42917+int mergeable_cde(const coord_t * p1, const coord_t * p2);
42918+pos_in_node_t nr_units_cde(const coord_t * coord);
42919+reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
42920+int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
42921+void print_cde(const char *prefix, coord_t * coord);
42922+int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
42923+lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
42924+ coord_t * coord);
42925+int paste_cde(coord_t * coord, reiser4_item_data * data,
42926+ carry_plugin_info * info UNUSED_ARG);
42927+int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
42928+ shift_direction pend, unsigned *size, unsigned want);
42929+void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
42930+ unsigned count, shift_direction where_is_free_space,
42931+ unsigned free_space);
42932+int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
42933+ struct carry_cut_data *, reiser4_key * smallest_removed,
42934+ reiser4_key * new_first);
42935+int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
42936+ struct carry_kill_data *, reiser4_key * smallest_removed,
42937+ reiser4_key * new_first);
42939+int reiser4_check_cde(const coord_t * coord, const char **error);
42940+
42941+/* plugin->u.item.s.dir.* */
42942+int extract_key_cde(const coord_t * coord, reiser4_key * key);
42943+int update_key_cde(const coord_t * coord, const reiser4_key * key,
42944+ lock_handle * lh);
42945+char *extract_name_cde(const coord_t * coord, char *buf);
42946+int add_entry_cde(struct inode *dir, coord_t * coord,
42947+ lock_handle * lh, const struct dentry *name,
42948+ reiser4_dir_entry_desc * entry);
42949+int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
42950+ lock_handle * lh, reiser4_dir_entry_desc * entry);
42951+int max_name_len_cde(const struct inode *dir);
42952+
42953+/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
42954+#endif
42955+
42956+/* Make Linus happy.
42957+ Local variables:
42958+ c-indentation-style: "K&R"
42959+ mode-name: "LC"
42960+ c-basic-offset: 8
42961+ tab-width: 8
42962+ fill-column: 120
42963+ End:
42964+*/
42965diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/ctail.c linux-2.6.23/fs/reiser4/plugin/item/ctail.c
42966--- linux-2.6.23.orig/fs/reiser4/plugin/item/ctail.c 1970-01-01 03:00:00.000000000 +0300
42967+++ linux-2.6.23/fs/reiser4/plugin/item/ctail.c 2007-12-04 23:04:00.730306034 +0300
42968@@ -0,0 +1,1615 @@
42969+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
42970+
42971+/* ctails (aka "clustered tails") are items for cryptcompress objects */
42972+
42973+/* DESCRIPTION:
42974+
42975+Each cryptcompress object is stored on disk as a set of clusters sliced
42976+into ctails.
42977+
42978+Internal on-disk structure:
42979+
42980+ HEADER (1) Here the disk cluster shift is stored
42981+ BODY
42982+*/
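
All of the cluster geometry below derives from that one stored shift. A runnable sketch of the arithmetic that disk_cluster_size(), clust_by_coord() and is_disk_cluster_key() implement (toy values; in the real code the shift comes from the ctail header or from the inode):

	#include <stdio.h>

	int main(void)
	{
		int shift = 4;			/* cluster_shift_by_coord() */
		long long csize = 1LL << shift;	/* disk_cluster_size() */
		long long off = 96;		/* key offset of an item */

		/* clust_by_coord(): which cluster the offset falls into */
		printf("cluster index: %lld\n", off >> shift);
		/* is_disk_cluster_key(): offset aligned to cluster start? */
		printf("first item: %s\n",
		       (off & (csize - 1)) == 0 ? "yes" : "no");
		return 0;
	}
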
42983+
42984+#include "../../forward.h"
42985+#include "../../debug.h"
42986+#include "../../dformat.h"
42987+#include "../../kassign.h"
42988+#include "../../key.h"
42989+#include "../../coord.h"
42990+#include "item.h"
42991+#include "../node/node.h"
42992+#include "../plugin.h"
42993+#include "../object.h"
42994+#include "../../znode.h"
42995+#include "../../carry.h"
42996+#include "../../tree.h"
42997+#include "../../inode.h"
42998+#include "../../super.h"
42999+#include "../../context.h"
43000+#include "../../page_cache.h"
43001+#include "../cluster.h"
43002+#include "../../flush.h"
43003+#include "../../tree_walk.h"
43004+
43005+#include <linux/pagevec.h>
43006+#include <linux/swap.h>
43007+#include <linux/fs.h>
43008+
43009+/* return body of ctail item at @coord */
43010+static ctail_item_format *ctail_formatted_at(const coord_t * coord)
43011+{
43012+ assert("edward-60", coord != NULL);
43013+ return item_body_by_coord(coord);
43014+}
43015+
43016+static int cluster_shift_by_coord(const coord_t * coord)
43017+{
43018+ return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
43019+}
43020+
43021+static inline void dclust_set_extension_shift(hint_t * hint)
43022+{
43023+ assert("edward-1270",
43024+ item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
43025+ hint->ext_coord.extension.ctail.shift =
43026+ cluster_shift_by_coord(&hint->ext_coord.coord);
43027+}
43028+
43029+static loff_t off_by_coord(const coord_t * coord)
43030+{
43031+ reiser4_key key;
43032+ return get_key_offset(item_key_by_coord(coord, &key));
43033+}
43034+
43035+int coord_is_unprepped_ctail(const coord_t * coord)
43036+{
43037+ assert("edward-1233", coord != NULL);
43038+ assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
43039+ assert("edward-1235",
43040+ ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
43041+ nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
43042+
43043+ return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
43044+}
43045+
43046+static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
43047+{
43048+ int shift;
43049+
43050+ if (inode != NULL) {
43051+ shift = inode_cluster_shift(inode);
43052+ assert("edward-1236",
43053+ ergo(!coord_is_unprepped_ctail(coord),
43054+ shift == cluster_shift_by_coord(coord)));
43055+ } else {
43056+ assert("edward-1237", !coord_is_unprepped_ctail(coord));
43057+ shift = cluster_shift_by_coord(coord);
43058+ }
43059+ return off_by_coord(coord) >> shift;
43060+}
43061+
43062+static int disk_cluster_size(const coord_t * coord)
43063+{
43064+ assert("edward-1156",
43065+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
43066+	/* calculation of the disk cluster size
43067+	   is meaningless if the ctail is unprepped */
43068+ assert("edward-1238", !coord_is_unprepped_ctail(coord));
43069+
43070+ return 1 << cluster_shift_by_coord(coord);
43071+}
43072+
43073+/* true if the key is of first disk cluster item */
43074+static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
43075+{
43076+ assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
43077+
43078+ return coord_is_unprepped_ctail(coord) ||
43079+ ((get_key_offset(key) &
43080+ ((loff_t) disk_cluster_size(coord) - 1)) == 0);
43081+}
43082+
43083+static char *first_unit(coord_t * coord)
43084+{
43085+ /* FIXME: warning: pointer of type `void *' used in arithmetic */
43086+ return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
43087+}
43088+
43089+/* plugin->u.item.b.max_key_inside :
43090+ tail_max_key_inside */
43091+
43092+/* plugin->u.item.b.can_contain_key */
43093+int
43094+can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
43095+ const reiser4_item_data * data)
43096+{
43097+ reiser4_key item_key;
43098+
43099+ if (item_plugin_by_coord(coord) != data->iplug)
43100+ return 0;
43101+
43102+ item_key_by_coord(coord, &item_key);
43103+ if (get_key_locality(key) != get_key_locality(&item_key) ||
43104+ get_key_objectid(key) != get_key_objectid(&item_key))
43105+ return 0;
43106+ if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
43107+ get_key_offset(key))
43108+ return 0;
43109+ if (is_disk_cluster_key(key, coord))
43110+ return 0;
43111+ return 1;
43112+}
43113+
43114+/* plugin->u.item.b.mergeable */
43115+int mergeable_ctail(const coord_t * p1, const coord_t * p2)
43116+{
43117+ reiser4_key key1, key2;
43118+
43119+ assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
43120+ assert("edward-61", plugin_of_group(item_plugin_by_coord(p1),
43121+ UNIX_FILE_METADATA_ITEM_TYPE));
43122+
43123+ if (item_id_by_coord(p2) != CTAIL_ID) {
43124+ /* second item is of another type */
43125+ return 0;
43126+ }
43127+
43128+ item_key_by_coord(p1, &key1);
43129+ item_key_by_coord(p2, &key2);
43130+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
43131+ get_key_objectid(&key1) != get_key_objectid(&key2) ||
43132+ get_key_type(&key1) != get_key_type(&key2)) {
43133+ /* items of different objects */
43134+ return 0;
43135+ }
43136+ if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
43137+ /* not adjacent items */
43138+ return 0;
43139+ if (is_disk_cluster_key(&key2, p2))
43140+ return 0;
43141+ return 1;
43142+}
43143+
43144+/* plugin->u.item.b.nr_units */
43145+pos_in_node_t nr_units_ctail(const coord_t * coord)
43146+{
43147+ return (item_length_by_coord(coord) -
43148+ sizeof(ctail_formatted_at(coord)->cluster_shift));
43149+}
43150+
43151+/* plugin->u.item.b.estimate:
43152+ estimate how much space is needed to insert/paste @data->length bytes
43153+ into ctail at @coord */
43154+int estimate_ctail(const coord_t * coord /* coord of item */ ,
43155+ const reiser4_item_data *
43156+ data /* parameters for new item */ )
43157+{
43158+ if (coord == NULL)
43159+ /* insert */
43160+ return (sizeof(ctail_item_format) + data->length);
43161+ else
43162+ /* paste */
43163+ return data->length;
43164+}
43165+
43166+/* ->init() method for this item plugin. */
43167+int init_ctail(coord_t * to /* coord of item */ ,
43168+ coord_t * from /* old_item */ ,
43169+ reiser4_item_data * data /* structure used for insertion */ )
43170+{
43171+ int cluster_shift; /* cpu value to convert */
43172+
43173+ if (data) {
43174+ assert("edward-463", data->length > sizeof(ctail_item_format));
43175+ cluster_shift = *((int *)(data->arg));
43176+ data->length -= sizeof(ctail_item_format);
43177+ } else {
43178+ assert("edward-464", from != NULL);
43179+ assert("edward-855", ctail_ok(from));
43180+ cluster_shift = (int)(cluster_shift_by_coord(from));
43181+ }
43182+ put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
43183+ assert("edward-856", ctail_ok(to));
43184+ return 0;
43185+}
43186+
43187+/* plugin->u.item.b.lookup:
43188+ NULL: We are looking for item keys only */
43189+
43190+#if REISER4_DEBUG
43191+int ctail_ok(const coord_t * coord)
43192+{
43193+ return coord_is_unprepped_ctail(coord) ||
43194+ cluster_shift_ok(cluster_shift_by_coord(coord));
43195+}
43196+
43197+/* plugin->u.item.b.check */
43198+int check_ctail(const coord_t * coord, const char **error)
43199+{
43200+ if (!ctail_ok(coord)) {
43201+ if (error)
43202+ *error = "bad cluster shift in ctail";
43203+ return 1;
43204+ }
43205+ return 0;
43206+}
43207+#endif
43208+
43209+/* plugin->u.item.b.paste */
43210+int
43211+paste_ctail(coord_t * coord, reiser4_item_data * data,
43212+ carry_plugin_info * info UNUSED_ARG)
43213+{
43214+ unsigned old_nr_units;
43215+
43216+ assert("edward-268", data->data != NULL);
43217+ /* copy only from kernel space */
43218+ assert("edward-66", data->user == 0);
43219+
43220+ old_nr_units =
43221+ item_length_by_coord(coord) - sizeof(ctail_item_format) -
43222+ data->length;
43223+
43224+ /* ctail items never get pasted in the middle */
43225+
43226+ if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
43227+
43228+		/* paste at the beginning when creating a new item */
43229+ assert("edward-450",
43230+ item_length_by_coord(coord) ==
43231+ data->length + sizeof(ctail_item_format));
43232+ assert("edward-451", old_nr_units == 0);
43233+ } else if (coord->unit_pos == old_nr_units - 1
43234+ && coord->between == AFTER_UNIT) {
43235+
43236+ /* paste at the end */
43237+ coord->unit_pos++;
43238+ } else
43239+ impossible("edward-453", "bad paste position");
43240+
43241+ memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
43242+
43243+ assert("edward-857", ctail_ok(coord));
43244+
43245+ return 0;
43246+}
43247+
43248+/* plugin->u.item.b.fast_paste */
43249+
43250+/* plugin->u.item.b.can_shift
43251+ number of units is returned via return value, number of bytes via @size. For
43252+ ctail items they coincide */
43253+int
43254+can_shift_ctail(unsigned free_space, coord_t * source,
43255+ znode * target, shift_direction direction UNUSED_ARG,
43256+ unsigned *size /* number of bytes */ , unsigned want)
43257+{
43258+	/* make sure that we do not want to shift more than we have */
43259+ assert("edward-68", want > 0 && want <= nr_units_ctail(source));
43260+
43261+ *size = min(want, free_space);
43262+
43263+ if (!target) {
43264+ /* new item will be created */
43265+ if (*size <= sizeof(ctail_item_format)) {
43266+ *size = 0;
43267+ return 0;
43268+ }
43269+ return *size - sizeof(ctail_item_format);
43270+ }
43271+ return *size;
43272+}
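
Note the asymmetry hidden in the return value above: for ctails a unit is one byte, so units and bytes normally coincide, but when the shift would create a brand-new item in @target the ctail header must come out of the same byte budget. For example, with want == 10 and free_space >= 10, *size is 10, yet only 10 - sizeof(ctail_item_format) units actually move into the new item; and if *size does not exceed sizeof(ctail_item_format), nothing is shifted at all.
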
43273+
43274+/* plugin->u.item.b.copy_units
43275+ cooperates with ->can_shift() */
43276+void
43277+copy_units_ctail(coord_t * target, coord_t * source,
43278+ unsigned from, unsigned count /* units */ ,
43279+ shift_direction where_is_free_space,
43280+ unsigned free_space /* bytes */ )
43281+{
43282+ /* make sure that item @target is expanded already */
43283+ assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
43284+ assert("edward-70", free_space == count || free_space == count + 1);
43285+
43286+ assert("edward-858", ctail_ok(source));
43287+
43288+ if (where_is_free_space == SHIFT_LEFT) {
43289+ /* append item @target with @count first bytes of @source:
43290+ this restriction came from ordinary tails */
43291+ assert("edward-71", from == 0);
43292+ assert("edward-860", ctail_ok(target));
43293+
43294+ memcpy(first_unit(target) + nr_units_ctail(target) - count,
43295+ first_unit(source), count);
43296+ } else {
43297+		/* target item has already been moved to the right */
43298+ reiser4_key key;
43299+
43300+ assert("edward-72", nr_units_ctail(source) == from + count);
43301+
43302+ if (free_space == count) {
43303+ init_ctail(target, source, NULL);
43304+ } else {
43305+ /* new item has been created */
43306+ assert("edward-862", ctail_ok(target));
43307+ }
43308+ memcpy(first_unit(target), first_unit(source) + from, count);
43309+
43310+ assert("edward-863", ctail_ok(target));
43311+
43312+		/* new units are inserted before the first unit of the item;
43313+		   therefore, we have to update the item key */
43314+ item_key_by_coord(source, &key);
43315+ set_key_offset(&key, get_key_offset(&key) + from);
43316+
43317+ node_plugin_by_node(target->node)->update_item_key(target, &key,
43318+ NULL /*info */);
43319+ }
43320+}
43321+
43322+/* plugin->u.item.b.create_hook */
43323+int create_hook_ctail(const coord_t * coord, void *arg)
43324+{
43325+ assert("edward-864", znode_is_loaded(coord->node));
43326+
43327+ znode_set_convertible(coord->node);
43328+ return 0;
43329+}
43330+
43331+/* plugin->u.item.b.kill_hook */
43332+int kill_hook_ctail(const coord_t * coord, pos_in_node_t from,
43333+ pos_in_node_t count, carry_kill_data * kdata)
43334+{
43335+ struct inode *inode;
43336+
43337+ assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
43338+ assert("edward-291", znode_is_write_locked(coord->node));
43339+
43340+ inode = kdata->inode;
43341+ if (inode) {
43342+ reiser4_key key;
43343+ struct cryptcompress_info * info;
43344+ cloff_t index;
43345+
43346+ item_key_by_coord(coord, &key);
43347+ info = cryptcompress_inode_data(inode);
43348+ index = off_to_clust(get_key_offset(&key), inode);
43349+
43350+ if (from == 0) {
43351+ info->trunc_index = index;
43352+ if (is_disk_cluster_key(&key, coord)) {
43353+ /*
43354+ * first item of disk cluster is to be killed
43355+ */
43356+ truncate_complete_page_cluster(
43357+ inode, index, kdata->params.truncate);
43358+ inode_sub_bytes(inode,
43359+ inode_cluster_size(inode));
43360+ }
43361+ }
43362+ }
43363+ return 0;
43364+}
43365+
43366+/* for shift_hook_ctail():
43367+ return true if the first disk cluster item has a dirty child
43368+*/
43369+static int ctail_convertible(const coord_t * coord)
43370+{
43371+ int result;
43372+ reiser4_key key;
43373+ jnode *child = NULL;
43374+
43375+ assert("edward-477", coord != NULL);
43376+ assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
43377+
43378+ if (coord_is_unprepped_ctail(coord))
43379+ /* unprepped ctail should be converted */
43380+ return 1;
43381+
43382+ item_key_by_coord(coord, &key);
43383+ child = jlookup(current_tree,
43384+ get_key_objectid(&key),
43385+ off_to_pg(off_by_coord(coord)));
43386+ if (!child)
43387+ return 0;
43388+ result = JF_ISSET(child, JNODE_DIRTY);
43389+ jput(child);
43390+ return result;
43391+}
43392+
43393+/* FIXME-EDWARD */
43394+/* plugin->u.item.b.shift_hook */
43395+int shift_hook_ctail(const coord_t * item /* coord of item */ ,
43396+ unsigned from UNUSED_ARG /* start unit */ ,
43397+ unsigned count UNUSED_ARG /* stop unit */ ,
43398+ znode * old_node /* old parent */ )
43399+{
43400+ assert("edward-479", item != NULL);
43401+ assert("edward-480", item->node != old_node);
43402+
43403+ if (!znode_convertible(old_node) || znode_convertible(item->node))
43404+ return 0;
43405+ if (ctail_convertible(item))
43406+ znode_set_convertible(item->node);
43407+ return 0;
43408+}
43409+
43410+static int
43411+cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
43412+ int cut, void *p, reiser4_key * smallest_removed,
43413+ reiser4_key * new_first)
43414+{
43415+ pos_in_node_t count; /* number of units to cut */
43416+ char *item;
43417+
43418+ count = to - from + 1;
43419+ item = item_body_by_coord(coord);
43420+
43421+ assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
43422+
43423+ if (smallest_removed) {
43424+ /* store smallest key removed */
43425+ item_key_by_coord(coord, smallest_removed);
43426+ set_key_offset(smallest_removed,
43427+ get_key_offset(smallest_removed) + from);
43428+ }
43429+
43430+ if (new_first) {
43431+ assert("vs-1531", from == 0);
43432+
43433+ item_key_by_coord(coord, new_first);
43434+ set_key_offset(new_first,
43435+ get_key_offset(new_first) + from + count);
43436+ }
43437+
43438+ if (!cut)
43439+ kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
43440+
43441+ if (from == 0) {
43442+ if (count != nr_units_ctail(coord)) {
43443+			/* part of the item is removed, so move the free space
43444+			   to the beginning of the item and update the item key */
43445+ reiser4_key key;
43446+ memcpy(item + to + 1, item, sizeof(ctail_item_format));
43447+ item_key_by_coord(coord, &key);
43448+ set_key_offset(&key, get_key_offset(&key) + count);
43449+ node_plugin_by_node(coord->node)->update_item_key(coord,
43450+ &key,
43451+ NULL);
43452+ } else {
43453+			/* cut_units should not be called to cut everything */
43454+			assert("vs-1532", ergo(cut, 0));
43455+			/* the whole item is cut, so more than the amount of
43456+			   space occupied by units gets freed */
43457+ count += sizeof(ctail_item_format);
43458+ }
43459+ if (REISER4_DEBUG)
43460+ memset(item, 0, count);
43461+ } else if (REISER4_DEBUG)
43462+ memset(item + sizeof(ctail_item_format) + from, 0, count);
43463+ return count;
43464+}
43465+
43466+/* plugin->u.item.b.cut_units */
43467+int
43468+cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
43469+ carry_cut_data * cdata, reiser4_key * smallest_removed,
43470+ reiser4_key * new_first)
43471+{
43472+ return cut_or_kill_ctail_units(item, from, to, 1, NULL,
43473+ smallest_removed, new_first);
43474+}
43475+
43476+/* plugin->u.item.b.kill_units */
43477+int
43478+kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
43479+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
43480+ reiser4_key * new_first)
43481+{
43482+ return cut_or_kill_ctail_units(item, from, to, 0, kdata,
43483+ smallest_removed, new_first);
43484+}
43485+
43486+/* plugin->u.item.s.file.read */
43487+int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
43488+{
43489+ uf_coord_t *uf_coord;
43490+ coord_t *coord;
43491+
43492+ uf_coord = &hint->ext_coord;
43493+ coord = &uf_coord->coord;
43494+ assert("edward-127", f->user == 0);
43495+ assert("edward-129", coord && coord->node);
43496+ assert("edward-130", coord_is_existing_unit(coord));
43497+ assert("edward-132", znode_is_loaded(coord->node));
43498+
43499+	/* reads may start only at the beginning of a ctail */
43500+ assert("edward-133", coord->unit_pos == 0);
43501+ /* read only whole ctails */
43502+ assert("edward-135", nr_units_ctail(coord) <= f->length);
43503+
43504+ assert("edward-136", reiser4_schedulable());
43505+ assert("edward-886", ctail_ok(coord));
43506+
43507+ if (f->data)
43508+ memcpy(f->data, (char *)first_unit(coord),
43509+ (size_t) nr_units_ctail(coord));
43510+
43511+ dclust_set_extension_shift(hint);
43512+ mark_page_accessed(znode_page(coord->node));
43513+ move_flow_forward(f, nr_units_ctail(coord));
43514+
43515+ return 0;
43516+}
43517+
43518+/**
43519+ * Prepare transform stream with plain text for page
43520+ * @page taking into account synchronization issues.
43521+ */
43522+static int ctail_read_disk_cluster(struct cluster_handle * clust,
43523+ struct inode * inode, struct page * page,
43524+ znode_lock_mode mode)
43525+{
43526+ int result;
43527+
43528+	assert("edward-1450", mode == ZNODE_READ_LOCK || mode == ZNODE_WRITE_LOCK);
43529+ assert("edward-671", clust->hint != NULL);
43530+ assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
43531+ assert("edward-672", cryptcompress_inode_ok(inode));
43532+ assert("edward-1527", PageLocked(page));
43533+
43534+ unlock_page(page);
43535+
43536+ /* set input stream */
43537+ result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
43538+ if (result) {
43539+ lock_page(page);
43540+ return result;
43541+ }
43542+ result = find_disk_cluster(clust, inode, 1 /* read items */, mode);
43543+ lock_page(page);
43544+ if (result)
43545+ return result;
43546+ /*
43547+ * at this point we have locked position in the tree
43548+ */
43549+ assert("edward-1528", znode_is_any_locked(clust->hint->lh.node));
43550+
43551+ if (page->mapping != inode->i_mapping) {
43552+ /* page was truncated */
43553+ reiser4_unset_hint(clust->hint);
43554+ reset_cluster_params(clust);
43555+ return AOP_TRUNCATED_PAGE;
43556+ }
43557+ if (PageUptodate(page)) {
43558+ /* disk cluster can be obsolete, don't use it! */
43559+ reiser4_unset_hint(clust->hint);
43560+ reset_cluster_params(clust);
43561+ return 0;
43562+ }
43563+ if (clust->dstat == FAKE_DISK_CLUSTER ||
43564+ clust->dstat == UNPR_DISK_CLUSTER ||
43565+ clust->dstat == TRNC_DISK_CLUSTER) {
43566+ /*
43567+ * this information about disk cluster will be valid
43568+ * as long as we keep the position in the tree locked
43569+ */
43570+ tfm_cluster_set_uptodate(&clust->tc);
43571+ return 0;
43572+ }
43573+ /* now prepare output stream.. */
43574+ result = grab_coa(&clust->tc, inode_compression_plugin(inode));
43575+ if (result)
43576+ return result;
43577+ /* ..and fill this with plain text */
43578+ result = reiser4_inflate_cluster(clust, inode);
43579+ if (result)
43580+ return result;
43581+ /*
43582+ * The stream is ready! It won't be obsolete as
43583+ * long as we keep last disk cluster item locked.
43584+ */
43585+ tfm_cluster_set_uptodate(&clust->tc);
43586+ return 0;
43587+}
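
The unlock/relock dance above follows a common pattern: a page lock cannot be held across blocking tree operations, so the page is unlocked, the lookup runs, and only after relocking is the state revalidated (the mapping may have changed under a concurrent truncate, or another reader may have filled the page). A loose userspace analogue, with hypothetical stub names that are not part of this patch:

#include <stdbool.h>

struct pg_sk { bool uptodate, truncated; };

static void lock_pg(struct pg_sk *p)   { (void)p; /* acquire page lock */ }
static void unlock_pg(struct pg_sk *p) { (void)p; /* release page lock */ }
static int  blocking_lookup(void)      { return 0; /* may sleep */ }

/* sketch: drop the page lock around a blocking lookup, then revalidate */
static int read_cluster_sk(struct pg_sk *p)
{
	unlock_pg(p);			/* cannot sleep with the page locked */
	int err = blocking_lookup();	/* long-term tree lookup */
	lock_pg(p);
	if (err)
		return err;
	if (p->truncated)		/* mapping changed: caller retries */
		return -1;		/* plays the role of AOP_TRUNCATED_PAGE */
	if (p->uptodate)		/* someone else filled the page */
		return 0;
	/* ...otherwise proceed to inflate the disk cluster... */
	return 0;
}
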
43588+
43589+/*
43590+ * fill one page with plain text.
43591+ */
43592+int do_readpage_ctail(struct inode * inode, struct cluster_handle * clust,
43593+ struct page *page, znode_lock_mode mode)
43594+{
43595+ int ret;
43596+ unsigned cloff;
43597+ char *data;
43598+ size_t to_page;
43599+ struct tfm_cluster * tc = &clust->tc;
43600+
43601+ assert("edward-212", PageLocked(page));
43602+
43603+ if (unlikely(page->mapping != inode->i_mapping))
43604+ return AOP_TRUNCATED_PAGE;
43605+ if (PageUptodate(page))
43606+ goto exit;
43607+ to_page = pbytes(page_index(page), inode);
43608+ if (to_page == 0) {
43609+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
43610+ SetPageUptodate(page);
43611+ goto exit;
43612+ }
43613+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
43614+ clust->index = pg_to_clust(page->index, inode);
43615+
43616+ /* this will unlock/lock the page */
43617+ ret = ctail_read_disk_cluster(clust, inode, page, mode);
43618+
43619+ assert("edward-212", PageLocked(page));
43620+ if (ret)
43621+ return ret;
43622+
43623+ /* refresh bytes */
43624+ to_page = pbytes(page_index(page), inode);
43625+ if (to_page == 0) {
43626+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
43627+ SetPageUptodate(page);
43628+ goto exit;
43629+ }
43630+ }
43631+ if (PageUptodate(page))
43632+		/* somebody else filled it already */
43633+ goto exit;
43634+
43635+ assert("edward-119", tfm_cluster_is_uptodate(tc));
43636+ assert("edward-1529", znode_is_any_locked(clust->hint->lh.node));
43637+
43638+ switch (clust->dstat) {
43639+ case UNPR_DISK_CLUSTER:
43640+ BUG_ON(1);
43641+ case TRNC_DISK_CLUSTER:
43642+ /*
43643+ * Race with truncate!
43644+		 * We resolve it in favour of truncate (the only way,
43645+		 * as the plain text is unrecoverable in this case)
43646+ */
43647+ case FAKE_DISK_CLUSTER:
43648+ /* fill the page by zeroes */
43649+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
43650+ SetPageUptodate(page);
43651+ break;
43652+ case PREP_DISK_CLUSTER:
43653+		/* fill the page from the transform stream with plain text */
43654+		assert("edward-1058", !PageUptodate(page));
43655+		assert("edward-120", tc->len <= inode_cluster_size(inode));
43656+
43657+		/* byte offset of this page in the logical cluster */
43658+ cloff = pg_to_off_to_cloff(page->index, inode);
43659+
43660+ data = kmap(page);
43661+ memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, to_page);
43662+ memset(data + to_page, 0, (size_t) PAGE_CACHE_SIZE - to_page);
43663+ flush_dcache_page(page);
43664+ kunmap(page);
43665+ SetPageUptodate(page);
43666+ break;
43667+ default:
43668+ impossible("edward-1169", "bad disk cluster state");
43669+ }
43670+ exit:
43671+ return 0;
43672+}
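
For the PREP_DISK_CLUSTER branch the page fill reduces to two library calls: copy this page's slice of the inflated stream, then zero the tail past the valid bytes. A minimal userspace sketch of just that arithmetic (buffer names and the 4096-byte page size are illustrative assumptions):

#include <string.h>

#define PAGE_SZ_SK 4096	/* stands in for PAGE_CACHE_SIZE */

/* sketch: fill one page from the inflated cluster stream */
static void fill_page_sk(char *page, const char *stream,
			 size_t cloff,	 /* page's byte offset in cluster */
			 size_t to_page) /* valid bytes for this page */
{
	memcpy(page, stream + cloff, to_page);
	memset(page + to_page, 0, PAGE_SZ_SK - to_page);
}
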
43673+
43674+/* plugin->u.item.s.file.readpage */
43675+int readpage_ctail(void *vp, struct page *page)
43676+{
43677+ int result;
43678+ hint_t * hint;
43679+ struct cluster_handle * clust = vp;
43680+
43681+ assert("edward-114", clust != NULL);
43682+ assert("edward-115", PageLocked(page));
43683+ assert("edward-116", !PageUptodate(page));
43684+ assert("edward-118", page->mapping && page->mapping->host);
43685+ assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
43686+
43687+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
43688+ if (hint == NULL) {
43689+ unlock_page(page);
43690+ return RETERR(-ENOMEM);
43691+ }
43692+ clust->hint = hint;
43693+ result = load_file_hint(clust->file, hint);
43694+ if (result) {
43695+ kfree(hint);
43696+ unlock_page(page);
43697+ return result;
43698+ }
43699+ assert("vs-25", hint->ext_coord.lh == &hint->lh);
43700+
43701+ result = do_readpage_ctail(page->mapping->host, clust, page,
43702+ ZNODE_READ_LOCK);
43703+ assert("edward-213", PageLocked(page));
43704+ assert("edward-1163", ergo(!result, PageUptodate(page)));
43705+
43706+ unlock_page(page);
43707+ done_lh(&hint->lh);
43708+ hint->ext_coord.valid = 0;
43709+ save_file_hint(clust->file, hint);
43710+ kfree(hint);
43711+ tfm_cluster_clr_uptodate(&clust->tc);
43712+
43713+ return result;
43714+}
43715+
43716+/* Helper function for ->readpages() */
43717+static int ctail_read_page_cluster(struct cluster_handle * clust,
43718+ struct inode *inode)
43719+{
43720+ int i;
43721+ int result;
43722+ assert("edward-779", clust != NULL);
43723+ assert("edward-1059", clust->win == NULL);
43724+ assert("edward-780", inode != NULL);
43725+
43726+ result = prepare_page_cluster(inode, clust, READ_OP);
43727+ if (result)
43728+ return result;
43729+
43730+ assert("edward-781", !tfm_cluster_is_uptodate(&clust->tc));
43731+
43732+ for (i = 0; i < clust->nr_pages; i++) {
43733+ struct page *page = clust->pages[i];
43734+ lock_page(page);
43735+ result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
43736+ unlock_page(page);
43737+ if (result)
43738+ break;
43739+ }
43740+ tfm_cluster_clr_uptodate(&clust->tc);
43741+ put_page_cluster(clust, inode, READ_OP);
43742+ return result;
43743+}
43744+
43745+/* filler for read_cache_pages() */
43746+static int ctail_readpages_filler(void * data, struct page * page)
43747+{
43748+ int ret = 0;
43749+ struct cluster_handle * clust = data;
43750+ struct inode * inode = clust->file->f_dentry->d_inode;
43751+
43752+ assert("edward-1525", page->mapping == inode->i_mapping);
43753+
43754+ if (PageUptodate(page)) {
43755+ unlock_page(page);
43756+ return 0;
43757+ }
43758+ if (pbytes(page_index(page), inode) == 0) {
43759+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
43760+ SetPageUptodate(page);
43761+ unlock_page(page);
43762+ return 0;
43763+ }
43764+ move_cluster_forward(clust, inode, page->index);
43765+ unlock_page(page);
43766+ /*
43767+ * read the whole page cluster
43768+ */
43769+ ret = ctail_read_page_cluster(clust, inode);
43770+
43771+ assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
43772+ return ret;
43773+}
43774+
43775+/*
43776+ * We populate a bit more than the upper readahead layer suggests:
43777+ * with each nominated page we read the whole page cluster
43778+ * the page belongs to.
43779+ */
43780+int readpages_ctail(struct file *file, struct address_space *mapping,
43781+ struct list_head *pages)
43782+{
43783+ int ret = 0;
43784+ hint_t *hint;
43785+ struct cluster_handle clust;
43786+ struct inode *inode = mapping->host;
43787+
43788+ assert("edward-1521", inode == file->f_dentry->d_inode);
43789+
43790+ cluster_init_read(&clust, NULL);
43791+ clust.file = file;
43792+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
43793+ if (hint == NULL) {
43794+ warning("vs-28", "failed to allocate hint");
43795+ ret = RETERR(-ENOMEM);
43796+ goto exit1;
43797+ }
43798+ clust.hint = hint;
43799+ ret = load_file_hint(clust.file, hint);
43800+ if (ret) {
43801+ warning("edward-1522", "failed to load hint");
43802+ goto exit2;
43803+ }
43804+ assert("vs-26", hint->ext_coord.lh == &hint->lh);
43805+ ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
43806+ if (ret) {
43807+ warning("edward-1523", "failed to alloc pgset");
43808+ goto exit3;
43809+ }
43810+ ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
43811+
43812+ assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
43813+ exit3:
43814+ done_lh(&hint->lh);
43815+ save_file_hint(file, hint);
43816+ hint->ext_coord.valid = 0;
43817+ exit2:
43818+ kfree(hint);
43819+ exit1:
43820+ put_cluster_handle(&clust);
43821+ return ret;
43822+}
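
The effect of this cluster-granular readahead is that page indices get rounded to page-cluster boundaries, so several nominated pages may be satisfied by a single cluster read. A small standalone sketch, assuming a hypothetical (1 << pgshift) pages per page cluster (a pg_to_clust() analogue):

#include <stdio.h>

int main(void)
{
	const int pgshift = 2;	/* 4 pages per cluster, for illustration */
	unsigned long nominated[] = { 0, 1, 5, 9 };
	unsigned long last = (unsigned long)-1;

	for (int i = 0; i < 4; i++) {
		unsigned long clust = nominated[i] >> pgshift;
		if (clust == last)
			continue;	/* this cluster was already read */
		printf("page %lu -> cluster %lu (pages %lu..%lu)\n",
		       nominated[i], clust, clust << pgshift,
		       ((clust + 1) << pgshift) - 1);
		last = clust;
	}
	return 0;
}

(The real filler skips pages that became uptodate rather than tracking the last cluster index; the rounding is the same.)
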
43823+
43824+/*
43825+ plugin->u.item.s.file.append_key
43826+ key of the first item of the next disk cluster
43827+*/
43828+reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
43829+{
43830+ assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
43831+ assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
43832+
43833+ item_key_by_coord(coord, key);
43834+ set_key_offset(key, ((__u64) (clust_by_coord(coord, NULL)) + 1)
43835+ << cluster_shift_by_coord(coord));
43836+ return key;
43837+}
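
The append key is pure shift arithmetic: the key offset of the first byte of the next disk cluster. A compilable sketch of the same computation, with hypothetical helper names (a disk cluster is assumed to span (1 << shift) bytes of plain text):

#include <stdint.h>
#include <stdio.h>

/* index of the cluster containing byte @off */
static uint64_t clust_of(uint64_t off, int shift) { return off >> shift; }

/* offset of the first byte of the next disk cluster */
static uint64_t append_off(uint64_t off, int shift)
{
	return (clust_of(off, shift) + 1) << shift;
}

int main(void)
{
	/* with 16K clusters (shift 14), byte 20000 is in cluster 1,
	   so the append key offset is 32768 */
	printf("%llu\n", (unsigned long long)append_off(20000, 14));
	return 0;
}
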
43838+
43839+static int insert_unprepped_ctail(struct cluster_handle * clust,
43840+ struct inode *inode)
43841+{
43842+ int result;
43843+ char buf[UCTAIL_NR_UNITS];
43844+ reiser4_item_data data;
43845+ reiser4_key key;
43846+ int shift = (int)UCTAIL_SHIFT;
43847+
43848+ memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
43849+ result = key_by_inode_cryptcompress(inode,
43850+ clust_to_off(clust->index, inode),
43851+ &key);
43852+ if (result)
43853+ return result;
43854+ data.user = 0;
43855+ data.iplug = item_plugin_by_id(CTAIL_ID);
43856+ data.arg = &shift;
43857+ data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
43858+ data.data = buf;
43859+
43860+ result = insert_by_coord(&clust->hint->ext_coord.coord,
43861+ &data, &key, clust->hint->ext_coord.lh, 0);
43862+ return result;
43863+}
43864+
43865+static int
43866+insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f,
43867+ struct inode *inode)
43868+{
43869+ int result;
43870+ carry_pool *pool;
43871+ carry_level *lowest_level;
43872+ reiser4_item_data *data;
43873+ carry_op *op;
43874+ int cluster_shift = inode_cluster_shift(inode);
43875+
43876+ pool =
43877+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
43878+ sizeof(*data));
43879+ if (IS_ERR(pool))
43880+ return PTR_ERR(pool);
43881+ lowest_level = (carry_level *) (pool + 1);
43882+ init_carry_level(lowest_level, pool);
43883+ data = (reiser4_item_data *) (lowest_level + 3);
43884+
43885+ assert("edward-466", coord->between == AFTER_ITEM
43886+ || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
43887+ || coord->between == EMPTY_NODE
43888+ || coord->between == BEFORE_UNIT);
43889+
43890+ if (coord->between == AFTER_UNIT) {
43891+ coord->unit_pos = 0;
43892+ coord->between = AFTER_ITEM;
43893+ }
43894+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
43895+ 0 /* operate directly on coord -> node */);
43896+ if (IS_ERR(op) || (op == NULL)) {
43897+ done_carry_pool(pool);
43898+ return RETERR(op ? PTR_ERR(op) : -EIO);
43899+ }
43900+ data->user = 0;
43901+ data->iplug = item_plugin_by_id(CTAIL_ID);
43902+ data->arg = &cluster_shift;
43903+
43904+ data->length = 0;
43905+ data->data = NULL;
43906+
43907+ op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
43908+ op->u.insert_flow.insert_point = coord;
43909+ op->u.insert_flow.flow = f;
43910+ op->u.insert_flow.data = data;
43911+ op->u.insert_flow.new_nodes = 0;
43912+
43913+ lowest_level->track_type = CARRY_TRACK_CHANGE;
43914+ lowest_level->tracked = lh;
43915+
43916+ result = reiser4_carry(lowest_level, NULL);
43917+ done_carry_pool(pool);
43918+
43919+ return result;
43920+}
43921+
43922+/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */
43923+static int insert_cryptcompress_flow_in_place(coord_t * coord,
43924+ lock_handle * lh, flow_t * f,
43925+ struct inode *inode)
43926+{
43927+ int ret;
43928+ coord_t pos;
43929+ lock_handle lock;
43930+
43931+ assert("edward-674", f->length <= inode_scaled_cluster_size(inode));
43932+ assert("edward-484", coord->between == AT_UNIT
43933+ || coord->between == AFTER_ITEM);
43934+ assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
43935+
43936+ coord_dup(&pos, coord);
43937+ pos.unit_pos = 0;
43938+ pos.between = AFTER_ITEM;
43939+
43940+ init_lh(&lock);
43941+ copy_lh(&lock, lh);
43942+
43943+ ret = insert_cryptcompress_flow(&pos, &lock, f, inode);
43944+ done_lh(&lock);
43945+ assert("edward-1347", znode_is_write_locked(lh->node));
43946+ assert("edward-1228", !ret);
43947+ return ret;
43948+}
43949+
43950+/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */
43951+static int overwrite_ctail(coord_t * coord, flow_t * f)
43952+{
43953+ unsigned count;
43954+
43955+ assert("edward-269", f->user == 0);
43956+ assert("edward-270", f->data != NULL);
43957+ assert("edward-271", f->length > 0);
43958+ assert("edward-272", coord_is_existing_unit(coord));
43959+ assert("edward-273", coord->unit_pos == 0);
43960+ assert("edward-274", znode_is_write_locked(coord->node));
43961+ assert("edward-275", reiser4_schedulable());
43962+ assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
43963+ assert("edward-1243", ctail_ok(coord));
43964+
43965+ count = nr_units_ctail(coord);
43966+
43967+ if (count > f->length)
43968+ count = f->length;
43969+ memcpy(first_unit(coord), f->data, count);
43970+ move_flow_forward(f, count);
43971+ coord->unit_pos += count;
43972+ return 0;
43973+}
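
Overwrite consumes at most one item's worth of the flow per call: it copies min(nr_units, f->length) bytes and advances the flow, so repeated calls walk the flow across chained ctail items. A sketch of that consumption step with stand-in types (struct flow_sk is not part of the patch):

#include <string.h>

struct flow_sk { const char *data; size_t length; };

/* sketch: copy one item's worth of bytes and advance the flow */
static void overwrite_one_sk(char *item, size_t nr_units, struct flow_sk *f)
{
	size_t count = nr_units < f->length ? nr_units : f->length;

	memcpy(item, f->data, count);
	f->data += count;	/* move_flow_forward() analogue */
	f->length -= count;
}
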
43974+
43975+/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
43976+ cut ctail (part or whole) starting from next unit position */
43977+static int cut_ctail(coord_t * coord)
43978+{
43979+ coord_t stop;
43980+
43981+ assert("edward-435", coord->between == AT_UNIT &&
43982+ coord->item_pos < coord_num_items(coord) &&
43983+ coord->unit_pos <= coord_num_units(coord));
43984+
43985+ if (coord->unit_pos == coord_num_units(coord))
43986+ /* nothing to cut */
43987+ return 0;
43988+ coord_dup(&stop, coord);
43989+ stop.unit_pos = coord_last_unit_pos(coord);
43990+
43991+ return cut_node_content(coord, &stop, NULL, NULL, NULL);
43992+}
43993+
43994+int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
43995+ struct inode * inode)
43996+{
43997+ int result;
43998+ assert("edward-1244", inode != NULL);
43999+ assert("edward-1245", clust->hint != NULL);
44000+ assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
44001+ assert("edward-1247", clust->reserved == 1);
44002+
44003+ result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
44004+ if (cbk_errored(result))
44005+ return result;
44006+ assert("edward-1249", result == CBK_COORD_NOTFOUND);
44007+ assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
44008+
44009+ assert("edward-1295",
44010+ clust->hint->ext_coord.lh->node ==
44011+ clust->hint->ext_coord.coord.node);
44012+
44013+ coord_set_between_clusters(&clust->hint->ext_coord.coord);
44014+
44015+ result = insert_unprepped_ctail(clust, inode);
44016+ all_grabbed2free();
44017+
44018+ assert("edward-1251", !result);
44019+ assert("edward-1252", cryptcompress_inode_ok(inode));
44020+ assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
44021+ assert("edward-1254",
44022+ reiser4_clustered_blocks(reiser4_get_current_sb()));
44023+ assert("edward-1255",
44024+ znode_convertible(clust->hint->ext_coord.coord.node));
44025+
44026+ return result;
44027+}
44028+
44029+static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode)
44030+{
44031+ int result = 0;
44032+ struct convert_item_info * info;
44033+
44034+ assert("edward-468", pos != NULL);
44035+ assert("edward-469", pos->sq != NULL);
44036+ assert("edward-845", item_convert_data(pos) != NULL);
44037+
44038+ info = item_convert_data(pos);
44039+ assert("edward-679", info->flow.data != NULL);
44040+
44041+ switch (mode) {
44042+ case CRC_APPEND_ITEM:
44043+ assert("edward-1229", info->flow.length != 0);
44044+ assert("edward-1256",
44045+ cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
44046+ result =
44047+ insert_cryptcompress_flow_in_place(&pos->coord,
44048+ &pos->lock,
44049+ &info->flow,
44050+ info->inode);
44051+ break;
44052+ case CRC_OVERWRITE_ITEM:
44053+ assert("edward-1230", info->flow.length != 0);
44054+ overwrite_ctail(&pos->coord, &info->flow);
44055+ if (info->flow.length != 0)
44056+ break;
44057+ case CRC_CUT_ITEM:
44058+ assert("edward-1231", info->flow.length == 0);
44059+ result = cut_ctail(&pos->coord);
44060+ break;
44061+ default:
44062+ result = RETERR(-EIO);
44063+ impossible("edward-244", "bad convert mode");
44064+ }
44065+ return result;
44066+}
44067+
44068+/* plugin->u.item.f.scan */
44069+int scan_ctail(flush_scan * scan)
44070+{
44071+ int result = 0;
44072+ struct page *page;
44073+ struct inode *inode;
44074+ jnode *node = scan->node;
44075+
44076+ assert("edward-227", scan->node != NULL);
44077+ assert("edward-228", jnode_is_cluster_page(scan->node));
44078+ assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
44079+
44080+ page = jnode_page(node);
44081+ inode = page->mapping->host;
44082+
44083+ if (!reiser4_scanning_left(scan))
44084+ return result;
44085+ if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
44086+ znode_make_dirty(scan->parent_lock.node);
44087+
44088+ if (!znode_convertible(scan->parent_lock.node)) {
44089+ if (JF_ISSET(scan->node, JNODE_DIRTY))
44090+ znode_set_convertible(scan->parent_lock.node);
44091+ else {
44092+ warning("edward-681",
44093+ "cluster page is already processed");
44094+ return -EAGAIN;
44095+ }
44096+ }
44097+ return result;
44098+}
44099+
44100+/* Returns true (leaving the child in pos->child) if convert info should be attached */
44101+static int should_attach_convert_idata(flush_pos_t * pos)
44102+{
44103+ int result;
44104+ assert("edward-431", pos != NULL);
44105+ assert("edward-432", pos->child == NULL);
44106+ assert("edward-619", znode_is_write_locked(pos->coord.node));
44107+ assert("edward-470",
44108+ item_plugin_by_coord(&pos->coord) ==
44109+ item_plugin_by_id(CTAIL_ID));
44110+
44111+ /* check for leftmost child */
44112+ utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
44113+
44114+ if (!pos->child)
44115+ return 0;
44116+ spin_lock_jnode(pos->child);
44117+ result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
44118+ pos->child->atom == ZJNODE(pos->coord.node)->atom);
44119+ spin_unlock_jnode(pos->child);
44120+ if (!result && pos->child) {
44121+		/* the existing child is not to be attached; release it */
44122+ jput(pos->child);
44123+ pos->child = NULL;
44124+ }
44125+ return result;
44126+}
44127+
44128+/* plugin->init_convert_data() */
44129+static int
44130+init_convert_data_ctail(struct convert_item_info * idata, struct inode *inode)
44131+{
44132+ assert("edward-813", idata != NULL);
44133+ assert("edward-814", inode != NULL);
44134+
44135+ idata->inode = inode;
44136+ idata->d_cur = DC_FIRST_ITEM;
44137+ idata->d_next = DC_INVALID_STATE;
44138+
44139+ return 0;
44140+}
44141+
44142+static int alloc_item_convert_data(struct convert_info * sq)
44143+{
44144+ assert("edward-816", sq != NULL);
44145+ assert("edward-817", sq->itm == NULL);
44146+
44147+ sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get());
44148+ if (sq->itm == NULL)
44149+ return RETERR(-ENOMEM);
44150+ return 0;
44151+}
44152+
44153+static void free_item_convert_data(struct convert_info * sq)
44154+{
44155+ assert("edward-818", sq != NULL);
44156+ assert("edward-819", sq->itm != NULL);
44157+ assert("edward-820", sq->iplug != NULL);
44158+
44159+ kfree(sq->itm);
44160+ sq->itm = NULL;
44161+ return;
44162+}
44163+
44164+static int alloc_convert_data(flush_pos_t * pos)
44165+{
44166+ assert("edward-821", pos != NULL);
44167+ assert("edward-822", pos->sq == NULL);
44168+
44169+ pos->sq = kmalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get());
44170+ if (!pos->sq)
44171+ return RETERR(-ENOMEM);
44172+ memset(pos->sq, 0, sizeof(*pos->sq));
44173+ cluster_init_write(&pos->sq->clust, NULL);
44174+ return 0;
44175+}
44176+
44177+void free_convert_data(flush_pos_t * pos)
44178+{
44179+ struct convert_info *sq;
44180+
44181+ assert("edward-823", pos != NULL);
44182+ assert("edward-824", pos->sq != NULL);
44183+
44184+ sq = pos->sq;
44185+ if (sq->itm)
44186+ free_item_convert_data(sq);
44187+ put_cluster_handle(&sq->clust);
44188+ kfree(pos->sq);
44189+ pos->sq = NULL;
44190+ return;
44191+}
44192+
44193+static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
44194+{
44195+ struct convert_info *sq;
44196+
44197+ assert("edward-825", pos != NULL);
44198+ assert("edward-826", pos->sq != NULL);
44199+ assert("edward-827", item_convert_data(pos) != NULL);
44200+ assert("edward-828", inode != NULL);
44201+
44202+ sq = pos->sq;
44203+
44204+ memset(sq->itm, 0, sizeof(*sq->itm));
44205+
44206+ /* iplug->init_convert_data() */
44207+ return init_convert_data_ctail(sq->itm, inode);
44208+}
44209+
44210+/* create and attach disk cluster info used by 'convert' phase of the flush
44211+ squalloc() */
44212+static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
44213+{
44214+ int ret = 0;
44215+ struct convert_item_info *info;
44216+ struct cluster_handle *clust;
44217+ file_plugin *fplug = inode_file_plugin(inode);
44218+ compression_plugin *cplug = inode_compression_plugin(inode);
44219+
44220+ assert("edward-248", pos != NULL);
44221+ assert("edward-249", pos->child != NULL);
44222+ assert("edward-251", inode != NULL);
44223+ assert("edward-682", cryptcompress_inode_ok(inode));
44224+ assert("edward-252",
44225+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
44226+ assert("edward-473",
44227+ item_plugin_by_coord(&pos->coord) ==
44228+ item_plugin_by_id(CTAIL_ID));
44229+
44230+ if (!pos->sq) {
44231+ ret = alloc_convert_data(pos);
44232+ if (ret)
44233+ return ret;
44234+ }
44235+ clust = &pos->sq->clust;
44236+ ret = grab_coa(&clust->tc, cplug);
44237+ if (ret)
44238+ goto err;
44239+ ret = set_cluster_by_page(clust,
44240+ jnode_page(pos->child),
44241+ MAX_CLUSTER_NRPAGES);
44242+ if (ret)
44243+ goto err;
44244+
44245+ assert("edward-829", pos->sq != NULL);
44246+ assert("edward-250", item_convert_data(pos) == NULL);
44247+
44248+ pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
44249+
44250+ ret = alloc_item_convert_data(pos->sq);
44251+ if (ret)
44252+ goto err;
44253+ ret = init_item_convert_data(pos, inode);
44254+ if (ret)
44255+ goto err;
44256+ info = item_convert_data(pos);
44257+
44258+ ret = checkout_logical_cluster(clust, pos->child, inode);
44259+ if (ret)
44260+ goto err;
44261+
44262+ reiser4_deflate_cluster(clust, inode);
44263+ inc_item_convert_count(pos);
44264+
44265+ /* prepare flow for insertion */
44266+ fplug->flow_by_inode(info->inode,
44267+ (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
44268+ 0 /* kernel space */ ,
44269+ clust->tc.len,
44270+ clust_to_off(clust->index, inode),
44271+ WRITE_OP, &info->flow);
44272+ jput(pos->child);
44273+
44274+ assert("edward-683", cryptcompress_inode_ok(inode));
44275+ return 0;
44276+ err:
44277+ jput(pos->child);
44278+ free_convert_data(pos);
44279+ return ret;
44280+}
44281+
44282+/* clear up disk cluster info */
44283+static void detach_convert_idata(struct convert_info * sq)
44284+{
44285+ struct convert_item_info *info;
44286+
44287+ assert("edward-253", sq != NULL);
44288+ assert("edward-840", sq->itm != NULL);
44289+
44290+ info = sq->itm;
44291+ assert("edward-255", info->inode != NULL);
44292+ assert("edward-1212", info->flow.length == 0);
44293+
44294+ free_item_convert_data(sq);
44295+ return;
44296+}
44297+
44298+/* plugin->u.item.f.utmost_child */
44299+
44300+/* Set *child to the leftmost child of the first cluster item
44301+   if that child exists, and to NULL otherwise.
44302+   NOTE-EDWARD: Do not call this for RIGHT_SIDE */
44303+
44304+int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
44305+{
44306+ reiser4_key key;
44307+
44308+	assert("edward-257", coord != NULL);
44309+	assert("edward-258", child != NULL);
44310+	assert("edward-259", side == LEFT_SIDE);
44311+	assert("edward-260",
44312+	       item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
44313+
44314+	item_key_by_coord(coord, &key);
44315+
44316+ if (!is_disk_cluster_key(&key, coord))
44317+ *child = NULL;
44318+ else
44319+ *child = jlookup(current_tree,
44320+ get_key_objectid(item_key_by_coord
44321+ (coord, &key)),
44322+ off_to_pg(get_key_offset(&key)));
44323+ return 0;
44324+}
44325+
44326+/* Returns true if @p2 is the item that follows @p1
44327+   in the _same_ disk cluster.
44328+   A disk cluster is a set of items. If ->clustered() != NULL,
44329+   the whole disk cluster should be read/modified along with each item.
44330+*/
44331+
44332+/* Go rightward and check for the next disk cluster item; set
44333+ * d_next to DC_CHAINED_ITEM if such an item exists.
44334+ * If the current position is the last item, go to the right neighbor.
44335+ * Skip empty nodes. Note that a right neighbor may not be in
44336+ * the slum because of races; if so, make it dirty and
44337+ * convertible.
44338+ */
44339+static int next_item_dc_stat(flush_pos_t * pos)
44340+{
44341+ int ret = 0;
44342+ int stop = 0;
44343+ znode *cur;
44344+ coord_t coord;
44345+ lock_handle lh;
44346+ lock_handle right_lock;
44347+
44348+ assert("edward-1232", !node_is_empty(pos->coord.node));
44349+ assert("edward-1014",
44350+ pos->coord.item_pos < coord_num_items(&pos->coord));
44351+ assert("edward-1015", chaining_data_present(pos));
44352+ assert("edward-1017",
44353+ item_convert_data(pos)->d_next == DC_INVALID_STATE);
44354+
44355+ item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
44356+
44357+ if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
44358+ return ret;
44359+ if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
44360+ return ret;
44361+
44362+	/* Check the next slum item.
44363+	 * Note that it cannot be killed by a concurrent truncate,
44364+	 * as the latter would need the lock we are holding.
44365+	 */
44366+ init_lh(&right_lock);
44367+ cur = pos->coord.node;
44368+
44369+ while (!stop) {
44370+ init_lh(&lh);
44371+ ret = reiser4_get_right_neighbor(&lh,
44372+ cur,
44373+ ZNODE_WRITE_LOCK,
44374+ GN_CAN_USE_UPPER_LEVELS);
44375+ if (ret)
44376+ break;
44377+ ret = zload(lh.node);
44378+ if (ret) {
44379+ done_lh(&lh);
44380+ break;
44381+ }
44382+ coord_init_before_first_item(&coord, lh.node);
44383+
44384+ if (node_is_empty(lh.node)) {
44385+ znode_make_dirty(lh.node);
44386+ znode_set_convertible(lh.node);
44387+ stop = 0;
44388+ } else if (same_disk_cluster(&pos->coord, &coord)) {
44389+
44390+ item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
44391+
44392+ if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
44393+ /*
44394+ warning("edward-1024",
44395+ "next slum item mergeable, "
44396+ "but znode %p isn't dirty\n",
44397+ lh.node);
44398+ */
44399+ znode_make_dirty(lh.node);
44400+ }
44401+ if (!znode_convertible(lh.node)) {
44402+ /*
44403+ warning("edward-1272",
44404+ "next slum item mergeable, "
44405+ "but znode %p isn't convertible\n",
44406+ lh.node);
44407+ */
44408+ znode_set_convertible(lh.node);
44409+ }
44410+ stop = 1;
44411+ } else
44412+ stop = 1;
44413+ zrelse(lh.node);
44414+ done_lh(&right_lock);
44415+ copy_lh(&right_lock, &lh);
44416+ done_lh(&lh);
44417+ cur = right_lock.node;
44418+ }
44419+ done_lh(&right_lock);
44420+
44421+ if (ret == -E_NO_NEIGHBOR)
44422+ ret = 0;
44423+ return ret;
44424+}
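
The traversal above is hand-over-hand locking: the right neighbor is locked before the lock on the current node is released (via the right_lock/copy_lh bookkeeping), so there is never a window in which no node of the chain is held. A loose userspace analogue with hypothetical lock stubs, not the reiser4 locking API:

struct node_sk { struct node_sk *right; };

static void lock_sk(struct node_sk *n)   { (void)n; /* acquire */ }
static void unlock_sk(struct node_sk *n) { (void)n; /* release */ }

/* sketch: hand-over-hand walk to the right, one node locked at a time */
static void walk_right_sk(struct node_sk *cur, int (*stop)(struct node_sk *))
{
	lock_sk(cur);
	while (cur->right && !stop(cur)) {
		struct node_sk *next = cur->right;

		lock_sk(next);	/* take the neighbor first... */
		unlock_sk(cur);	/* ...then drop the old lock */
		cur = next;
	}
	unlock_sk(cur);
}
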
44425+
44426+static int
44427+assign_convert_mode(struct convert_item_info * idata,
44428+ cryptcompress_write_mode_t * mode)
44429+{
44430+ int result = 0;
44431+
44432+ assert("edward-1025", idata != NULL);
44433+
44434+ if (idata->flow.length) {
44435+ /* append or overwrite */
44436+ switch (idata->d_cur) {
44437+ case DC_FIRST_ITEM:
44438+ case DC_CHAINED_ITEM:
44439+ *mode = CRC_OVERWRITE_ITEM;
44440+ break;
44441+ case DC_AFTER_CLUSTER:
44442+ *mode = CRC_APPEND_ITEM;
44443+ break;
44444+ default:
44445+ impossible("edward-1018", "wrong current item state");
44446+ }
44447+ } else {
44448+ /* cut or invalidate */
44449+ switch (idata->d_cur) {
44450+ case DC_FIRST_ITEM:
44451+ case DC_CHAINED_ITEM:
44452+ *mode = CRC_CUT_ITEM;
44453+ break;
44454+ case DC_AFTER_CLUSTER:
44455+ result = 1;
44456+ break;
44457+ default:
44458+ impossible("edward-1019", "wrong current item state");
44459+ }
44460+ }
44461+ return result;
44462+}
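
The mode choice is a pure function of two inputs: whether the flow still has bytes, and the current item's disk-cluster state. Flattened into a sketch with local stand-in enums (not the patch's types):

/* sketch: the assign_convert_mode() decision, as a pure function */
enum dcur_sk { FIRST_SK, CHAINED_SK, AFTER_SK };
enum mode_sk { OVERWRITE_SK, APPEND_SK, CUT_SK, DONE_SK };

static enum mode_sk pick_mode_sk(enum dcur_sk cur, unsigned long flow_len)
{
	if (flow_len)	/* bytes left: overwrite existing items, append past them */
		return cur == AFTER_SK ? APPEND_SK : OVERWRITE_SK;
	/* nothing left: cut the remaining items, or stop after the cluster */
	return cur == AFTER_SK ? DONE_SK : CUT_SK;
}
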
44463+
44464+/* plugin->u.item.f.convert */
44465+/* write ctail in guessed mode */
44466+int convert_ctail(flush_pos_t * pos)
44467+{
44468+ int result;
44469+ int nr_items;
44470+ cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM;
44471+
44472+ assert("edward-1020", pos != NULL);
44473+ assert("edward-1213", coord_num_items(&pos->coord) != 0);
44474+ assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
44475+ assert("edward-1258", ctail_ok(&pos->coord));
44476+ assert("edward-261", pos->coord.node != NULL);
44477+
44478+ nr_items = coord_num_items(&pos->coord);
44479+ if (!chaining_data_present(pos)) {
44480+ if (should_attach_convert_idata(pos)) {
44481+ /* attach convert item info */
44482+ struct inode *inode;
44483+
44484+ assert("edward-264", pos->child != NULL);
44485+ assert("edward-265", jnode_page(pos->child) != NULL);
44486+ assert("edward-266",
44487+ jnode_page(pos->child)->mapping != NULL);
44488+
44489+ inode = jnode_page(pos->child)->mapping->host;
44490+
44491+ assert("edward-267", inode != NULL);
44492+
44493+ /* attach item convert info by child and put the last one */
44494+ result = attach_convert_idata(pos, inode);
44495+ pos->child = NULL;
44496+ if (result == -E_REPEAT) {
44497+			/* the jnode became clean, or there are no dirty
44498+			   pages (nothing to update in the disk cluster) */
44499+ warning("edward-1021",
44500+ "convert_ctail: nothing to attach");
44501+ return 0;
44502+ }
44503+ if (result != 0)
44504+ return result;
44505+ } else
44506+ /* unconvertible */
44507+ return 0;
44508+ } else {
44509+ /* use old convert info */
44510+
44511+ struct convert_item_info *idata;
44512+
44513+ idata = item_convert_data(pos);
44514+
44515+ result = assign_convert_mode(idata, &mode);
44516+ if (result) {
44517+ /* disk cluster is over,
44518+ nothing to update anymore */
44519+ detach_convert_idata(pos->sq);
44520+ return 0;
44521+ }
44522+ }
44523+
44524+ assert("edward-433", chaining_data_present(pos));
44525+ assert("edward-1022",
44526+ pos->coord.item_pos < coord_num_items(&pos->coord));
44527+
44528+ /* check if next item is of current disk cluster */
44529+ result = next_item_dc_stat(pos);
44530+ if (result) {
44531+ detach_convert_idata(pos->sq);
44532+ return result;
44533+ }
44534+ result = do_convert_ctail(pos, mode);
44535+ if (result) {
44536+ detach_convert_idata(pos->sq);
44537+ return result;
44538+ }
44539+ switch (mode) {
44540+ case CRC_CUT_ITEM:
44541+ assert("edward-1214", item_convert_data(pos)->flow.length == 0);
44542+ assert("edward-1215",
44543+ coord_num_items(&pos->coord) == nr_items ||
44544+ coord_num_items(&pos->coord) == nr_items - 1);
44545+ if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
44546+ break;
44547+ if (coord_num_items(&pos->coord) != nr_items) {
44548+ /* the item was killed, no more chained items */
44549+ detach_convert_idata(pos->sq);
44550+ if (!node_is_empty(pos->coord.node))
44551+ /* make sure the next item will be scanned */
44552+ coord_init_before_item(&pos->coord);
44553+ break;
44554+ }
44555+ case CRC_APPEND_ITEM:
44556+ assert("edward-434", item_convert_data(pos)->flow.length == 0);
44557+ detach_convert_idata(pos->sq);
44558+ break;
44559+ case CRC_OVERWRITE_ITEM:
44560+ if (coord_is_unprepped_ctail(&pos->coord)) {
44561+			/* convert unprepped ctail to a prepped one */
44562+ int shift;
44563+ shift =
44564+ inode_cluster_shift(item_convert_data(pos)->inode);
44565+ assert("edward-1259", cluster_shift_ok(shift));
44566+ put_unaligned((d8)shift,
44567+ &ctail_formatted_at(&pos->coord)->
44568+ cluster_shift);
44569+ }
44570+ break;
44571+ }
44572+ return result;
44573+}
44574+
44575+/* Make Linus happy.
44576+ Local variables:
44577+ c-indentation-style: "K&R"
44578+ mode-name: "LC"
44579+ c-basic-offset: 8
44580+ tab-width: 8
44581+ fill-column: 120
44582+ End:
44583+*/
44584diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/ctail.h linux-2.6.23/fs/reiser4/plugin/item/ctail.h
44585--- linux-2.6.23.orig/fs/reiser4/plugin/item/ctail.h 1970-01-01 03:00:00.000000000 +0300
44586+++ linux-2.6.23/fs/reiser4/plugin/item/ctail.h 2007-12-04 16:49:30.000000000 +0300
44587@@ -0,0 +1,102 @@
44588+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44589+
44590+/* Ctail items are fragments (or bodies) of a special type that provides
44591+   optimal storage of encrypted and/or compressed files. */
44592+
44593+
44594+#if !defined( __FS_REISER4_CTAIL_H__ )
44595+#define __FS_REISER4_CTAIL_H__
44596+
44597+/* Disk format of ctail item */
44598+typedef struct ctail_item_format {
44599+ /* packed shift;
44600+ if its value is different from UCTAIL_SHIFT (see below), then
44601+ size of disk cluster is calculated as (1 << cluster_shift) */
44602+ d8 cluster_shift;
44603+ /* ctail body */
44604+ d8 body[0];
44605+} __attribute__ ((packed)) ctail_item_format;
44606+
44607+/* "Unprepped" disk cluster is represented by a single ctail item
44608+ with the following "magic" attributes: */
44609+/* "magic" cluster_shift */
44610+#define UCTAIL_SHIFT 0xff
44611+/* How many units unprepped ctail item has */
44612+#define UCTAIL_NR_UNITS 1
44613+
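
So cluster_shift doubles as a tag: the UCTAIL_SHIFT magic marks an unprepped cluster, and any other value encodes the disk cluster size directly. A compilable sketch of the decoding (names with the _SK suffix are illustrative, not from the patch):

#include <stdint.h>

#define UCTAIL_SHIFT_SK 0xff

/* sketch: disk cluster size from the on-disk cluster_shift byte */
static long dc_size_sk(uint8_t cluster_shift)
{
	if (cluster_shift == UCTAIL_SHIFT_SK)
		return -1;		/* unprepped: no size is encoded */
	return 1L << cluster_shift;	/* sane shifts only, e.g. 12..16 */
}
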
44614+/* The following is a set of various item states in a disk cluster.
44615+ Disk cluster is a set of items whose keys belong to the interval
44616+ [dc_key , dc_key + disk_cluster_size - 1] */
44617+typedef enum {
44618+ DC_INVALID_STATE = 0,
44619+ DC_FIRST_ITEM = 1,
44620+ DC_CHAINED_ITEM = 2,
44621+ DC_AFTER_CLUSTER = 3
44622+} dc_item_stat;
44623+
44624+/* ctail-specific extension.
44625+ In particular this describes parameters of disk cluster an item belongs to */
44626+struct ctail_coord_extension {
44627+	int shift; /* this contains cluster_shift extracted from
44628+		      ctail_item_format (above), or UCTAIL_SHIFT
44629+		      (the latter is the "magic" of unprepped disk clusters) */
44630+ int dsize; /* size of a prepped disk cluster */
44631+ int ncount; /* count of nodes occupied by a disk cluster */
44632+};
44633+
44634+struct cut_list;
44635+
44636+/* plugin->item.b.* */
44637+int can_contain_key_ctail(const coord_t *, const reiser4_key *,
44638+ const reiser4_item_data *);
44639+int mergeable_ctail(const coord_t * p1, const coord_t * p2);
44640+pos_in_node_t nr_units_ctail(const coord_t * coord);
44641+int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
44642+void print_ctail(const char *prefix, coord_t * coord);
44643+lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
44644+
44645+int paste_ctail(coord_t * coord, reiser4_item_data * data,
44646+ carry_plugin_info * info UNUSED_ARG);
44647+int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
44648+int can_shift_ctail(unsigned free_space, coord_t * coord,
44649+ znode * target, shift_direction pend, unsigned *size,
44650+ unsigned want);
44651+void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
44652+ unsigned count, shift_direction where_is_free_space,
44653+ unsigned free_space);
44654+int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
44655+ carry_cut_data *, reiser4_key * smallest_removed,
44656+ reiser4_key * new_first);
44657+int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
44658+ carry_kill_data *, reiser4_key * smallest_removed,
44659+ reiser4_key * new_first);
44660+int ctail_ok(const coord_t * coord);
44661+int check_ctail(const coord_t * coord, const char **error);
44662+
44663+/* plugin->u.item.s.* */
44664+int read_ctail(struct file *, flow_t *, hint_t *);
44665+int readpage_ctail(void *, struct page *);
44666+int readpages_ctail(struct file *, struct address_space *, struct list_head *);
44667+reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
44668+int create_hook_ctail(const coord_t * coord, void *arg);
44669+int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
44670+ carry_kill_data *);
44671+int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
44672+
44673+/* plugin->u.item.f */
44674+int utmost_child_ctail(const coord_t *, sideof, jnode **);
44675+int scan_ctail(flush_scan *);
44676+int convert_ctail(flush_pos_t *);
44677+size_t inode_scaled_cluster_size(struct inode *);
44678+
44679+#endif /* __FS_REISER4_CTAIL_H__ */
44680+
44681+/* Make Linus happy.
44682+ Local variables:
44683+ c-indentation-style: "K&R"
44684+ mode-name: "LC"
44685+ c-basic-offset: 8
44686+ tab-width: 8
44687+ fill-column: 120
44688+ End:
44689+*/
44690diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/extent.c linux-2.6.23/fs/reiser4/plugin/item/extent.c
44691--- linux-2.6.23.orig/fs/reiser4/plugin/item/extent.c 1970-01-01 03:00:00.000000000 +0300
44692+++ linux-2.6.23/fs/reiser4/plugin/item/extent.c 2007-12-04 16:49:30.000000000 +0300
44693@@ -0,0 +1,197 @@
44694+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44695+
44696+#include "item.h"
44697+#include "../../key.h"
44698+#include "../../super.h"
44699+#include "../../carry.h"
44700+#include "../../inode.h"
44701+#include "../../page_cache.h"
44702+#include "../../flush.h"
44703+#include "../object.h"
44704+
44705+/* prepare structure reiser4_item_data. It is used to put one extent unit into tree */
44706+/* Audited by: green(2002.06.13) */
44707+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
44708+ int nr_extents)
44709+{
44710+ data->data = ext_unit;
44711+ /* data->data is kernel space */
44712+ data->user = 0;
44713+ data->length = sizeof(reiser4_extent) * nr_extents;
44714+ data->arg = NULL;
44715+ data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
44716+ return data;
44717+}
44718+
44719+/* how many bytes are addressed by the first @nr extents of the extent item */
44720+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr)
44721+{
44722+ pos_in_node_t i;
44723+ reiser4_block_nr blocks;
44724+ reiser4_extent *ext;
44725+
44726+ ext = item_body_by_coord(coord);
44727+ assert("vs-263", nr <= nr_units_extent(coord));
44728+
44729+ blocks = 0;
44730+ for (i = 0; i < nr; i++, ext++) {
44731+ blocks += extent_get_width(ext);
44732+ }
44733+
44734+ return blocks * current_blocksize;
44735+}
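
Worked example of the arithmetic above: with a 4096-byte blocksize and extent widths {2, 1, 5}, the first three units address (2 + 1 + 5) * 4096 = 32768 bytes. A compilable sketch:

#include <stdio.h>

int main(void)
{
	unsigned long widths[] = { 2, 1, 5 };
	unsigned long blocks = 0;

	for (int i = 0; i < 3; i++)
		blocks += widths[i];
	printf("%lu\n", blocks * 4096);	/* prints 32768 */
	return 0;
}
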
44736+
44737+extent_state state_of_extent(reiser4_extent * ext)
44738+{
44739+ switch ((int)extent_get_start(ext)) {
44740+ case 0:
44741+ return HOLE_EXTENT;
44742+ case 1:
44743+ return UNALLOCATED_EXTENT;
44744+ default:
44745+ break;
44746+ }
44747+ return ALLOCATED_EXTENT;
44748+}
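
The extent start field thus doubles as a state tag: 0 marks a hole, 1 an unallocated extent, and any other value is a real starting block number. A standalone sketch of the same encoding (local enum, not the patch's extent_state):

#include <stdint.h>
#include <stdio.h>

enum est_sk { HOLE_SK, UNALLOC_SK, ALLOC_SK };

/* sketch: classify an extent by its start value */
static enum est_sk est_of_sk(uint64_t start)
{
	return start == 0 ? HOLE_SK : start == 1 ? UNALLOC_SK : ALLOC_SK;
}

int main(void)
{
	printf("%d %d %d\n", est_of_sk(0), est_of_sk(1), est_of_sk(12345));
	return 0;
}
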
44749+
44750+int extent_is_unallocated(const coord_t * item)
44751+{
44752+ assert("jmacd-5133", item_is_extent(item));
44753+
44754+ return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
44755+}
44756+
44757+/* set extent's start and width */
44758+void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start,
44759+ reiser4_block_nr width)
44760+{
44761+ extent_set_start(ext, start);
44762+ extent_set_width(ext, width);
44763+}
44764+
44765+/**
44766+ * reiser4_replace_extent - replace extent and paste 1 or 2 after it
44767+ * @un_extent: coordinate of extent to be overwritten
44768+ * @lh: need better comment
44769+ * @key: need better comment
44770+ * @exts_to_add: data prepared for insertion into tree
44771+ * @replace: need better comment
44772+ * @flags: need better comment
44773+ * @return_inserted_position: need better comment
44774+ *
44775+ * Overwrites one extent and pastes one or two more after the overwritten
44776+ * one. If @return_inserted_position is 1, @un_extent and @lh are returned
44777+ * set to the first of the newly inserted units; if it is 0, they are
44778+ * returned set to the extent which was overwritten.
44779+ */
44780+int reiser4_replace_extent(struct replace_handle *h,
44781+ int return_inserted_position)
44782+{
44783+ int result;
44784+ znode *orig_znode;
44785+ /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */
44786+
44787+ assert("vs-990", coord_is_existing_unit(h->coord));
44788+ assert("vs-1375", znode_is_write_locked(h->coord->node));
44789+ assert("vs-1426", extent_get_width(&h->overwrite) != 0);
44790+ assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
44791+ assert("vs-1427", ergo(h->nr_new_extents == 2,
44792+ extent_get_width(&h->new_extents[1]) != 0));
44793+
44794+ /* compose structure for paste */
44795+ init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
44796+
44797+ coord_dup(&h->coord_after, h->coord);
44798+ init_lh(&h->lh_after);
44799+ copy_lh(&h->lh_after, h->lh);
44800+ reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
44801+ reiser4_tap_monitor(&h->watch);
44802+
44803+ ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
44804+ orig_znode = h->coord->node;
44805+
44806+#if REISER4_DEBUG
44807+ /* make sure that key is set properly */
44808+ unit_key_by_coord(h->coord, &h->tmp);
44809+ set_key_offset(&h->tmp,
44810+ get_key_offset(&h->tmp) +
44811+ extent_get_width(&h->overwrite) * current_blocksize);
44812+ assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
44813+#endif
44814+
44815+ /* set insert point after unit to be replaced */
44816+ h->coord->between = AFTER_UNIT;
44817+
44818+ result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
44819+ &h->paste_key, &h->item, h->flags);
44820+ if (!result) {
44821+ /* now we have to replace the unit after which new units were
44822+ inserted. Its position is tracked by @watch */
44823+ reiser4_extent *ext;
44824+ znode *node;
44825+
44826+ node = h->coord_after.node;
44827+ if (node != orig_znode) {
44828+ coord_clear_iplug(&h->coord_after);
44829+ result = zload(node);
44830+ }
44831+
44832+ if (likely(!result)) {
44833+ ext = extent_by_coord(&h->coord_after);
44834+
44835+ assert("vs-987", znode_is_loaded(node));
44836+ assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
44837+
44838+ /* overwrite extent unit */
44839+ memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
44840+ znode_make_dirty(node);
44841+
44842+ if (node != orig_znode)
44843+ zrelse(node);
44844+
44845+ if (return_inserted_position == 0) {
44846+ /* coord and lh are to be set to overwritten
44847+ extent */
44848+ assert("vs-1662",
44849+ WITH_DATA(node, !memcmp(&h->overwrite,
44850+ extent_by_coord(
44851+ &h->coord_after),
44852+ sizeof(reiser4_extent))));
44853+
44854+ *h->coord = h->coord_after;
44855+ done_lh(h->lh);
44856+ copy_lh(h->lh, &h->lh_after);
44857+ } else {
44858+ /* h->coord and h->lh are to be set to first of
44859+ inserted units */
44860+ assert("vs-1663",
44861+ WITH_DATA(h->coord->node,
44862+ !memcmp(&h->new_extents[0],
44863+ extent_by_coord(h->coord),
44864+ sizeof(reiser4_extent))));
44865+ assert("vs-1664", h->lh->node == h->coord->node);
44866+ }
44867+ }
44868+ }
44869+ reiser4_tap_done(&h->watch);
44870+
44871+ return result;
44872+}
44873+
44874+lock_handle *znode_lh(znode *node)
44875+{
44876+ assert("vs-1371", znode_is_write_locked(node));
44877+ assert("vs-1372", znode_is_wlocked_once(node));
44878+ return list_entry(node->lock.owners.next, lock_handle, owners_link);
44879+}
44880+
44881+/*
44882+ * Local variables:
44883+ * c-indentation-style: "K&R"
44884+ * mode-name: "LC"
44885+ * c-basic-offset: 8
44886+ * tab-width: 8
44887+ * fill-column: 79
44888+ * scroll-step: 1
44889+ * End:
44890+ */
44891diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.23/fs/reiser4/plugin/item/extent_file_ops.c
44892--- linux-2.6.23.orig/fs/reiser4/plugin/item/extent_file_ops.c 1970-01-01 03:00:00.000000000 +0300
44893+++ linux-2.6.23/fs/reiser4/plugin/item/extent_file_ops.c 2007-12-04 23:04:00.738308094 +0300
44894@@ -0,0 +1,1453 @@
44895+/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44896+
44897+#include "item.h"
44898+#include "../../inode.h"
44899+#include "../../page_cache.h"
44900+#include "../object.h"
44901+
44902+#include <linux/quotaops.h>
44903+#include <linux/swap.h>
44904+
44905+static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
44906+{
44907+ reiser4_extent *ext;
44908+
44909+ ext = (reiser4_extent *) (zdata(node) + offset);
44910+ return ext;
44911+}
44912+
44913+/**
44914+ * check_uf_coord - verify coord extension
44915+ * @uf_coord:
44916+ * @key:
44917+ *
44918+ * Makes sure that all fields of @uf_coord are set properly. If @key is
44919+ * specified - check whether @uf_coord is set correspondingly.
44920+ */
44921+static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
44922+{
44923+#if REISER4_DEBUG
44924+ const coord_t *coord;
44925+ const struct extent_coord_extension *ext_coord;
44926+ reiser4_extent *ext;
44927+
44928+ coord = &uf_coord->coord;
44929+ ext_coord = &uf_coord->extension.extent;
44930+ ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
44931+
44932+ assert("",
44933+ WITH_DATA(coord->node,
44934+ (uf_coord->valid == 1 &&
44935+ coord_is_iplug_set(coord) &&
44936+ item_is_extent(coord) &&
44937+ ext_coord->nr_units == nr_units_extent(coord) &&
44938+ ext == extent_by_coord(coord) &&
44939+ ext_coord->width == extent_get_width(ext) &&
44940+ coord->unit_pos < ext_coord->nr_units &&
44941+ ext_coord->pos_in_unit < ext_coord->width &&
44942+ memcmp(ext, &ext_coord->extent,
44943+ sizeof(reiser4_extent)) == 0)));
44944+ if (key) {
44945+ reiser4_key coord_key;
44946+
44947+ unit_key_by_coord(&uf_coord->coord, &coord_key);
44948+ set_key_offset(&coord_key,
44949+ get_key_offset(&coord_key) +
44950+ (uf_coord->extension.extent.
44951+ pos_in_unit << PAGE_CACHE_SHIFT));
44952+ assert("", keyeq(key, &coord_key));
44953+ }
44954+#endif
44955+}
44956+
44957+static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
44958+{
44959+ check_uf_coord(uf_coord, NULL);
44960+
44961+ return ext_by_offset(uf_coord->coord.node,
44962+ uf_coord->extension.extent.ext_offset);
44963+}
44964+
44965+#if REISER4_DEBUG
44966+
44967+/**
44968+ * offset_is_in_unit
44969+ * @coord: coord of an existing extent unit
44970+ * @off: file offset to check
44971+ *
44972+ * Returns 1 if offset @off is inside of the extent unit pointed to by
44973+ * @coord, and 0 otherwise.
44974+ */
44975+static int offset_is_in_unit(const coord_t *coord, loff_t off)
44976+{
44977+ reiser4_key unit_key;
44978+ __u64 unit_off;
44979+ reiser4_extent *ext;
44980+
44981+ ext = extent_by_coord(coord);
44982+
44983+ unit_key_extent(coord, &unit_key);
44984+ unit_off = get_key_offset(&unit_key);
44985+ if (off < unit_off)
44986+ return 0;
44987+ if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
44988+ return 0;
44989+ return 1;
44990+}
44991+
44992+static int
44993+coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
44994+{
44995+ reiser4_key item_key;
44996+
44997+ assert("vs-771", coord_is_existing_unit(coord));
44998+ assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
44999+ assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
45000+
45001+ return offset_is_in_unit(coord, get_key_offset(key));
45002+}
45003+
45004+#endif
45005+
45006+/**
45007+ * can_append -
45008+ * @key:
45009+ * @coord:
45010+ *
45011+ * Returns 1 if @key is equal to the append key of the item @coord is set to
45012+ */
45013+static int can_append(const reiser4_key *key, const coord_t *coord)
45014+{
45015+ reiser4_key append_key;
45016+
45017+ return keyeq(key, append_key_extent(coord, &append_key));
45018+}
45019+
45020+/**
45021+ * append_hole
45022+ * @coord:
45023+ * @lh:
45024+ * @key:
45025+ *
45026+ */
45027+static int append_hole(coord_t *coord, lock_handle *lh,
45028+ const reiser4_key *key)
45029+{
45030+ reiser4_key append_key;
45031+ reiser4_block_nr hole_width;
45032+ reiser4_extent *ext, new_ext;
45033+ reiser4_item_data idata;
45034+
45035+ /* last item of file may have to be appended with hole */
45036+ assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
45037+ assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
45038+
45039+ /* key of first byte which is not addressed by this extent */
45040+ append_key_extent(coord, &append_key);
45041+
45042+ assert("", keyle(&append_key, key));
45043+
45044+ /*
45045+ * extent item has to be appended with hole. Calculate length of that
45046+ * hole
45047+ */
45048+ hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
45049+ current_blocksize - 1) >> current_blocksize_bits);
45050+ assert("vs-954", hole_width > 0);
45051+
45052+ /* set coord after last unit */
45053+ coord_init_after_item_end(coord);
45054+
45055+ /* get last extent in the item */
45056+ ext = extent_by_coord(coord);
45057+ if (state_of_extent(ext) == HOLE_EXTENT) {
45058+ /*
45059+ * last extent of a file is hole extent. Widen that extent by
45060+ * @hole_width blocks. Note that we do not worry about
45061+ * overflowing - extent width is 64 bits
45062+ */
45063+ reiser4_set_extent(ext, HOLE_EXTENT_START,
45064+ extent_get_width(ext) + hole_width);
45065+ znode_make_dirty(coord->node);
45066+ return 0;
45067+ }
45068+
45069+ /* append last item of the file with hole extent unit */
45070+ assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
45071+ state_of_extent(ext) == UNALLOCATED_EXTENT));
45072+
45073+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
45074+ init_new_extent(&idata, &new_ext, 1);
45075+ return insert_into_item(coord, lh, &append_key, &idata, 0);
45076+}
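
The hole_width expression is the usual round-up-to-blocks idiom: (len + blocksize - 1) >> blocksize_bits computes ceil(len / blocksize) when blocksize is a power of two. A one-line check:

#include <stdio.h>

int main(void)
{
	unsigned long bs_bits = 12, bs = 1UL << bs_bits;	/* 4096 */
	unsigned long gap = 5000;  /* bytes between append key and @key */

	printf("%lu\n", (gap + bs - 1) >> bs_bits);	/* prints 2 */
	return 0;
}
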
45077+
45078+/**
45079+ * check_jnodes
45080+ * @twig: longterm locked twig node
45081+ * @key:
45082+ *
45083+ */
45084+static void check_jnodes(znode *twig, const reiser4_key *key, int count)
45085+{
45086+#if REISER4_DEBUG
45087+ coord_t c;
45088+ reiser4_key node_key, jnode_key;
45089+
45090+ jnode_key = *key;
45091+
45092+ assert("", twig != NULL);
45093+ assert("", znode_get_level(twig) == TWIG_LEVEL);
45094+ assert("", znode_is_write_locked(twig));
45095+
45096+ zload(twig);
45097+ /* get the smallest key in twig node */
45098+ coord_init_first_unit(&c, twig);
45099+ unit_key_by_coord(&c, &node_key);
45100+ assert("", keyle(&node_key, &jnode_key));
45101+
45102+ coord_init_last_unit(&c, twig);
45103+ unit_key_by_coord(&c, &node_key);
45104+ if (item_plugin_by_coord(&c)->s.file.append_key)
45105+ item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
45106+ set_key_offset(&jnode_key,
45107+ get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
45108+ assert("", keylt(&jnode_key, &node_key));
45109+ zrelse(twig);
45110+#endif
45111+}
45112+
45113+/**
45114+ * append_last_extent - append last file item
45115+ * @uf_coord: coord to start insertion from
45116+ * @jnodes: array of jnodes
45117+ * @count: number of jnodes in the array
45118+ *
45119+ * There is already at least one extent item of the file in the tree. Append
45120+ * the last of them with an unallocated extent unit of width @count. Assign
45121+ * fake block numbers to the jnodes corresponding to the inserted extent.
45122+ */
45123+static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45124+ jnode **jnodes, int count)
45125+{
45126+ int result;
45127+ reiser4_extent new_ext;
45128+ reiser4_item_data idata;
45129+ coord_t *coord;
45130+ struct extent_coord_extension *ext_coord;
45131+ reiser4_extent *ext;
45132+ reiser4_block_nr block;
45133+ jnode *node;
45134+ int i;
45135+
45136+ coord = &uf_coord->coord;
45137+ ext_coord = &uf_coord->extension.extent;
45138+ ext = ext_by_ext_coord(uf_coord);
45139+
45140+ /* check correctness of position in the item */
45141+ assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
45142+ assert("vs-1311", coord->between == AFTER_UNIT);
45143+ assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
45144+
45145+ if (!can_append(key, coord)) {
45146+ /* hole extent has to be inserted */
45147+ result = append_hole(coord, uf_coord->lh, key);
45148+ uf_coord->valid = 0;
45149+ return result;
45150+ }
45151+
45152+ if (count == 0)
45153+ return 0;
45154+
45155+ assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
45156+
45157+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
45158+ count);
45159+ BUG_ON(result != 0);
45160+
45161+ switch (state_of_extent(ext)) {
45162+ case UNALLOCATED_EXTENT:
45163+ /*
45164+ * last extent unit of the file is unallocated one. Increase
45165+ * its width by @count
45166+ */
45167+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START,
45168+ extent_get_width(ext) + count);
45169+ znode_make_dirty(coord->node);
45170+
45171+ /* update coord extension */
45172+ ext_coord->width += count;
45173+ ON_DEBUG(extent_set_width
45174+ (&uf_coord->extension.extent.extent,
45175+ ext_coord->width));
45176+ break;
45177+
45178+ case HOLE_EXTENT:
45179+ case ALLOCATED_EXTENT:
45180+ /*
45181+ * last extent unit of the file is either hole or allocated
45182+ * one. Append one unallocated extent of width @count
45183+ */
45184+ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
45185+ init_new_extent(&idata, &new_ext, 1);
45186+ result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
45187+ uf_coord->valid = 0;
45188+ if (result)
45189+ return result;
45190+ break;
45191+
45192+ default:
45193+ return RETERR(-EIO);
45194+ }
45195+
45196+ /*
45197+ * make sure that we hold long term locked twig node containing all
45198+ * jnodes we are about to capture
45199+ */
45200+ check_jnodes(uf_coord->lh->node, key, count);
45201+
45202+ /*
45203+	 * assign fake block numbers to all jnodes. FIXME: check whether the
45204+	 * twig node containing the inserted extent item is locked
45205+ */
45206+ block = fake_blocknr_unformatted(count);
45207+ for (i = 0; i < count; i ++, block ++) {
45208+ node = jnodes[i];
45209+ spin_lock_jnode(node);
45210+ JF_SET(node, JNODE_CREATED);
45211+ jnode_set_block(node, &block);
45212+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45213+ BUG_ON(result != 0);
45214+ jnode_make_dirty_locked(node);
45215+ spin_unlock_jnode(node);
45216+ }
45217+ return count;
45218+}
45219+
45220+/**
45221+ * insert_first_hole - insert hole extent into tree
45222+ * @coord:
45223+ * @lh:
45224+ * @key:
45225+ *
45226+ *
45227+ */
45228+static int insert_first_hole(coord_t *coord, lock_handle *lh,
45229+ const reiser4_key *key)
45230+{
45231+ reiser4_extent new_ext;
45232+ reiser4_item_data idata;
45233+ reiser4_key item_key;
45234+ reiser4_block_nr hole_width;
45235+
45236+ /* @coord must be set for inserting of new item */
45237+ assert("vs-711", coord_is_between_items(coord));
45238+
45239+ item_key = *key;
45240+ set_key_offset(&item_key, 0ull);
45241+
45242+ hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
45243+ current_blocksize_bits);
45244+ assert("vs-710", hole_width > 0);
45245+
45246+ /* compose body of hole extent and insert item into tree */
45247+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
45248+ init_new_extent(&idata, &new_ext, 1);
45249+ return insert_extent_by_coord(coord, &idata, &item_key, lh);
45250+}
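+/*
+ * Example of the width calculation above: with a 4096-byte blocksize, a
+ * first write at file offset 10000 yields
+ * hole_width = (10000 + 4095) >> 12 = 3 blocks, i.e. a hole extent
+ * covering file bytes 0..12287.
+ */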
45251+
45252+
45253+/**
45254+ * insert_first_extent - insert first file item
45255+ * @uf_coord: coord to start insertion from
45256+ * @key: key of write position
45257+ * @jnodes: array of jnodes
45258+ * @count: number of jnodes in the array
45259+ * @inode: inode of file
45260+ *
45261+ * There are no items of file @inode in the tree yet. Insert an unallocated
45262+ * extent of width @count into the tree, or a hole extent if the write does
45263+ * not start at the beginning of the file. Assign fake block numbers to jnodes
45264+ * corresponding to the inserted extent. Returns number of jnodes or error code.
45265+ */
45266+static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45267+ jnode **jnodes, int count,
45268+ struct inode *inode)
45269+{
45270+ int result;
45271+ int i;
45272+ reiser4_extent new_ext;
45273+ reiser4_item_data idata;
45274+ reiser4_block_nr block;
45275+ struct unix_file_info *uf_info;
45276+ jnode *node;
45277+
45278+ /* first extent insertion starts at leaf level */
45279+ assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
45280+ assert("vs-711", coord_is_between_items(&uf_coord->coord));
45281+
45282+ if (get_key_offset(key) != 0) {
45283+ result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
45284+ uf_coord->valid = 0;
45285+ uf_info = unix_file_inode_data(inode);
45286+
45287+ /*
45288+ * first item insertion is only possible when writing to empty
45289+ * file or performing tail conversion
45290+ */
45291+ assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
45292+ (reiser4_inode_get_flag(inode,
45293+ REISER4_PART_MIXED) &&
45294+ reiser4_inode_get_flag(inode,
45295+ REISER4_PART_IN_CONV))));
45296+ /* if file was empty - update its state */
45297+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
45298+ uf_info->container = UF_CONTAINER_EXTENTS;
45299+ return result;
45300+ }
45301+
45302+ if (count == 0)
45303+ return 0;
45304+
45305+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
45306+ BUG_ON(result != 0);
45307+
45308+ /*
45309+ * prepare for tree modification: compose body of item and item data
45310+ * structure needed for insertion
45311+ */
45312+ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
45313+ init_new_extent(&idata, &new_ext, 1);
45314+
45315+ /* insert extent item into the tree */
45316+ result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
45317+ uf_coord->lh);
45318+ if (result)
45319+ return result;
45320+
45321+ /*
45322+ * make sure that we hold long term locked twig node containing all
45323+ * jnodes we are about to capture
45324+ */
45325+ check_jnodes(uf_coord->lh->node, key, count);
45326+ /*
45327+ * assign fake block numbers to all jnodes, capture and mark them dirty
45328+ */
45329+ block = fake_blocknr_unformatted(count);
45330+ for (i = 0; i < count; i ++, block ++) {
45331+ node = jnodes[i];
45332+ spin_lock_jnode(node);
45333+ JF_SET(node, JNODE_CREATED);
45334+ jnode_set_block(node, &block);
45335+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45336+ BUG_ON(result != 0);
45337+ jnode_make_dirty_locked(node);
45338+ spin_unlock_jnode(node);
45339+ }
45340+
45341+ /*
45342+	 * invalidate coordinate, a re-search must be performed to continue
45343+	 * because write will continue on twig level
45344+ */
45345+ uf_coord->valid = 0;
45346+ return count;
45347+}
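+/*
+ * Example: the first write of 5 pages starting at offset 0 of an empty file
+ * inserts a single unit (UNALLOCATED_EXTENT_START, 5) and assigns 5
+ * consecutive fake block numbers to the corresponding jnodes.
+ */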
45348+
45349+/**
45350+ * plug_hole - replace one block of a hole extent with an unallocated extent
45351+ * @uf_coord: coord of the hole extent
45352+ * @key: key of the block to plug
45353+ * @how: debugging aid: records which of the conversion cases below was taken
45354+ *
45355+ * Creates an unallocated extent of width 1 within a hole. In the worst case
45356+ * two additional extent units are created.
45358+ */
45359+static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
45360+{
45361+ struct replace_handle rh;
45362+ reiser4_extent *ext;
45363+ reiser4_block_nr width, pos_in_unit;
45364+ coord_t *coord;
45365+ struct extent_coord_extension *ext_coord;
45366+ int return_inserted_position;
45367+
45368+ check_uf_coord(uf_coord, key);
45369+
45370+ rh.coord = coord_by_uf_coord(uf_coord);
45371+ rh.lh = uf_coord->lh;
45372+ rh.flags = 0;
45373+
45374+ coord = coord_by_uf_coord(uf_coord);
45375+ ext_coord = ext_coord_by_uf_coord(uf_coord);
45376+ ext = ext_by_ext_coord(uf_coord);
45377+
45378+ width = ext_coord->width;
45379+ pos_in_unit = ext_coord->pos_in_unit;
45380+
45381+ *how = 0;
45382+ if (width == 1) {
45383+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1);
45384+ znode_make_dirty(coord->node);
45385+ /* update uf_coord */
45386+ ON_DEBUG(ext_coord->extent = *ext);
45387+ *how = 1;
45388+ return 0;
45389+ } else if (pos_in_unit == 0) {
45390+ /* we deal with first element of extent */
45391+ if (coord->unit_pos) {
45392+ /* there is an extent to the left */
45393+ if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
45394+ /*
45395+ * left neighboring unit is an unallocated
45396+ * extent. Increase its width and decrease
45397+ * width of hole
45398+ */
45399+ extent_set_width(ext - 1,
45400+ extent_get_width(ext - 1) + 1);
45401+ extent_set_width(ext, width - 1);
45402+ znode_make_dirty(coord->node);
45403+
45404+ /* update coord extension */
45405+ coord->unit_pos--;
45406+ ext_coord->width = extent_get_width(ext - 1);
45407+ ext_coord->pos_in_unit = ext_coord->width - 1;
45408+ ext_coord->ext_offset -= sizeof(reiser4_extent);
45409+ ON_DEBUG(ext_coord->extent =
45410+ *extent_by_coord(coord));
45411+ *how = 2;
45412+ return 0;
45413+ }
45414+ }
45415+ /* extent for replace */
45416+ reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
45417+ /* extent to be inserted */
45418+ reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START,
45419+ width - 1);
45420+ rh.nr_new_extents = 1;
45421+
45422+		/* have reiser4_replace_extent return with @coord and
45423+		   @uf_coord->lh set to the unit which was replaced */
45424+ return_inserted_position = 0;
45425+ *how = 3;
45426+ } else if (pos_in_unit == width - 1) {
45427+ /* we deal with last element of extent */
45428+ if (coord->unit_pos < nr_units_extent(coord) - 1) {
45429+ /* there is an extent unit to the right */
45430+ if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
45431+ /*
45432+ * right neighboring unit is an unallocated
45433+ * extent. Increase its width and decrease
45434+ * width of hole
45435+ */
45436+ extent_set_width(ext + 1,
45437+ extent_get_width(ext + 1) + 1);
45438+ extent_set_width(ext, width - 1);
45439+ znode_make_dirty(coord->node);
45440+
45441+ /* update coord extension */
45442+ coord->unit_pos++;
45443+ ext_coord->width = extent_get_width(ext + 1);
45444+ ext_coord->pos_in_unit = 0;
45445+ ext_coord->ext_offset += sizeof(reiser4_extent);
45446+ ON_DEBUG(ext_coord->extent =
45447+ *extent_by_coord(coord));
45448+ *how = 4;
45449+ return 0;
45450+ }
45451+ }
45452+ /* extent for replace */
45453+ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
45454+ /* extent to be inserted */
45455+ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
45456+ 1);
45457+ rh.nr_new_extents = 1;
45458+
45459+		/* have reiser4_replace_extent return with @coord and
45460+		   @uf_coord->lh set to the unit which was inserted */
45461+ return_inserted_position = 1;
45462+ *how = 5;
45463+ } else {
45464+ /* extent for replace */
45465+ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START,
45466+ pos_in_unit);
45467+ /* extents to be inserted */
45468+ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
45469+ 1);
45470+ reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
45471+ width - pos_in_unit - 1);
45472+ rh.nr_new_extents = 2;
45473+
45474+		/* have reiser4_replace_extent return with @coord and
45475+		   @uf_coord->lh set to the first of the units which were inserted */
45476+ return_inserted_position = 1;
45477+ *how = 6;
45478+ }
45479+ unit_key_by_coord(coord, &rh.paste_key);
45480+ set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
45481+ extent_get_width(&rh.overwrite) * current_blocksize);
45482+
45483+ uf_coord->valid = 0;
45484+ return reiser4_replace_extent(&rh, return_inserted_position);
45485+}
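+/*
+ * Summary of the @how values set above:
+ *   1 - hole of width 1 converted in place;
+ *   2 - first block of the hole glued to an unallocated unit on the left;
+ *   3 - hole split into [unallocated 1][hole width-1];
+ *   4 - last block of the hole glued to an unallocated unit on the right;
+ *   5 - hole split into [hole width-1][unallocated 1];
+ *   6 - hole split into [hole pos][unallocated 1][hole width-pos-1].
+ */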
45486+
45487+/**
45488+ * overwrite_one_block - assign block number to one jnode
45489+ * @uf_coord: coord of the extent unit
45490+ * @key: key of the block
45491+ * @node: jnode to assign a block number to
45492+ * @hole_plugged: set to 1 if a hole extent was converted
45493+ *
45494+ * If @node corresponds to a hole extent, create an unallocated extent for it
45495+ * and assign a fake block number. If @node corresponds to an allocated
45496+ * extent, assign @node the block number from the extent.
45496+ */
45497+static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
45498+ jnode *node, int *hole_plugged)
45499+{
45500+ int result;
45501+ struct extent_coord_extension *ext_coord;
45502+ reiser4_extent *ext;
45503+ reiser4_block_nr block;
45504+ int how;
45505+
45506+ assert("vs-1312", uf_coord->coord.between == AT_UNIT);
45507+
45508+ result = 0;
45509+ ext_coord = ext_coord_by_uf_coord(uf_coord);
45510+ ext = ext_by_ext_coord(uf_coord);
45511+ assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
45512+
45513+ switch (state_of_extent(ext)) {
45514+ case ALLOCATED_EXTENT:
45515+ block = extent_get_start(ext) + ext_coord->pos_in_unit;
45516+ break;
45517+
45518+ case HOLE_EXTENT:
45519+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1);
45520+ BUG_ON(result != 0);
45521+ result = plug_hole(uf_coord, key, &how);
45522+ if (result)
45523+ return result;
45524+ block = fake_blocknr_unformatted(1);
45525+ if (hole_plugged)
45526+ *hole_plugged = 1;
45527+ JF_SET(node, JNODE_CREATED);
45528+ break;
45529+
45530+ default:
45531+ return RETERR(-EIO);
45532+ }
45533+
45534+ jnode_set_block(node, &block);
45535+ return 0;
45536+}
45537+
45538+/**
45539+ * move_coord - move coordinate forward
45540+ * @uf_coord:
45541+ *
45542+ * Move coordinate one data block pointer forward. Return 1 if the coord was
45543+ * already at the last position or is invalid, 0 otherwise.
45544+ */
45545+static int move_coord(uf_coord_t *uf_coord)
45546+{
45547+ struct extent_coord_extension *ext_coord;
45548+
45549+ if (uf_coord->valid == 0)
45550+ return 1;
45551+ ext_coord = &uf_coord->extension.extent;
45552+ ext_coord->pos_in_unit ++;
45553+ if (ext_coord->pos_in_unit < ext_coord->width)
45554+ /* coordinate moved within the unit */
45555+ return 0;
45556+
45557+ /* end of unit is reached. Try to move to next unit */
45558+ ext_coord->pos_in_unit = 0;
45559+ uf_coord->coord.unit_pos ++;
45560+ if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
45561+ /* coordinate moved to next unit */
45562+ ext_coord->ext_offset += sizeof(reiser4_extent);
45563+ ext_coord->width =
45564+ extent_get_width(ext_by_offset
45565+ (uf_coord->coord.node,
45566+ ext_coord->ext_offset));
45567+ ON_DEBUG(ext_coord->extent =
45568+ *ext_by_offset(uf_coord->coord.node,
45569+ ext_coord->ext_offset));
45570+ return 0;
45571+ }
45572+ /* end of item is reached */
45573+ uf_coord->valid = 0;
45574+ return 1;
45575+}
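+/*
+ * Example: in an item with units of widths [2, 3], a coord at unit 0,
+ * pos_in_unit 1 moves to unit 1, pos_in_unit 0; from unit 1, pos_in_unit 2
+ * the end of the item is reached, the coord is invalidated and 1 is
+ * returned.
+ */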
45576+
45577+/**
45578+ * overwrite_extent - overwrite the extent units covered by a write
45579+ * @uf_coord: coord of the first extent unit
45580+ * @key: key of the first block
45581+ * @jnodes: array of jnodes
45582+ * @count: number of jnodes in the array
45583+ * @plugged_hole: set to 1 if a hole extent was converted
45584+ *
45585+ * Returns number of handled jnodes or an error code.
45582+ */
45583+static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45584+ jnode **jnodes, int count, int *plugged_hole)
45585+{
45586+ int result;
45587+ reiser4_key k;
45588+ int i;
45589+ jnode *node;
45590+
45591+ k = *key;
45592+ for (i = 0; i < count; i ++) {
45593+ node = jnodes[i];
45594+ if (*jnode_get_block(node) == 0) {
45595+ result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
45596+ if (result)
45597+ return result;
45598+ }
45599+ /*
45600+ * make sure that we hold long term locked twig node containing
45601+ * all jnodes we are about to capture
45602+ */
45603+ check_jnodes(uf_coord->lh->node, &k, 1);
45604+		/*
45605+		 * capture the jnode and mark it dirty; its block number
45606+		 * was assigned either earlier or just above
45607+		 */
45608+ spin_lock_jnode(node);
45609+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45610+ BUG_ON(result != 0);
45611+ jnode_make_dirty_locked(node);
45612+ spin_unlock_jnode(node);
45613+
45614+ if (uf_coord->valid == 0)
45615+ return i + 1;
45616+
45617+ check_uf_coord(uf_coord, &k);
45618+
45619+ if (move_coord(uf_coord)) {
45620+ /*
45621+			 * failed to move to the next node pointer. Either end
45622+			 * of file or end of twig node is reached. In the latter
45623+			 * case we might go to the right neighbor.
45624+ */
45625+ uf_coord->valid = 0;
45626+ return i + 1;
45627+ }
45628+ set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
45629+ }
45630+
45631+ return count;
45632+}
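+/*
+ * Note: overwrite_extent returns after handling i + 1 jnodes as soon as the
+ * coord becomes invalid; the caller (update_extents below) then advances
+ * @jnodes and the key by the returned count and re-searches the tree.
+ */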
45633+
45634+/**
45635+ * reiser4_update_extent - update extent tree for one jnode
45636+ * @inode: inode of file
45637+ * @node: jnode to update the tree for
45638+ * @pos: offset in file
45639+ * @plugged_hole: set to 1 if a hole extent was converted
45640+ *
45641+ */
45642+int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos,
45643+ int *plugged_hole)
45644+{
45645+ int result;
45646+ znode *loaded;
45647+ uf_coord_t uf_coord;
45648+ coord_t *coord;
45649+ lock_handle lh;
45650+ reiser4_key key;
45651+
45652+ assert("", reiser4_lock_counters()->d_refs == 0);
45653+
45654+ key_by_inode_and_offset_common(inode, pos, &key);
45655+
45656+ init_uf_coord(&uf_coord, &lh);
45657+ coord = &uf_coord.coord;
45658+ result = find_file_item_nohint(coord, &lh, &key,
45659+ ZNODE_WRITE_LOCK, inode);
45660+ if (IS_CBKERR(result)) {
45661+ assert("", reiser4_lock_counters()->d_refs == 0);
45662+ return result;
45663+ }
45664+
45665+ result = zload(coord->node);
45666+ BUG_ON(result != 0);
45667+ loaded = coord->node;
45668+
45669+ if (coord->between == AFTER_UNIT) {
45670+ /*
45671+ * append existing extent item with unallocated extent of width
45672+ * nr_jnodes
45673+ */
45674+ init_coord_extension_extent(&uf_coord,
45675+ get_key_offset(&key));
45676+ result = append_last_extent(&uf_coord, &key,
45677+ &node, 1);
45678+ } else if (coord->between == AT_UNIT) {
45679+ /*
45680+ * overwrite
45681+		 * not optimal yet. Will be optimized if the new write
45682+		 * shows a performance win.
45683+ */
45684+ init_coord_extension_extent(&uf_coord,
45685+ get_key_offset(&key));
45686+ result = overwrite_extent(&uf_coord, &key,
45687+ &node, 1, plugged_hole);
45688+ } else {
45689+ /*
45690+ * there are no items of this file in the tree yet. Create
45691+ * first item of the file inserting one unallocated extent of
45692+ * width nr_jnodes
45693+ */
45694+ result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
45695+ }
45696+ assert("", result == 1 || result < 0);
45697+ zrelse(loaded);
45698+ done_lh(&lh);
45699+ assert("", reiser4_lock_counters()->d_refs == 0);
45700+ return (result == 1) ? 0 : result;
45701+}
45702+
45703+/**
45704+ * update_extents - update extent tree for written jnodes
45705+ * @file: file written to
45706+ * @jnodes: array of jnodes
45707+ * @count: number of jnodes in the array
45708+ * @pos: offset the write started at
45709+ *
45710+ */
45711+static int update_extents(struct file *file, jnode **jnodes, int count, loff_t pos)
45712+{
45713+ struct inode *inode;
45714+ struct hint hint;
45715+ reiser4_key key;
45716+ int result;
45717+ znode *loaded;
45718+
45719+ result = load_file_hint(file, &hint);
45720+ BUG_ON(result != 0);
45721+
45722+ inode = file->f_dentry->d_inode;
45723+ if (count != 0)
45724+ /*
45725+ * count == 0 is special case: expanding truncate
45726+ */
45727+ pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
45728+ key_by_inode_and_offset_common(inode, pos, &key);
45729+
45730+ assert("", reiser4_lock_counters()->d_refs == 0);
45731+
45732+ do {
45733+ result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
45734+ if (IS_CBKERR(result)) {
45735+ assert("", reiser4_lock_counters()->d_refs == 0);
45736+ return result;
45737+ }
45738+
45739+ result = zload(hint.ext_coord.coord.node);
45740+ BUG_ON(result != 0);
45741+ loaded = hint.ext_coord.coord.node;
45742+
45743+ if (hint.ext_coord.coord.between == AFTER_UNIT) {
45744+ /*
45745+ * append existing extent item with unallocated extent
45746+ * of width nr_jnodes
45747+ */
45748+ if (hint.ext_coord.valid == 0)
45749+ /* NOTE: get statistics on this */
45750+ init_coord_extension_extent(&hint.ext_coord,
45751+ get_key_offset(&key));
45752+ result = append_last_extent(&hint.ext_coord, &key,
45753+ jnodes, count);
45754+ } else if (hint.ext_coord.coord.between == AT_UNIT) {
45755+ /*
45756+ * overwrite
45757+			 * not optimal yet. Will be optimized if the new
45758+			 * write shows a performance win.
45759+ */
45760+ if (hint.ext_coord.valid == 0)
45761+ /* NOTE: get statistics on this */
45762+ init_coord_extension_extent(&hint.ext_coord,
45763+ get_key_offset(&key));
45764+ result = overwrite_extent(&hint.ext_coord, &key,
45765+ jnodes, count, NULL);
45766+ } else {
45767+ /*
45768+			 * there are no items of this file in the tree
45769+			 * yet. Create first item of the file inserting one
45770+			 * unallocated extent of width nr_jnodes
45771+ */
45772+ result = insert_first_extent(&hint.ext_coord, &key,
45773+ jnodes, count, inode);
45774+ }
45775+ zrelse(loaded);
45776+ if (result < 0) {
45777+ done_lh(hint.ext_coord.lh);
45778+ break;
45779+ }
45780+
45781+ jnodes += result;
45782+ count -= result;
45783+ set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
45784+
45785+ /* seal and unlock znode */
45786+ if (hint.ext_coord.valid)
45787+ reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK);
45788+ else
45789+ reiser4_unset_hint(&hint);
45790+
45791+ } while (count > 0);
45792+
45793+ save_file_hint(file, &hint);
45794+ assert("", reiser4_lock_counters()->d_refs == 0);
45795+ return result;
45796+}
45797+
45798+/**
45799+ * write_extent_reserve_space - reserve space for extent write operation
45800+ * @inode: inode of file to write to
45801+ *
45802+ * Estimates and reserves space which may be required for writing
45803+ * WRITE_GRANULARITY pages of file.
45804+ */
45805+static int write_extent_reserve_space(struct inode *inode)
45806+{
45807+ __u64 count;
45808+ reiser4_tree *tree;
45809+
45810+	/*
45811+	 * to write WRITE_GRANULARITY pages to a file by extents we have to
45812+	 * reserve disk space for:
45813+	 *
45814+	 * 1. find_file_item may have to insert an empty node into the tree
45815+	 * (an empty leaf node between two extent items). This requires 1
45816+	 * block plus the number of blocks necessary to insert an internal
45817+	 * item into the twig level.
45818+	 *
45819+	 * 2. each written page may need 1 block plus the number of blocks
45820+	 * necessary to insert into or paste to an extent item.
45821+	 *
45822+	 * 3. stat data update
45823+	 */
45825+ tree = reiser4_tree_by_inode(inode);
45826+ count = estimate_one_insert_item(tree) +
45827+ WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
45828+ estimate_one_insert_item(tree);
45829+ grab_space_enable();
45830+ return reiser4_grab_space(count, 0 /* flags */);
45831+}
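+/*
+ * Rough sketch of the estimate above, assuming WRITE_GRANULARITY is 32
+ * pages and writing I for estimate_one_insert_item() and P for
+ * estimate_one_insert_into_item():
+ *
+ *	count = I + 32 * (1 + P) + I
+ *
+ * i.e. one possible leaf insertion, one block plus one possible paste per
+ * page, and one stat data update.
+ */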
45832+
45833+/*
45834+ * filemap_copy_from_user no longer exists in generic code, because it
45835+ * is deadlocky (copying from user while holding the page lock is bad).
45836+ * As a temporary fix for reiser4, just define it here.
45837+ */
45838+static inline size_t
45839+filemap_copy_from_user(struct page *page, unsigned long offset,
45840+ const char __user *buf, unsigned bytes)
45841+{
45842+ char *kaddr;
45843+ int left;
45844+
45845+ kaddr = kmap_atomic(page, KM_USER0);
45846+ left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
45847+ kunmap_atomic(kaddr, KM_USER0);
45848+
45849+ if (left != 0) {
45850+ /* Do it the slow way */
45851+ kaddr = kmap(page);
45852+ left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
45853+ kunmap(page);
45854+ }
45855+ return bytes - left;
45856+}
45857+
45858+/**
45859+ * reiser4_write_extent - write method of extent item plugin
45860+ * @file: file to write to
45861+ * @buf: address of user-space buffer
45862+ * @count: number of bytes to write
45863+ * @pos: position in file to write to
45864+ *
45865+ */
45866+ssize_t reiser4_write_extent(struct file *file, const char __user *buf,
45867+ size_t count, loff_t *pos)
45868+{
45869+ int have_to_update_extent;
45870+ int nr_pages, nr_dirty;
45871+ struct page *page;
45872+ jnode *jnodes[WRITE_GRANULARITY + 1];
45873+ struct inode *inode;
45874+ unsigned long index;
45875+ unsigned long end;
45876+ int i;
45877+ int to_page, page_off;
45878+ size_t left, written;
45879+ int result = 0;
45880+
45881+ inode = file->f_dentry->d_inode;
45882+ if (write_extent_reserve_space(inode))
45883+ return RETERR(-ENOSPC);
45884+
45885+ if (count == 0) {
45886+ /* truncate case */
45887+ update_extents(file, jnodes, 0, *pos);
45888+ return 0;
45889+ }
45890+
45891+ BUG_ON(get_current_context()->trans->atom != NULL);
45892+
45893+ left = count;
45894+ index = *pos >> PAGE_CACHE_SHIFT;
45895+ /* calculate number of pages which are to be written */
45896+ end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
45897+ nr_pages = end - index + 1;
45898+ nr_dirty = 0;
45899+ assert("", nr_pages <= WRITE_GRANULARITY + 1);
45900+
45901+ /* get pages and jnodes */
45902+ for (i = 0; i < nr_pages; i ++) {
45903+ page = find_or_create_page(inode->i_mapping, index + i,
45904+ reiser4_ctx_gfp_mask_get());
45905+ if (page == NULL) {
45906+ nr_pages = i;
45907+ result = RETERR(-ENOMEM);
45908+ goto out;
45909+ }
45910+
45911+ jnodes[i] = jnode_of_page(page);
45912+ if (IS_ERR(jnodes[i])) {
45913+ unlock_page(page);
45914+ page_cache_release(page);
45915+ nr_pages = i;
45916+ result = RETERR(-ENOMEM);
45917+ goto out;
45918+ }
45919+ /* prevent jnode and page from disconnecting */
45920+ JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
45921+ unlock_page(page);
45922+ }
45923+
45924+ BUG_ON(get_current_context()->trans->atom != NULL);
45925+
45926+ have_to_update_extent = 0;
45927+
45928+ page_off = (*pos & (PAGE_CACHE_SIZE - 1));
45929+ for (i = 0; i < nr_pages; i ++) {
45930+ to_page = PAGE_CACHE_SIZE - page_off;
45931+ if (to_page > left)
45932+ to_page = left;
45933+ page = jnode_page(jnodes[i]);
45934+ if (page_offset(page) < inode->i_size &&
45935+ !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
45936+ /*
45937+			 * the above is not optimal for a partial write to the
45938+			 * last page of the file when the file size is not at a
45939+			 * page boundary
45940+ */
45941+ lock_page(page);
45942+ if (!PageUptodate(page)) {
45943+ result = readpage_unix_file(NULL, page);
45944+ BUG_ON(result != 0);
45945+ /* wait for read completion */
45946+ lock_page(page);
45947+ BUG_ON(!PageUptodate(page));
45948+ } else
45949+ result = 0;
45950+ unlock_page(page);
45951+ }
45952+
45953+ BUG_ON(get_current_context()->trans->atom != NULL);
45954+ fault_in_pages_readable(buf, to_page);
45955+ BUG_ON(get_current_context()->trans->atom != NULL);
45956+
45957+ lock_page(page);
45958+ if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE)
45959+ simple_prepare_write(file, page, page_off,
45960+ page_off + to_page);
45961+
45962+ written = filemap_copy_from_user(page, page_off, buf, to_page);
45963+ if (unlikely(written != to_page)) {
45964+ unlock_page(page);
45965+ result = RETERR(-EFAULT);
45966+ break;
45967+ }
45968+
45969+ flush_dcache_page(page);
45970+ reiser4_set_page_dirty_internal(page);
45971+ unlock_page(page);
45972+ nr_dirty++;
45973+
45974+ mark_page_accessed(page);
45975+ SetPageUptodate(page);
45976+
45977+ if (jnodes[i]->blocknr == 0)
45978+ have_to_update_extent ++;
45979+
45980+ page_off = 0;
45981+ buf += to_page;
45982+ left -= to_page;
45983+ BUG_ON(get_current_context()->trans->atom != NULL);
45984+ }
45985+
45986+ if (have_to_update_extent) {
45987+ update_extents(file, jnodes, nr_dirty, *pos);
45988+ } else {
45989+ for (i = 0; i < nr_dirty; i ++) {
45990+ int ret;
45991+ spin_lock_jnode(jnodes[i]);
45992+ ret = reiser4_try_capture(jnodes[i],
45993+ ZNODE_WRITE_LOCK, 0);
45994+ BUG_ON(ret != 0);
45995+ jnode_make_dirty_locked(jnodes[i]);
45996+ spin_unlock_jnode(jnodes[i]);
45997+ }
45998+ }
45999+out:
46000+ for (i = 0; i < nr_pages; i ++) {
46001+ page_cache_release(jnode_page(jnodes[i]));
46002+ JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
46003+ jput(jnodes[i]);
46004+ }
46005+
46006+	/* the only errors handled so far are ENOMEM and
46007+	   EFAULT on copy_from_user */
46008+
46009+ return (count - left) ? (count - left) : result;
46010+}
46011+
46012+int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
46013+ struct page *page)
46014+{
46015+ jnode *j;
46016+ struct address_space *mapping;
46017+ unsigned long index;
46018+ oid_t oid;
46019+ reiser4_block_nr block;
46020+
46021+ mapping = page->mapping;
46022+ oid = get_inode_oid(mapping->host);
46023+ index = page->index;
46024+
46025+ switch (state_of_extent(ext)) {
46026+ case HOLE_EXTENT:
46027+ /*
46028+		 * it is possible for a hole page to have a jnode, if the
46029+		 * page was eflushed previously.
46030+ */
46031+ j = jfind(mapping, index);
46032+ if (j == NULL) {
46033+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
46034+ SetPageUptodate(page);
46035+ unlock_page(page);
46036+ return 0;
46037+ }
46038+ spin_lock_jnode(j);
46039+ if (!jnode_page(j)) {
46040+ jnode_attach_page(j, page);
46041+ } else {
46042+ BUG_ON(jnode_page(j) != page);
46043+ assert("vs-1504", jnode_page(j) == page);
46044+ }
46045+ block = *jnode_get_io_block(j);
46046+ spin_unlock_jnode(j);
46047+ if (block == 0) {
46048+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
46049+ SetPageUptodate(page);
46050+ unlock_page(page);
46051+ jput(j);
46052+ return 0;
46053+ }
46054+ break;
46055+
46056+ case ALLOCATED_EXTENT:
46057+ j = jnode_of_page(page);
46058+ if (IS_ERR(j))
46059+ return PTR_ERR(j);
46060+ if (*jnode_get_block(j) == 0) {
46061+ reiser4_block_nr blocknr;
46062+
46063+ blocknr = extent_get_start(ext) + pos;
46064+ jnode_set_block(j, &blocknr);
46065+ } else
46066+ assert("vs-1403",
46067+ j->blocknr == extent_get_start(ext) + pos);
46068+ break;
46069+
46070+ case UNALLOCATED_EXTENT:
46071+ j = jfind(mapping, index);
46072+ assert("nikita-2688", j);
46073+ assert("vs-1426", jnode_page(j) == NULL);
46074+
46075+ spin_lock_jnode(j);
46076+ jnode_attach_page(j, page);
46077+ spin_unlock_jnode(j);
46078+ break;
46079+
46080+ default:
46081+ warning("vs-957", "wrong extent\n");
46082+ return RETERR(-EIO);
46083+ }
46084+
46085+ BUG_ON(j == 0);
46086+ reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get());
46087+ jput(j);
46088+ return 0;
46089+}
46090+
46091+/* Implements plugin->u.item.s.file.read operation for extent items. */
46092+int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint)
46093+{
46094+ int result;
46095+ struct page *page;
46096+ unsigned long cur_page, next_page;
46097+ unsigned long page_off, count;
46098+ struct address_space *mapping;
46099+ loff_t file_off;
46100+ uf_coord_t *uf_coord;
46101+ coord_t *coord;
46102+ struct extent_coord_extension *ext_coord;
46103+ unsigned long nr_pages;
46104+ char *kaddr;
46105+
46106+ assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
46107+ assert("vs-572", flow->user == 1);
46108+ assert("vs-1351", flow->length > 0);
46109+
46110+ uf_coord = &hint->ext_coord;
46111+
46112+ check_uf_coord(uf_coord, NULL);
46113+ assert("vs-33", uf_coord->lh == &hint->lh);
46114+
46115+ coord = &uf_coord->coord;
46116+ assert("vs-1119", znode_is_rlocked(coord->node));
46117+ assert("vs-1120", znode_is_loaded(coord->node));
46118+ assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
46119+
46120+ mapping = file->f_dentry->d_inode->i_mapping;
46121+ ext_coord = &uf_coord->extension.extent;
46122+
46123+ /* offset in a file to start read from */
46124+ file_off = get_key_offset(&flow->key);
46125+ /* offset within the page to start read from */
46126+ page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
46127+ /* bytes which can be read from the page which contains file_off */
46128+ count = PAGE_CACHE_SIZE - page_off;
46129+
46130+	/* index of the page containing the offset the read starts from */
46131+ cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
46132+ next_page = cur_page;
46133+ /* number of pages flow spans over */
46134+ nr_pages =
46135+ ((file_off + flow->length + PAGE_CACHE_SIZE -
46136+ 1) >> PAGE_CACHE_SHIFT) - cur_page;
46137+
46138+	/* we start with the twig node read locked. However, we do not want to
46139+	   keep that lock all the time readahead works. So, set a seal and
46140+	   release the twig node. */
46141+ reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK);
46142+ /* &hint->lh is done-ed */
46143+
46144+ do {
46145+ reiser4_txn_restart_current();
46146+ page = read_mapping_page(mapping, cur_page, file);
46147+ if (IS_ERR(page))
46148+ return PTR_ERR(page);
46149+ lock_page(page);
46150+ if (!PageUptodate(page)) {
46151+ unlock_page(page);
46152+ page_cache_release(page);
46153+ warning("jmacd-97178", "extent_read: page is not up to date");
46154+ return RETERR(-EIO);
46155+ }
46156+ mark_page_accessed(page);
46157+ unlock_page(page);
46158+
46159+ /* If users can be writing to this page using arbitrary virtual
46160+ addresses, take care about potential aliasing before reading
46161+ the page on the kernel side.
46162+ */
46163+ if (mapping_writably_mapped(mapping))
46164+ flush_dcache_page(page);
46165+
46166+ assert("nikita-3034", reiser4_schedulable());
46167+
46168+ /* number of bytes which are to be read from the page */
46169+ if (count > flow->length)
46170+ count = flow->length;
46171+
46172+ result = fault_in_pages_writeable(flow->data, count);
46173+ if (result) {
46174+ page_cache_release(page);
46175+ return RETERR(-EFAULT);
46176+ }
46177+
46178+ kaddr = kmap_atomic(page, KM_USER0);
46179+ result = __copy_to_user_inatomic(flow->data,
46180+ kaddr + page_off, count);
46181+ kunmap_atomic(kaddr, KM_USER0);
46182+ if (result != 0) {
46183+ kaddr = kmap(page);
46184+ result = __copy_to_user(flow->data, kaddr + page_off, count);
46185+ kunmap(page);
46186+ if (unlikely(result))
46187+ return RETERR(-EFAULT);
46188+ }
46189+
46190+ page_cache_release(page);
46191+
46192+ /* increase key (flow->key), update user area pointer (flow->data) */
46193+ move_flow_forward(flow, count);
46194+
46195+ page_off = 0;
46196+ cur_page ++;
46197+ count = PAGE_CACHE_SIZE;
46198+ nr_pages--;
46199+ } while (flow->length);
46200+
46201+ return 0;
46202+}
46203+
46204+/*
46205+ plugin->s.file.readpage
46206+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
46207+ or
46208+ filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_extent
46209+
46210+ At the beginning: coord->node is read locked, zloaded, page is locked,
46211+ coord is set to an existing unit inside the extent item (it is not necessary that coord matches page->index)
46212+*/
46213+int reiser4_readpage_extent(void *vp, struct page *page)
46214+{
46215+ uf_coord_t *uf_coord = vp;
46216+ ON_DEBUG(coord_t * coord = &uf_coord->coord);
46217+ ON_DEBUG(reiser4_key key);
46218+
46219+ assert("vs-1040", PageLocked(page));
46220+ assert("vs-1050", !PageUptodate(page));
46221+ assert("vs-1039", page->mapping && page->mapping->host);
46222+
46223+ assert("vs-1044", znode_is_loaded(coord->node));
46224+ assert("vs-758", item_is_extent(coord));
46225+ assert("vs-1046", coord_is_existing_unit(coord));
46226+ assert("vs-1045", znode_is_rlocked(coord->node));
46227+ assert("vs-1047",
46228+ page->mapping->host->i_ino ==
46229+ get_key_objectid(item_key_by_coord(coord, &key)));
46230+ check_uf_coord(uf_coord, NULL);
46231+
46232+ return reiser4_do_readpage_extent(
46233+ ext_by_ext_coord(uf_coord),
46234+ uf_coord->extension.extent.pos_in_unit, page);
46235+}
46236+
46237+/**
46238+ * get_block_address_extent - map a file block to a device block
46239+ * @coord: coord of extent unit
46240+ * @block: file-relative block number to map
46241+ * @result: where to store the resulting block number (0 if not allocated)
46242+ *
46243+ *
46244+ */
46245+int get_block_address_extent(const coord_t *coord, sector_t block,
46246+ sector_t *result)
46247+{
46248+ reiser4_extent *ext;
46249+
46250+ if (!coord_is_existing_unit(coord))
46251+ return RETERR(-EINVAL);
46252+
46253+ ext = extent_by_coord(coord);
46254+
46255+ if (state_of_extent(ext) != ALLOCATED_EXTENT)
46256+ /* FIXME: bad things may happen if it is unallocated extent */
46257+ *result = 0;
46258+ else {
46259+ reiser4_key key;
46260+
46261+ unit_key_by_coord(coord, &key);
46262+ assert("vs-1645",
46263+ block >= get_key_offset(&key) >> current_blocksize_bits);
46264+ assert("vs-1646",
46265+ block <
46266+ (get_key_offset(&key) >> current_blocksize_bits) +
46267+ extent_get_width(ext));
46268+ *result =
46269+ extent_get_start(ext) + (block -
46270+ (get_key_offset(&key) >>
46271+ current_blocksize_bits));
46272+ }
46273+ return 0;
46274+}
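+/*
+ * Example: for an allocated unit starting at device block 1000 whose key
+ * offset corresponds to file block 50, a request for file block 53 stores
+ * 1000 + (53 - 50) = 1003 in @result.
+ */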
46275+
46276+/*
46277+ plugin->u.item.s.file.append_key
46278+  key of the first byte past the last byte addressed by this extent
46279+*/
46280+reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
46281+{
46282+ item_key_by_coord(coord, key);
46283+ set_key_offset(key,
46284+ get_key_offset(key) + reiser4_extent_size(coord,
46285+ nr_units_extent
46286+ (coord)));
46287+
46288+ assert("vs-610", get_key_offset(key)
46289+ && (get_key_offset(key) & (current_blocksize - 1)) == 0);
46290+ return key;
46291+}
46292+
46293+/* plugin->u.item.s.file.init_coord_extension */
46294+void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
46295+{
46296+ coord_t *coord;
46297+ struct extent_coord_extension *ext_coord;
46298+ reiser4_key key;
46299+ loff_t offset;
46300+
46301+ assert("vs-1295", uf_coord->valid == 0);
46302+
46303+ coord = &uf_coord->coord;
46304+ assert("vs-1288", coord_is_iplug_set(coord));
46305+ assert("vs-1327", znode_is_loaded(coord->node));
46306+
46307+ if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
46308+ return;
46309+
46310+ ext_coord = &uf_coord->extension.extent;
46311+ ext_coord->nr_units = nr_units_extent(coord);
46312+ ext_coord->ext_offset =
46313+ (char *)extent_by_coord(coord) - zdata(coord->node);
46314+ ext_coord->width = extent_get_width(extent_by_coord(coord));
46315+ ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
46316+ uf_coord->valid = 1;
46317+
46318+ /* pos_in_unit is the only uninitialized field in extended coord */
46319+ if (coord->between == AFTER_UNIT) {
46320+ assert("vs-1330",
46321+ coord->unit_pos == nr_units_extent(coord) - 1);
46322+
46323+ ext_coord->pos_in_unit = ext_coord->width - 1;
46324+ } else {
46325+ /* AT_UNIT */
46326+ unit_key_by_coord(coord, &key);
46327+ offset = get_key_offset(&key);
46328+
46329+ assert("vs-1328", offset <= lookuped);
46330+ assert("vs-1329",
46331+ lookuped <
46332+ offset + ext_coord->width * current_blocksize);
46333+ ext_coord->pos_in_unit =
46334+ ((lookuped - offset) >> current_blocksize_bits);
46335+ }
46336+}
46337+
46338+/*
46339+ * Local variables:
46340+ * c-indentation-style: "K&R"
46341+ * mode-name: "LC"
46342+ * c-basic-offset: 8
46343+ * tab-width: 8
46344+ * fill-column: 79
46345+ * scroll-step: 1
46346+ * End:
46347+ */
46348diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.23/fs/reiser4/plugin/item/extent_flush_ops.c
46349--- linux-2.6.23.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 03:00:00.000000000 +0300
46350+++ linux-2.6.23/fs/reiser4/plugin/item/extent_flush_ops.c 2007-12-04 16:49:30.000000000 +0300
46351@@ -0,0 +1,1028 @@
46352+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46353+
46354+#include "item.h"
46355+#include "../../tree.h"
46356+#include "../../jnode.h"
46357+#include "../../super.h"
46358+#include "../../flush.h"
46359+#include "../../carry.h"
46360+#include "../object.h"
46361+
46362+#include <linux/pagemap.h>
46363+
46364+static reiser4_block_nr extent_unit_start(const coord_t * item);
46365+
46366+/* Return either first or last extent (depending on @side) of the item
46367+ @coord is set to. Set @pos_in_unit either to first or to last block
46368+ of extent. */
46369+static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
46370+ reiser4_block_nr * pos_in_unit)
46371+{
46372+ reiser4_extent *ext;
46373+
46374+ if (side == LEFT_SIDE) {
46375+ /* get first extent of item */
46376+ ext = extent_item(coord);
46377+ *pos_in_unit = 0;
46378+ } else {
46379+ /* get last extent of item and last position within it */
46380+ assert("vs-363", side == RIGHT_SIDE);
46381+ ext = extent_item(coord) + coord_last_unit_pos(coord);
46382+ *pos_in_unit = extent_get_width(ext) - 1;
46383+ }
46384+
46385+ return ext;
46386+}
46387+
46388+/* item_plugin->f.utmost_child */
46389+/* Return the child. Coord is set to extent item. Find jnode corresponding
46390+   either to the first or to the last unformatted node pointed to by the item */
46391+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
46392+{
46393+ reiser4_extent *ext;
46394+ reiser4_block_nr pos_in_unit;
46395+
46396+ ext = extent_utmost_ext(coord, side, &pos_in_unit);
46397+
46398+ switch (state_of_extent(ext)) {
46399+ case HOLE_EXTENT:
46400+ *childp = NULL;
46401+ return 0;
46402+ case ALLOCATED_EXTENT:
46403+ case UNALLOCATED_EXTENT:
46404+ break;
46405+ default:
46406+ /* this should never happen */
46407+ assert("vs-1417", 0);
46408+ }
46409+
46410+ {
46411+ reiser4_key key;
46412+ reiser4_tree *tree;
46413+ unsigned long index;
46414+
46415+ if (side == LEFT_SIDE) {
46416+ /* get key of first byte addressed by the extent */
46417+ item_key_by_coord(coord, &key);
46418+ } else {
46419+			/* get key of the byte next after the last byte addressed by the extent */
46420+ append_key_extent(coord, &key);
46421+ }
46422+
46423+ assert("vs-544",
46424+ (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
46425+ /* index of first or last (depending on @side) page addressed
46426+ by the extent */
46427+ index =
46428+ (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
46429+ if (side == RIGHT_SIDE)
46430+ index--;
46431+
46432+ tree = coord->node->zjnode.tree;
46433+ *childp = jlookup(tree, get_key_objectid(&key), index);
46434+ }
46435+
46436+ return 0;
46437+}
46438+
46439+/* item_plugin->f.utmost_child_real_block */
46440+/* Return the child's block, if allocated. */
46441+int
46442+utmost_child_real_block_extent(const coord_t * coord, sideof side,
46443+ reiser4_block_nr * block)
46444+{
46445+ reiser4_extent *ext;
46446+
46447+ ext = extent_by_coord(coord);
46448+
46449+ switch (state_of_extent(ext)) {
46450+ case ALLOCATED_EXTENT:
46451+ *block = extent_get_start(ext);
46452+ if (side == RIGHT_SIDE)
46453+ *block += extent_get_width(ext) - 1;
46454+ break;
46455+ case HOLE_EXTENT:
46456+ case UNALLOCATED_EXTENT:
46457+ *block = 0;
46458+ break;
46459+ default:
46460+ /* this should never happen */
46461+ assert("vs-1418", 0);
46462+ }
46463+
46464+ return 0;
46465+}
46466+
46467+/* item_plugin->f.scan */
46468+/* Performs leftward scanning starting from an unformatted node and its parent coordinate.
46469+ This scan continues, advancing the parent coordinate, until either it encounters a
46470+ formatted child or it finishes scanning this node.
46471+
46472+   If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm
46473+   not sure this last property (same atom) is enforced, but it should be the case since
46474+   one atom must write the parent and the others must read the parent, thus fusing?). In
46475+   any case, the code below asserts this case for unallocated extents. Unallocated
46476+   extents are thus optimized because we can skip to the endpoint when scanning.
46477+
46478+   Control then returns to the caller of reiser4_scan_extent, which handles these
46479+   terminating conditions, e.g., by loading the next twig.
46480+*/
46481+int reiser4_scan_extent(flush_scan * scan)
46482+{
46483+ coord_t coord;
46484+ jnode *neighbor;
46485+ unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
46486+ reiser4_block_nr unit_start;
46487+ __u64 oid;
46488+ reiser4_key key;
46489+ int ret = 0, allocated, incr;
46490+ reiser4_tree *tree;
46491+
46492+ if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
46493+ scan->stop = 1;
46494+ return 0; /* Race with truncate, this node is already
46495+ * truncated. */
46496+ }
46497+
46498+ coord_dup(&coord, &scan->parent_coord);
46499+
46500+ assert("jmacd-1404", !reiser4_scan_finished(scan));
46501+ assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
46502+ assert("jmacd-1406", jnode_is_unformatted(scan->node));
46503+
46504+ /* The scan_index variable corresponds to the current page index of the
46505+ unformatted block scan position. */
46506+ scan_index = index_jnode(scan->node);
46507+
46508+ assert("jmacd-7889", item_is_extent(&coord));
46509+
46510+ repeat:
46511+ /* objectid of file */
46512+ oid = get_key_objectid(item_key_by_coord(&coord, &key));
46513+
46514+ allocated = !extent_is_unallocated(&coord);
46515+ /* Get the values of this extent unit: */
46516+ unit_index = extent_unit_index(&coord);
46517+ unit_width = extent_unit_width(&coord);
46518+ unit_start = extent_unit_start(&coord);
46519+
46520+ assert("jmacd-7187", unit_width > 0);
46521+ assert("jmacd-7188", scan_index >= unit_index);
46522+ assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
46523+
46524+ /* Depending on the scan direction, we set different maximum values for scan_index
46525+ (scan_max) and the number of nodes that would be passed if the scan goes the
46526+ entire way (scan_dist). Incr is an integer reflecting the incremental
46527+ direction of scan_index. */
46528+ if (reiser4_scanning_left(scan)) {
46529+ scan_max = unit_index;
46530+ scan_dist = scan_index - unit_index;
46531+ incr = -1;
46532+ } else {
46533+ scan_max = unit_index + unit_width - 1;
46534+		scan_dist = scan_max - scan_index;
46535+ incr = +1;
46536+ }
46537+
46538+ tree = coord.node->zjnode.tree;
46539+
46540+ /* If the extent is allocated we have to check each of its blocks. If the extent
46541+ is unallocated we can skip to the scan_max. */
46542+ if (allocated) {
46543+ do {
46544+ neighbor = jlookup(tree, oid, scan_index);
46545+ if (neighbor == NULL)
46546+ goto stop_same_parent;
46547+
46548+ if (scan->node != neighbor
46549+ && !reiser4_scan_goto(scan, neighbor)) {
46550+ /* @neighbor was jput() by reiser4_scan_goto */
46551+ goto stop_same_parent;
46552+ }
46553+
46554+ ret = scan_set_current(scan, neighbor, 1, &coord);
46555+ if (ret != 0) {
46556+ goto exit;
46557+ }
46558+
46559+ /* reference to @neighbor is stored in @scan, no need
46560+ to jput(). */
46561+ scan_index += incr;
46562+
46563+ } while (incr + scan_max != scan_index);
46564+
46565+ } else {
46566+ /* Optimized case for unallocated extents, skip to the end. */
46567+ neighbor = jlookup(tree, oid, scan_max /*index */ );
46568+ if (neighbor == NULL) {
46569+ /* Race with truncate */
46570+ scan->stop = 1;
46571+ ret = 0;
46572+ goto exit;
46573+ }
46574+
46575+ assert("zam-1043",
46576+ reiser4_blocknr_is_fake(jnode_get_block(neighbor)));
46577+
46578+ ret = scan_set_current(scan, neighbor, scan_dist, &coord);
46579+ if (ret != 0) {
46580+ goto exit;
46581+ }
46582+ }
46583+
46584+ if (coord_sideof_unit(&coord, scan->direction) == 0
46585+ && item_is_extent(&coord)) {
46586+ /* Continue as long as there are more extent units. */
46587+
46588+ scan_index =
46589+ extent_unit_index(&coord) +
46590+ (reiser4_scanning_left(scan) ?
46591+ extent_unit_width(&coord) - 1 : 0);
46592+ goto repeat;
46593+ }
46594+
46595+ if (0) {
46596+ stop_same_parent:
46597+
46598+		/* If we are scanning left and we stop in the middle of an allocated
46599+		   extent, we know the preceder immediately. */
46600+ /* middle of extent is (scan_index - unit_index) != 0. */
46601+ if (reiser4_scanning_left(scan) &&
46602+ (scan_index - unit_index) != 0) {
46603+ /* FIXME(B): Someone should step-through and verify that this preceder
46604+ calculation is indeed correct. */
46605+ /* @unit_start is starting block (number) of extent
46606+ unit. Flush stopped at the @scan_index block from
46607+ the beginning of the file, which is (scan_index -
46608+ unit_index) block within extent.
46609+ */
46610+ if (unit_start) {
46611+ /* skip preceder update when we are at hole */
46612+ scan->preceder_blk =
46613+ unit_start + scan_index - unit_index;
46614+ check_preceder(scan->preceder_blk);
46615+ }
46616+ }
46617+
46618+ /* In this case, we leave coord set to the parent of scan->node. */
46619+ scan->stop = 1;
46620+
46621+ } else {
46622+ /* In this case, we are still scanning, coord is set to the next item which is
46623+ either off-the-end of the node or not an extent. */
46624+ assert("jmacd-8912", scan->stop == 0);
46625+ assert("jmacd-7812",
46626+ (coord_is_after_sideof_unit(&coord, scan->direction)
46627+ || !item_is_extent(&coord)));
46628+ }
46629+
46630+ ret = 0;
46631+ exit:
46632+ return ret;
46633+}
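+/*
+ * Example of the direction setup above: scanning left within a unit with
+ * unit_index 100, unit_width 8 and scan_index 105 gives scan_max = 100,
+ * scan_dist = 5 and incr = -1; scanning right gives scan_max = 107,
+ * scan_dist = 2 and incr = +1.
+ */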
46634+
46635+/* ask block allocator for some blocks */
46636+static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
46637+ reiser4_block_nr wanted_count,
46638+ reiser4_block_nr *first_allocated,
46639+ reiser4_block_nr *allocated,
46640+ block_stage_t block_stage)
46641+{
46642+ *allocated = wanted_count;
46643+ preceder->max_dist = 0; /* scan whole disk, if needed */
46644+
46645+ /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
46646+ preceder->block_stage = block_stage;
46647+
46648+ /* FIXME: we do not handle errors here now */
46649+ check_me("vs-420",
46650+ reiser4_alloc_blocks(preceder, first_allocated, allocated,
46651+ BA_PERMANENT) == 0);
46652+ /* update flush_pos's preceder to last allocated block number */
46653+ preceder->blk = *first_allocated + *allocated - 1;
46654+}
46655+
46656+/* When, at flush time, an unallocated extent is to be replaced with an allocated one, it may happen that one
46657+   unallocated extent has to be replaced with a set of allocated extents. In this case insert_into_item will be
46658+   called, which may have to add new nodes into the tree. Space for that is taken from the inviolable reserve (5%). */
46659+static reiser4_block_nr reserve_replace(void)
46660+{
46661+ reiser4_block_nr grabbed, needed;
46662+
46663+ grabbed = get_current_context()->grabbed_blocks;
46664+ needed = estimate_one_insert_into_item(current_tree);
46665+ check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
46666+ return grabbed;
46667+}
46668+
46669+static void free_replace_reserved(reiser4_block_nr grabbed)
46670+{
46671+ reiser4_context *ctx;
46672+
46673+ ctx = get_current_context();
46674+ grabbed2free(ctx, get_super_private(ctx->super),
46675+ ctx->grabbed_blocks - grabbed);
46676+}
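+/*
+ * Typical usage of the two helpers above, as in the callers below:
+ *
+ *	grabbed = reserve_replace();
+ *	result = reiser4_replace_extent(h, ...);
+ *	free_replace_reserved(grabbed);
+ */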
46677+
46678+/* Block offset of first block addressed by unit */
46679+__u64 extent_unit_index(const coord_t * item)
46680+{
46681+ reiser4_key key;
46682+
46683+ assert("vs-648", coord_is_existing_unit(item));
46684+ unit_key_by_coord(item, &key);
46685+ return get_key_offset(&key) >> current_blocksize_bits;
46686+}
46687+
46688+/* AUDIT shouldn't return value be of reiser4_block_nr type?
46689+ Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */
46690+__u64 extent_unit_width(const coord_t * item)
46691+{
46692+ assert("vs-649", coord_is_existing_unit(item));
46693+ return width_by_coord(item);
46694+}
46695+
46696+/* Starting block location of this unit */
46697+static reiser4_block_nr extent_unit_start(const coord_t * item)
46698+{
46699+ return extent_get_start(extent_by_coord(item));
46700+}
46701+
46702+/**
46703+ * split_allocated_extent - split an allocated extent in two
46704+ * @coord: coord of the allocated extent
46705+ * @pos_in_unit: position within the unit to split at
46706+ *
46707+ * Replaces an allocated extent with two allocated extents.
46708+ */
46709+static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
46710+{
46711+ int result;
46712+ struct replace_handle *h;
46713+ reiser4_extent *ext;
46714+ reiser4_block_nr grabbed;
46715+
46716+ ext = extent_by_coord(coord);
46717+ assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
46718+ assert("vs-1411", extent_get_width(ext) > pos_in_unit);
46719+
46720+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
46721+ if (h == NULL)
46722+ return RETERR(-ENOMEM);
46723+ h->coord = coord;
46724+ h->lh = znode_lh(coord->node);
46725+ h->pkey = &h->key;
46726+ unit_key_by_coord(coord, h->pkey);
46727+ set_key_offset(h->pkey,
46728+ (get_key_offset(h->pkey) +
46729+ pos_in_unit * current_blocksize));
46730+ reiser4_set_extent(&h->overwrite, extent_get_start(ext),
46731+ pos_in_unit);
46732+ reiser4_set_extent(&h->new_extents[0],
46733+ extent_get_start(ext) + pos_in_unit,
46734+ extent_get_width(ext) - pos_in_unit);
46735+ h->nr_new_extents = 1;
46736+ h->flags = COPI_DONT_SHIFT_LEFT;
46737+ h->paste_key = h->key;
46738+
46739+ /* reserve space for extent unit paste, @grabbed is reserved before */
46740+ grabbed = reserve_replace();
46741+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
46742+ extent */);
46743+ /* restore reserved */
46744+ free_replace_reserved(grabbed);
46745+ kfree(h);
46746+ return result;
46747+}
46748+
46749+/* replace extent @ext by extent @replace. Try to merge @replace with the previous extent of the item (if there is
46750+   one). Return 1 if it succeeded, 0 otherwise */
46751+static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
46752+ reiser4_extent *replace)
46753+{
46754+ assert("vs-1415", extent_by_coord(coord) == ext);
46755+
46756+ if (coord->unit_pos == 0
46757+ || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
46758+		/* the previous unit either does not exist or is not an allocated extent */
46759+ return 0;
46760+ if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
46761+ extent_get_start(replace))
46762+ return 0;
46763+
46764+ /* we can glue, widen previous unit */
46765+ extent_set_width(ext - 1,
46766+ extent_get_width(ext - 1) + extent_get_width(replace));
46767+
46768+ if (extent_get_width(ext) != extent_get_width(replace)) {
46769+ /* make current extent narrower */
46770+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
46771+ extent_set_start(ext,
46772+ extent_get_start(ext) +
46773+ extent_get_width(replace));
46774+ extent_set_width(ext,
46775+ extent_get_width(ext) -
46776+ extent_get_width(replace));
46777+ } else {
46778+ /* current extent completely glued with its left neighbor, remove it */
46779+ coord_t from, to;
46780+
46781+ coord_dup(&from, coord);
46782+ from.unit_pos = nr_units_extent(coord) - 1;
46783+ coord_dup(&to, &from);
46784+
46785+		/* cutting from an extent item can currently happen either at the beginning or at the end, so
46786+		   move the space freed by the unit removal to the end of the item */
46787+ memmove(ext, ext + 1,
46788+ (from.unit_pos -
46789+ coord->unit_pos) * sizeof(reiser4_extent));
46790+ /* wipe part of item which is going to be cut, so that node_check will not be confused */
46791+ cut_node_content(&from, &to, NULL, NULL, NULL);
46792+ }
46793+ znode_make_dirty(coord->node);
46794+ /* move coord back */
46795+ coord->unit_pos--;
46796+ return 1;
46797+}
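+/*
+ * Example: with units [allocated, start 100, width 5][unallocated, width 4]
+ * and @replace = (start 105, width 2), the left unit grows to width 7 and
+ * the current unit narrows to width 2; had @replace covered the whole
+ * current unit, that unit would have been removed.
+ */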
46798+
46799+/**
46800+ * conv_extent - replace extent with up to two extents
46801+ * @coord: coordinate of extent to be replaced
46802+ * @replace: extent to overwrite the one @coord is set to
46803+ *
46804+ * Overwrites the extent @coord is set to and pastes one extent unit after
46805+ * the overwritten one if @replace is shorter than the initial extent
46806+ */
46807+static int conv_extent(coord_t *coord, reiser4_extent *replace)
46808+{
46809+ int result;
46810+ struct replace_handle *h;
46811+ reiser4_extent *ext;
46812+ reiser4_block_nr start, width, new_width;
46813+ reiser4_block_nr grabbed;
46814+ extent_state state;
46815+
46816+ ext = extent_by_coord(coord);
46817+ state = state_of_extent(ext);
46818+ start = extent_get_start(ext);
46819+ width = extent_get_width(ext);
46820+ new_width = extent_get_width(replace);
46821+
46822+ assert("vs-1458", (state == UNALLOCATED_EXTENT ||
46823+ state == ALLOCATED_EXTENT));
46824+ assert("vs-1459", width >= new_width);
46825+
46826+ if (try_to_merge_with_left(coord, ext, replace)) {
46827+ /* merged @replace with left neighbor. Current unit is either
46828+ removed or narrowed */
46829+ return 0;
46830+ }
46831+
46832+ if (width == new_width) {
46833+ /* replace current extent with @replace */
46834+ *ext = *replace;
46835+ znode_make_dirty(coord->node);
46836+ return 0;
46837+ }
46838+
46839+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
46840+ if (h == NULL)
46841+ return RETERR(-ENOMEM);
46842+ h->coord = coord;
46843+ h->lh = znode_lh(coord->node);
46844+ h->pkey = &h->key;
46845+ unit_key_by_coord(coord, h->pkey);
46846+ set_key_offset(h->pkey,
46847+ (get_key_offset(h->pkey) + new_width * current_blocksize));
46848+ h->overwrite = *replace;
46849+
46850+ /* replace @ext with @replace and padding extent */
46851+ reiser4_set_extent(&h->new_extents[0],
46852+ (state == ALLOCATED_EXTENT) ?
46853+ (start + new_width) :
46854+ UNALLOCATED_EXTENT_START,
46855+ width - new_width);
46856+ h->nr_new_extents = 1;
46857+ h->flags = COPI_DONT_SHIFT_LEFT;
46858+ h->paste_key = h->key;
46859+
46860+ /* reserve space for extent unit paste, @grabbed is reserved before */
46861+ grabbed = reserve_replace();
46862+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
46863+ extent */);
46864+
46865+ /* restore reserved */
46866+ free_replace_reserved(grabbed);
46867+ kfree(h);
46868+ return result;
46869+}
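+/*
+ * Example (assuming @replace cannot be merged with the left neighbor):
+ * converting a unit [unallocated, width 10] with @replace = (allocated,
+ * start 200, width 6) leaves [allocated, start 200, width 6]
+ * [unallocated, width 4].
+ */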
46870+
46871+/**
46872+ * assign_real_blocknrs - assign real block numbers to jnodes
46873+ * @flush_pos: flush position
46874+ * @oid: objectid of the file the jnodes belong to
46875+ * @index: index of the first jnode in the range
46876+ * @count: number of jnodes to assign block numbers to
46877+ * @first: start of allocated block range
46878+ *
46879+ * Assigns block numbers to each of @count jnodes. Index of first jnode is
46880+ * @index. Jnodes are looked up with jlookup.
46881+ */
46882+static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
46883+ unsigned long index, reiser4_block_nr count,
46884+ reiser4_block_nr first)
46885+{
46886+ unsigned long i;
46887+ reiser4_tree *tree;
46888+ txn_atom *atom;
46889+ int nr;
46890+
46891+ atom = atom_locked_by_fq(flush_pos->fq);
46892+ assert("vs-1468", atom);
46893+ BUG_ON(atom == NULL);
46894+
46895+ nr = 0;
46896+ tree = current_tree;
46897+ for (i = 0; i < count; ++i, ++index) {
46898+ jnode *node;
46899+
46900+ node = jlookup(tree, oid, index);
46901+ assert("", node != NULL);
46902+ BUG_ON(node == NULL);
46903+
46904+ spin_lock_jnode(node);
46905+ assert("", !jnode_is_flushprepped(node));
46906+ assert("vs-1475", node->atom == atom);
46907+ assert("vs-1476", atomic_read(&node->x_count) > 0);
46908+
46909+ JF_CLR(node, JNODE_FLUSH_RESERVED);
46910+ jnode_set_block(node, &first);
46911+ unformatted_make_reloc(node, flush_pos->fq);
46912+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
46913+ FQ_LIST, 0));
46914+ spin_unlock_jnode(node);
46915+ first++;
46916+
46917+ atomic_dec(&node->x_count);
46918+ nr ++;
46919+ }
46920+
46921+ spin_unlock_atom(atom);
46922+ return;
46923+}
46924+
46925+/**
46926+ * make_node_ovrwr - assign node to overwrite set
46927+ * @jnodes: overwrite set list head
46928+ * @node: jnode to add to the overwrite set
46929+ *
46930+ * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes
46931+ * which is an accumulator for nodes before they get to overwrite set list of
46932+ * atom.
46933+ */
46934+static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
46935+{
46936+ spin_lock_jnode(node);
46937+
46938+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
46939+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
46940+
46941+ JF_SET(node, JNODE_OVRWR);
46942+ list_move_tail(&node->capture_link, jnodes);
46943+ ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
46944+
46945+ spin_unlock_jnode(node);
46946+}
46947+
46948+/**
46949+ * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
46950+ * @flush_pos: flush position
46951+ * @oid: objectid of file jnodes belong to
46952+ * @index: starting index
46953+ * @width: extent width
46954+ *
46955+ * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's
46956+ * overwrite set, starting from the one with index @index. If end of slum is
46957+ * detected (node is not found or is flushprepped), stop iterating and set flush
46958+ * position's state to POS_INVALID.
46959+ */
46960+static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
46961+ unsigned long index, reiser4_block_nr width)
46962+{
46963+ unsigned long i;
46964+ reiser4_tree *tree;
46965+ jnode *node;
46966+ txn_atom *atom;
46967+ LIST_HEAD(jnodes);
46968+
46969+ tree = current_tree;
46970+
46971+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
46972+ assert("vs-1478", atom);
46973+
46974+ for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
46975+ node = jlookup(tree, oid, index);
46976+ if (!node) {
46977+ flush_pos->state = POS_INVALID;
46978+ break;
46979+ }
46980+ if (jnode_check_flushprepped(node)) {
46981+ flush_pos->state = POS_INVALID;
46982+ atomic_dec(&node->x_count);
46983+ break;
46984+ }
46985+ if (node->atom != atom) {
46986+ flush_pos->state = POS_INVALID;
46987+ atomic_dec(&node->x_count);
46988+ break;
46989+ }
46990+ make_node_ovrwr(&jnodes, node);
46991+ atomic_dec(&node->x_count);
46992+ }
46993+
46994+ list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
46995+ spin_unlock_atom(atom);
46996+}
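Note the two-phase structure above: each node is flagged and collected on the private list @jnodes under its own spinlock, and a single splice then appends the whole batch to the tail of the atom's overwrite list. A userspace model of that append-by-splice (the list helpers below are simplified stand-ins for the kernel's list_head API):

#include <stdio.h>

/* minimal circular doubly-linked list, modelled on the kernel's list_head */
struct list_head {
        struct list_head *next, *prev;
};

static void list_init(struct list_head *h)
{
        h->next = h->prev = h;
}

static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
}

/* append everything on 'from' to the tail of 'to', leaving 'from' empty */
static void list_splice_tail_init(struct list_head *from, struct list_head *to)
{
        if (from->next == from)
                return;                        /* nothing to move */
        from->next->prev = to->prev;
        to->prev->next = from->next;
        from->prev->next = to;
        to->prev = from->prev;
        list_init(from);
}

int main(void)
{
        struct list_head atom_ovrwr, jnodes, a, b;

        list_init(&atom_ovrwr);
        list_init(&jnodes);
        list_add_tail(&a, &jnodes);            /* like make_node_ovrwr() */
        list_add_tail(&b, &jnodes);
        list_splice_tail_init(&jnodes, &atom_ovrwr);  /* one O(1) batch append */
        printf("%s\n", atom_ovrwr.next == &a && a.next == &b ?
               "a,b on atom list" : "bug");
        return 0;
}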
46997+
46998+/**
46999+ * allocated_extent_slum_size
47000+ * @flush_pos: flush position
47001+ * @oid: objectid of the file the jnodes belong to
47002+ * @index: index of the first jnode to scan
47003+ * @count: maximal number of jnodes to scan
47004+ *
47005+ * Returns the number of consecutive not-flushprepped jnodes (the slum size).
47006+ */
47007+static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
47008+ unsigned long index, unsigned long count)
47009+{
47010+ unsigned long i;
47011+ reiser4_tree *tree;
47012+ txn_atom *atom;
47013+ int nr;
47014+
47015+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
47016+ assert("vs-1468", atom);
47017+
47018+ nr = 0;
47019+ tree = current_tree;
47020+ for (i = 0; i < count; ++i, ++index) {
47021+ jnode *node;
47022+
47023+ node = jlookup(tree, oid, index);
47024+ if (!node)
47025+ break;
47026+
47027+ if (jnode_check_flushprepped(node)) {
47028+ atomic_dec(&node->x_count);
47029+ break;
47030+ }
47031+
47032+ if (node->atom != atom) {
47033+ /*
47034+ * this is possible on overwrite: extent_write may
47035+ * capture several unformatted nodes without capturing
47036+ * any formatted nodes.
47037+ */
47038+ atomic_dec(&node->x_count);
47039+ break;
47040+ }
47041+
47042+ assert("vs-1476", atomic_read(&node->x_count) > 1);
47043+ atomic_dec(&node->x_count);
47044+ nr++;
47045+ }
47046+
47047+ spin_unlock_atom(atom);
47048+ return nr;
47049+}
47050+
47051+/**
47052+ * reiser4_alloc_extent
47053+ * @flush_pos: flush position
47054+ *
47055+ *
47056+ * Called by handle_pos_on_twig to process the extent unit flush_pos->coord is
47057+ * set to. It prepares a sequence of not-flushprepped nodes (a slum) for
47058+ * flushing, assuming the slum starts at position flush_pos->pos_in_unit
47059+ * within the extent. The slum goes to the relocate set if
47060+ * flush_pos->leaf_relocate is set to 1, and to the overwrite set otherwise.
47061+ */
47062+int reiser4_alloc_extent(flush_pos_t *flush_pos)
47063+{
47064+ coord_t *coord;
47065+ reiser4_extent *ext;
47066+ reiser4_extent replace_ext;
47067+ oid_t oid;
47068+ reiser4_block_nr protected;
47069+ reiser4_block_nr start;
47070+ __u64 index;
47071+ __u64 width;
47072+ extent_state state;
47073+ int result;
47074+ reiser4_block_nr first_allocated;
47075+ __u64 allocated;
47076+ reiser4_key key;
47077+ block_stage_t block_stage;
47078+
47079+ assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
47080+ assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
47081+ && item_is_extent(&flush_pos->coord));
47082+
47083+ coord = &flush_pos->coord;
47084+
47085+ ext = extent_by_coord(coord);
47086+ state = state_of_extent(ext);
47087+ if (state == HOLE_EXTENT) {
47088+ flush_pos->state = POS_INVALID;
47089+ return 0;
47090+ }
47091+
47092+ item_key_by_coord(coord, &key);
47093+ oid = get_key_objectid(&key);
47094+ index = extent_unit_index(coord) + flush_pos->pos_in_unit;
47095+ start = extent_get_start(ext);
47096+ width = extent_get_width(ext);
47097+
47098+ assert("vs-1457", width > flush_pos->pos_in_unit);
47099+
47100+ if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
47101+ /* relocate */
47102+ if (flush_pos->pos_in_unit) {
47103+ /* split extent unit into two */
47104+ result =
47105+ split_allocated_extent(coord,
47106+ flush_pos->pos_in_unit);
47107+ flush_pos->pos_in_unit = 0;
47108+ return result;
47109+ }
47110+
47111+ /* limit number of nodes to allocate */
47112+ if (flush_pos->nr_to_write < width)
47113+ width = flush_pos->nr_to_write;
47114+
47115+ if (state == ALLOCATED_EXTENT) {
47116+ /*
47117+ * all protected nodes are not flushprepped, therefore
47118+ * they are counted as flush_reserved
47119+ */
47120+ block_stage = BLOCK_FLUSH_RESERVED;
47121+ protected = allocated_extent_slum_size(flush_pos, oid,
47122+ index, width);
47123+ if (protected == 0) {
47124+ flush_pos->state = POS_INVALID;
47125+ flush_pos->pos_in_unit = 0;
47126+ return 0;
47127+ }
47128+ } else {
47129+ block_stage = BLOCK_UNALLOCATED;
47130+ protected = width;
47131+ }
47132+
47133+ /*
47134+ * look at previous unit if possible. If it is allocated, make
47135+ * preceder more precise
47136+ */
47137+ if (coord->unit_pos &&
47138+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
47139+ reiser4_pos_hint(flush_pos)->blk =
47140+ extent_get_start(ext - 1) +
47141+ extent_get_width(ext - 1);
47142+
47143+ /* allocate new block numbers for protected nodes */
47144+ extent_allocate_blocks(reiser4_pos_hint(flush_pos),
47145+ protected,
47146+ &first_allocated, &allocated,
47147+ block_stage);
47148+
47149+ if (state == ALLOCATED_EXTENT)
47150+ /*
47151+ * on relocation, free the old blocks of the nodes
47152+ * which are going to be relocated
47153+ */
47154+ reiser4_dealloc_blocks(&start, &allocated,
47155+ BLOCK_ALLOCATED, BA_DEFER);
47156+
47157+ /* assign new block numbers to protected nodes */
47158+ assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
47159+
47160+ /* prepare extent which will replace current one */
47161+ reiser4_set_extent(&replace_ext, first_allocated, allocated);
47162+
47163+ /* adjust extent item */
47164+ result = conv_extent(coord, &replace_ext);
47165+ if (result != 0 && result != -ENOMEM) {
47166+ warning("vs-1461",
47167+ "Failed to allocate extent. Should not happen\n");
47168+ return result;
47169+ }
47170+
47171+ /*
47172+ * break flush: we prepared for flushing as many blocks as we
47173+ * were asked for
47174+ */
47175+ if (flush_pos->nr_to_write == allocated)
47176+ flush_pos->state = POS_INVALID;
47177+ } else {
47178+ /* overwrite */
47179+ mark_jnodes_overwrite(flush_pos, oid, index, width);
47180+ }
47181+ flush_pos->pos_in_unit = 0;
47182+ return 0;
47183+}
47184+
47185+/* returns 0 if @key is glueable to the item @coord is set to, 1 otherwise */
47186+static int must_insert(const coord_t *coord, const reiser4_key *key)
47187+{
47188+ reiser4_key last;
47189+
47190+ if (item_id_by_coord(coord) == EXTENT_POINTER_ID
47191+ && keyeq(append_key_extent(coord, &last), key))
47192+ return 0;
47193+ return 1;
47194+}
47195+
47196+/* copy extent @copy_ext to the end of @node: either insert a new item after
47197+ the last one, append a unit to the last item, or widen its last unit */
47198+static int put_unit_to_end(znode *node, const reiser4_key *key,
47199+ reiser4_extent *copy_ext)
47200+{
47201+ int result;
47202+ coord_t coord;
47203+ cop_insert_flag flags;
47204+ reiser4_extent *last_ext;
47205+ reiser4_item_data data;
47206+
47207+ /* set coord after last unit in an item */
47208+ coord_init_last_unit(&coord, node);
47209+ coord.between = AFTER_UNIT;
47210+
47211+ flags =
47212+ COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
47213+ if (must_insert(&coord, key)) {
47214+ result =
47215+ insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
47216+ key, NULL /*lh */ , flags);
47217+
47218+ } else {
47219+ /* try to glue with last unit */
47220+ last_ext = extent_by_coord(&coord);
47221+ if (state_of_extent(last_ext) &&
47222+ extent_get_start(last_ext) + extent_get_width(last_ext) ==
47223+ extent_get_start(copy_ext)) {
47224+ /* widen last unit of node */
47225+ extent_set_width(last_ext,
47226+ extent_get_width(last_ext) +
47227+ extent_get_width(copy_ext));
47228+ znode_make_dirty(node);
47229+ return 0;
47230+ }
47231+
47232+ /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */
47233+ result =
47234+ insert_into_item(&coord, NULL /*lh */ , key,
47235+ init_new_extent(&data, copy_ext, 1),
47236+ flags);
47237+ }
47238+
47239+ assert("vs-438", result == 0 || result == -E_NODE_FULL);
47240+ return result;
47241+}
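The glue test above widens the node's last unit only when the copied extent physically continues it: same item (checked by must_insert) and a start equal to the last unit's start plus its width, with holes never glued. A standalone model of the test (struct layout and names are illustrative):

#include <stdio.h>
#include <stdint.h>

struct extent {
        uint64_t start;                        /* 0 encodes a hole */
        uint64_t width;                        /* in blocks */
};

/* widen 'last' in place if 'next' physically continues it; 1 on success */
static int try_glue(struct extent *last, const struct extent *next)
{
        if (last->start != 0 && last->start + last->width == next->start) {
                last->width += next->width;
                return 1;
        }
        return 0;                              /* caller must insert a new unit */
}

int main(void)
{
        struct extent last = { 100, 8 };
        struct extent next = { 108, 4 };

        if (try_glue(&last, &next))
                printf("glued: start=%llu width=%llu\n",
                       (unsigned long long)last.start,
                       (unsigned long long)last.width);   /* 100, 12 */
        return 0;
}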
47242+
47243+/* @coord is set to extent unit */
47244+squeeze_result squalloc_extent(znode *left, const coord_t *coord,
47245+ flush_pos_t *flush_pos,
47246+ reiser4_key *stop_key)
47247+{
47248+ reiser4_extent *ext;
47249+ __u64 index;
47250+ __u64 width;
47251+ reiser4_block_nr start;
47252+ extent_state state;
47253+ oid_t oid;
47254+ reiser4_block_nr first_allocated;
47255+ __u64 allocated;
47256+ __u64 protected;
47257+ reiser4_extent copy_extent;
47258+ reiser4_key key;
47259+ int result;
47260+ block_stage_t block_stage;
47261+
47262+ assert("vs-1457", flush_pos->pos_in_unit == 0);
47263+ assert("vs-1467", coord_is_leftmost_unit(coord));
47264+ assert("vs-1467", item_is_extent(coord));
47265+
47266+ ext = extent_by_coord(coord);
47267+ index = extent_unit_index(coord);
47268+ start = extent_get_start(ext);
47269+ width = extent_get_width(ext);
47270+ state = state_of_extent(ext);
47271+ unit_key_by_coord(coord, &key);
47272+ oid = get_key_objectid(&key);
47273+
47274+ if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
47275+ (state == UNALLOCATED_EXTENT)) {
47276+ /* relocate */
47277+ if (state == ALLOCATED_EXTENT) {
47278+ /* all protected nodes are not flushprepped, therefore
47279+ * they are counted as flush_reserved */
47280+ block_stage = BLOCK_FLUSH_RESERVED;
47281+ protected = allocated_extent_slum_size(flush_pos, oid,
47282+ index, width);
47283+ if (protected == 0) {
47284+ flush_pos->state = POS_INVALID;
47285+ flush_pos->pos_in_unit = 0;
47286+ return 0;
47287+ }
47288+ } else {
47289+ block_stage = BLOCK_UNALLOCATED;
47290+ protected = width;
47291+ }
47292+
47293+ /*
47294+ * look at previous unit if possible. If it is allocated, make
47295+ * preceder more precise
47296+ */
47297+ if (coord->unit_pos &&
47298+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
47299+ reiser4_pos_hint(flush_pos)->blk =
47300+ extent_get_start(ext - 1) +
47301+ extent_get_width(ext - 1);
47302+
47303+ /* allocate new block numbers for protected nodes */
47304+ extent_allocate_blocks(reiser4_pos_hint(flush_pos),
47305+ protected,
47306+ &first_allocated, &allocated,
47307+ block_stage);
47308+
47309+ /* prepare extent which will be copied to left */
47310+ reiser4_set_extent(&copy_extent, first_allocated, allocated);
47311+
47312+ result = put_unit_to_end(left, &key, &copy_extent);
47313+ if (result == -E_NODE_FULL) {
47314+ int target_block_stage;
47315+
47316+ /* free blocks which were just allocated */
47317+ target_block_stage =
47318+ (state ==
47319+ ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
47320+ BLOCK_UNALLOCATED;
47321+ reiser4_dealloc_blocks(&first_allocated, &allocated,
47322+ target_block_stage,
47323+ BA_PERMANENT);
47324+
47325+ /* rewind the preceder. */
47326+ flush_pos->preceder.blk = first_allocated;
47327+ check_preceder(flush_pos->preceder.blk);
47328+
47329+ return SQUEEZE_TARGET_FULL;
47330+ }
47331+
47332+ if (state == ALLOCATED_EXTENT) {
47333+ /* free nodes which were relocated */
47334+ reiser4_dealloc_blocks(&start, &allocated,
47335+ BLOCK_ALLOCATED, BA_DEFER);
47336+ }
47337+
47338+ /* assign new block numbers to protected nodes */
47339+ assign_real_blocknrs(flush_pos, oid, index, allocated,
47340+ first_allocated);
47341+
47342+ set_key_offset(&key,
47343+ get_key_offset(&key) +
47344+ (allocated << current_blocksize_bits));
47345+ } else {
47346+ /*
47347+ * overwrite: try to copy the unit as is to the left neighbor
47348+ * and put the leading not-flushprepped nodes in the overwrite set
47349+ */
47350+ reiser4_set_extent(&copy_extent, start, width);
47351+ result = put_unit_to_end(left, &key, &copy_extent);
47352+ if (result == -E_NODE_FULL)
47353+ return SQUEEZE_TARGET_FULL;
47354+
47355+ if (state != HOLE_EXTENT)
47356+ mark_jnodes_overwrite(flush_pos, oid, index, width);
47357+ set_key_offset(&key,
47358+ get_key_offset(&key) +
47359+ (width << current_blocksize_bits));
47360+ }
47361+ *stop_key = key;
47362+ return SQUEEZE_CONTINUE;
47363+}
47364+
47365+int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
47366+{
47367+ return key_by_inode_and_offset_common(inode, off, key);
47368+}
47369+
47370+/*
47371+ * Local variables:
47372+ * c-indentation-style: "K&R"
47373+ * mode-name: "LC"
47374+ * c-basic-offset: 8
47375+ * tab-width: 8
47376+ * fill-column: 79
47377+ * scroll-step: 1
47378+ * End:
47379+ */
47380diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/extent.h linux-2.6.23/fs/reiser4/plugin/item/extent.h
47381--- linux-2.6.23.orig/fs/reiser4/plugin/item/extent.h 1970-01-01 03:00:00.000000000 +0300
47382+++ linux-2.6.23/fs/reiser4/plugin/item/extent.h 2007-12-04 16:49:30.000000000 +0300
47383@@ -0,0 +1,231 @@
47384+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47385+
47386+#ifndef __REISER4_EXTENT_H__
47387+#define __REISER4_EXTENT_H__
47388+
47389+/* on disk extent */
47390+typedef struct {
47391+ reiser4_dblock_nr start;
47392+ reiser4_dblock_nr width;
47393+} reiser4_extent;
47394+
47395+struct extent_stat {
47396+ int unallocated_units;
47397+ int unallocated_blocks;
47398+ int allocated_units;
47399+ int allocated_blocks;
47400+ int hole_units;
47401+ int hole_blocks;
47402+};
47403+
47404+/* extents in an extent item can be holes, unallocated extents, or allocated
47405+ extents */
47406+typedef enum {
47407+ HOLE_EXTENT,
47408+ UNALLOCATED_EXTENT,
47409+ ALLOCATED_EXTENT
47410+} extent_state;
47411+
47412+#define HOLE_EXTENT_START 0
47413+#define UNALLOCATED_EXTENT_START 1
47414+#define UNALLOCATED_EXTENT_START2 2
47415+
47416+struct extent_coord_extension {
47417+ reiser4_block_nr pos_in_unit;
47418+ reiser4_block_nr width; /* width of current unit */
47419+ pos_in_node_t nr_units; /* number of units */
47420+ int ext_offset; /* offset from the beginning of zdata() */
47421+ unsigned long expected_page;
47422+#if REISER4_DEBUG
47423+ reiser4_extent extent;
47424+#endif
47425+};
47426+
47427+/* helpers (inline functions and macros) to get/set fields of an on-disk extent */
47428+static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
47429+{
47430+ return le64_to_cpu(ext->start);
47431+}
47432+
47433+static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
47434+{
47435+ return le64_to_cpu(ext->width);
47436+}
47437+
47438+extern __u64 reiser4_current_block_count(void);
47439+
47440+static inline void
47441+extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
47442+{
47443+ cassert(sizeof(ext->start) == 8);
47444+ assert("nikita-2510",
47445+ ergo(start > 1, start < reiser4_current_block_count()));
47446+ put_unaligned(cpu_to_le64(start), &ext->start);
47447+}
47448+
47449+static inline void
47450+extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
47451+{
47452+ cassert(sizeof(ext->width) == 8);
47453+ assert("", width > 0);
47454+ put_unaligned(cpu_to_le64(width), &ext->width);
47455+ assert("nikita-2511",
47456+ ergo(extent_get_start(ext) > 1,
47457+ extent_get_start(ext) + width <=
47458+ reiser4_current_block_count()));
47459+}
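The accessors above keep both on-disk fields little-endian and unaligned-safe regardless of host byte order. A standalone model of the same round-trip, done byte by byte so it needs no kernel helpers:

#include <stdio.h>
#include <stdint.h>

/* store a 64-bit value little-endian into an unaligned buffer */
static void put_le64(unsigned char *p, uint64_t v)
{
        int i;

        for (i = 0; i < 8; i++)
                p[i] = (unsigned char)(v >> (8 * i));
}

static uint64_t get_le64(const unsigned char *p)
{
        uint64_t v = 0;
        int i;

        for (i = 0; i < 8; i++)
                v |= (uint64_t)p[i] << (8 * i);
        return v;
}

int main(void)
{
        unsigned char disk[16];               /* one on-disk extent: start, width */

        put_le64(disk + 0, 12345);            /* like extent_set_start() */
        put_le64(disk + 8, 32);               /* like extent_set_width() */
        printf("start=%llu width=%llu\n",
               (unsigned long long)get_le64(disk + 0),
               (unsigned long long)get_le64(disk + 8));
        return 0;
}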
47460+
47461+#define extent_item(coord) \
47462+({ \
47463+ assert("nikita-3143", item_is_extent(coord)); \
47464+ ((reiser4_extent *)item_body_by_coord (coord)); \
47465+})
47466+
47467+#define extent_by_coord(coord) \
47468+({ \
47469+ assert("nikita-3144", item_is_extent(coord)); \
47470+ (extent_item (coord) + (coord)->unit_pos); \
47471+})
47472+
47473+#define width_by_coord(coord) \
47474+({ \
47475+ assert("nikita-3145", item_is_extent(coord)); \
47476+ extent_get_width (extent_by_coord(coord)); \
47477+})
47478+
47479+struct carry_cut_data;
47480+struct carry_kill_data;
47481+
47482+/* plugin->u.item.b.* */
47483+reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
47484+int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
47485+ const reiser4_item_data *);
47486+int mergeable_extent(const coord_t * p1, const coord_t * p2);
47487+pos_in_node_t nr_units_extent(const coord_t *);
47488+lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
47489+void init_coord_extent(coord_t *);
47490+int init_extent(coord_t *, reiser4_item_data *);
47491+int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
47492+int can_shift_extent(unsigned free_space,
47493+ coord_t * source, znode * target, shift_direction,
47494+ unsigned *size, unsigned want);
47495+void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
47496+ unsigned count, shift_direction where_is_free_space,
47497+ unsigned free_space);
47498+int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
47499+ struct carry_kill_data *);
47500+int create_hook_extent(const coord_t * coord, void *arg);
47501+int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47502+ struct carry_cut_data *, reiser4_key * smallest_removed,
47503+ reiser4_key * new_first);
47504+int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47505+ struct carry_kill_data *, reiser4_key * smallest_removed,
47506+ reiser4_key * new_first);
47507+reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
47508+reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
47509+void print_extent(const char *, coord_t *);
47510+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
47511+int utmost_child_real_block_extent(const coord_t * coord, sideof side,
47512+ reiser4_block_nr * block);
47513+void item_stat_extent(const coord_t * coord, void *vp);
47514+int reiser4_check_extent(const coord_t * coord, const char **error);
47515+
47516+/* plugin->u.item.s.file.* */
47517+ssize_t reiser4_write_extent(struct file *, const char __user *,
47518+ size_t, loff_t *);
47519+int reiser4_read_extent(struct file *, flow_t *, hint_t *);
47520+int reiser4_readpage_extent(void *, struct page *);
47521+int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*);
47522+reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
47523+void init_coord_extension_extent(uf_coord_t *, loff_t offset);
47524+int get_block_address_extent(const coord_t *, sector_t block,
47525+ sector_t * result);
47526+
47527+/* these are used in flush.c
47528+ FIXME-VS: should they be somewhere in item_plugin? */
47529+int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
47530+int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
47531+ reiser4_key * stop_key);
47532+
47533+int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */
47534+__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
47535+__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
47536+
47537+/* plugin->u.item.f. */
47538+int reiser4_scan_extent(flush_scan * scan);
47539+extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
47540+
47541+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47542+ int nr_extents);
47543+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr);
47544+extent_state state_of_extent(reiser4_extent * ext);
47545+void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start,
47546+ reiser4_block_nr width);
47547+int reiser4_update_extent(struct inode *, jnode *, loff_t pos,
47548+ int *plugged_hole);
47549+
47550+#include "../../coord.h"
47551+#include "../../lock.h"
47552+#include "../../tap.h"
47553+
47554+struct replace_handle {
47555+ /* these are to be set before calling reiser4_replace_extent */
47556+ coord_t *coord;
47557+ lock_handle *lh;
47558+ reiser4_key key;
47559+ reiser4_key *pkey;
47560+ reiser4_extent overwrite;
47561+ reiser4_extent new_extents[2];
47562+ int nr_new_extents;
47563+ unsigned flags;
47564+
47565+ /* these are used by reiser4_replace_extent */
47566+ reiser4_item_data item;
47567+ coord_t coord_after;
47568+ lock_handle lh_after;
47569+ tap_t watch;
47570+ reiser4_key paste_key;
47571+#if REISER4_DEBUG
47572+ reiser4_extent orig_ext;
47573+ reiser4_key tmp;
47574+#endif
47575+};
47576+
47577+/* this structure is kmalloc-ed before calling make_extent to avoid excessive
47578+ stack consumption on the plug_hole->reiser4_replace_extent path */
47579+struct make_extent_handle {
47580+ uf_coord_t *uf_coord;
47581+ reiser4_block_nr blocknr;
47582+ int created;
47583+ struct inode *inode;
47584+ union {
47585+ struct {
47586+ } append;
47587+ struct replace_handle replace;
47588+ } u;
47589+};
47590+
47591+int reiser4_replace_extent(struct replace_handle *,
47592+ int return_inserted_position);
47593+lock_handle *znode_lh(znode *);
47594+
47595+/* the reiser4 repacker support */
47596+struct repacker_cursor;
47597+extern int process_extent_backward_for_repacking(tap_t *,
47598+ struct repacker_cursor *);
47599+extern int mark_extent_for_repacking(tap_t *, int);
47600+
47601+#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
47602+#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
47603+
47604+/* __REISER4_EXTENT_H__ */
47605+#endif
47606+/*
47607+ Local variables:
47608+ c-indentation-style: "K&R"
47609+ mode-name: "LC"
47610+ c-basic-offset: 8
47611+ tab-width: 8
47612+ fill-column: 120
47613+ End:
47614+*/
47615diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.23/fs/reiser4/plugin/item/extent_item_ops.c
47616--- linux-2.6.23.orig/fs/reiser4/plugin/item/extent_item_ops.c 1970-01-01 03:00:00.000000000 +0300
47617+++ linux-2.6.23/fs/reiser4/plugin/item/extent_item_ops.c 2007-12-04 16:49:30.000000000 +0300
47618@@ -0,0 +1,889 @@
47619+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47620+
47621+#include "item.h"
47622+#include "../../inode.h"
47623+#include "../../tree_walk.h" /* check_sibling_list() */
47624+#include "../../page_cache.h"
47625+#include "../../carry.h"
47626+
47627+#include <linux/quotaops.h>
47628+
47629+/* item_plugin->b.max_key_inside */
47630+reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
47631+{
47632+ item_key_by_coord(coord, key);
47633+ set_key_offset(key, get_key_offset(reiser4_max_key()));
47634+ return key;
47635+}
47636+
47637+/* item_plugin->b.can_contain_key
47638+ this checks whether @key of @data matches the position set by @coord */
47639+int
47640+can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
47641+ const reiser4_item_data * data)
47642+{
47643+ reiser4_key item_key;
47644+
47645+ if (item_plugin_by_coord(coord) != data->iplug)
47646+ return 0;
47647+
47648+ item_key_by_coord(coord, &item_key);
47649+ if (get_key_locality(key) != get_key_locality(&item_key) ||
47650+ get_key_objectid(key) != get_key_objectid(&item_key) ||
47651+ get_key_ordering(key) != get_key_ordering(&item_key))
47652+ return 0;
47653+
47654+ return 1;
47655+}
47656+
47657+/* item_plugin->b.mergeable
47658+ first item is of extent type */
47659+/* Audited by: green(2002.06.13) */
47660+int mergeable_extent(const coord_t * p1, const coord_t * p2)
47661+{
47662+ reiser4_key key1, key2;
47663+
47664+ assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
47665+ /* FIXME-VS: Which is it? Assert or return 0 */
47666+ if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
47667+ return 0;
47668+ }
47669+
47670+ item_key_by_coord(p1, &key1);
47671+ item_key_by_coord(p2, &key2);
47672+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
47673+ get_key_objectid(&key1) != get_key_objectid(&key2) ||
47674+ get_key_ordering(&key1) != get_key_ordering(&key2) ||
47675+ get_key_type(&key1) != get_key_type(&key2))
47676+ return 0;
47677+ if (get_key_offset(&key1) +
47678+ reiser4_extent_size(p1, nr_units_extent(p1)) !=
47679+ get_key_offset(&key2))
47680+ return 0;
47681+ return 1;
47682+}
47683+
47684+/* item_plugin->b.nr_units */
47685+pos_in_node_t nr_units_extent(const coord_t * coord)
47686+{
47687+ /* length of extent item has to be a multiple of extent size */
47688+ assert("vs-1424",
47689+ (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
47690+ return item_length_by_coord(coord) / sizeof(reiser4_extent);
47691+}
47692+
47693+/* item_plugin->b.lookup */
47694+lookup_result
47695+lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
47696+ coord_t * coord)
47697+{ /* znode and item_pos are
47698+ set to an extent item to
47699+ look through */
47700+ reiser4_key item_key;
47701+ reiser4_block_nr lookuped, offset;
47702+ unsigned i, nr_units;
47703+ reiser4_extent *ext;
47704+ unsigned blocksize;
47705+ unsigned char blocksize_bits;
47706+
47707+ item_key_by_coord(coord, &item_key);
47708+ offset = get_key_offset(&item_key);
47709+
47710+ /* key we are looking for must be greater than key of item @coord */
47711+ assert("vs-414", keygt(key, &item_key));
47712+
47713+ assert("umka-99945",
47714+ !keygt(key, max_key_inside_extent(coord, &item_key)));
47715+
47716+ ext = extent_item(coord);
47717+ assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
47718+
47719+ blocksize = current_blocksize;
47720+ blocksize_bits = current_blocksize_bits;
47721+
47722+ /* offset we are looking for */
47723+ lookuped = get_key_offset(key);
47724+
47725+ nr_units = nr_units_extent(coord);
47726+ /* go through all extents until the one which addresses the given offset */
47727+ for (i = 0; i < nr_units; i++, ext++) {
47728+ offset += (extent_get_width(ext) << blocksize_bits);
47729+ if (offset > lookuped) {
47730+ /* desired byte is somewhere in this extent */
47731+ coord->unit_pos = i;
47732+ coord->between = AT_UNIT;
47733+ return CBK_COORD_FOUND;
47734+ }
47735+ }
47736+
47737+ /* set coord after last unit */
47738+ coord->unit_pos = nr_units - 1;
47739+ coord->between = AFTER_UNIT;
47740+ return CBK_COORD_FOUND;
47741+}
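The search accumulates unit widths, converted to bytes, onto the item key's offset until the running offset passes the target byte. A standalone worked example with 4096-byte blocks (the widths are chosen arbitrarily):

#include <stdio.h>
#include <stdint.h>

#define MODEL_BLOCKSIZE_BITS 12                 /* 4096-byte blocks */

/* return the unit covering byte 'target', or nr_units if past the end */
static unsigned lookup_unit(uint64_t item_off, const uint64_t *widths,
                            unsigned nr_units, uint64_t target)
{
        uint64_t off = item_off;
        unsigned i;

        for (i = 0; i < nr_units; i++) {
                off += widths[i] << MODEL_BLOCKSIZE_BITS;
                if (off > target)
                        return i;               /* desired byte is in unit i */
        }
        return nr_units;                        /* would be AFTER_UNIT */
}

int main(void)
{
        uint64_t widths[] = { 2, 3 };           /* a 2-block and a 3-block unit */

        /* byte 12288 is block 3 of the item: it falls in the second unit */
        printf("unit %u\n", lookup_unit(0, widths, 2, 12288));   /* prints 1 */
        return 0;
}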
47742+
47743+/* item_plugin->b.paste
47744+ the item @coord is set to has been expanded by @data->length bytes of free
47745+ space. data->data contains the data to be pasted into the item at position
47746+ @coord->in_item.unit_pos; it must fit into that free space.
47747+ @coord must be set between units.
47748+*/
47749+int
47750+paste_extent(coord_t * coord, reiser4_item_data * data,
47751+ carry_plugin_info * info UNUSED_ARG)
47752+{
47753+ unsigned old_nr_units;
47754+ reiser4_extent *ext;
47755+ int item_length;
47756+
47757+ ext = extent_item(coord);
47758+ item_length = item_length_by_coord(coord);
47759+ old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
47760+
47761+ /* this is also used to copy extent into newly created item, so
47762+ old_nr_units could be 0 */
47763+ assert("vs-260", item_length >= data->length);
47764+
47765+ /* make sure that coord is set properly */
47766+ assert("vs-35",
47767+ ((!coord_is_existing_unit(coord))
47768+ || (!old_nr_units && !coord->unit_pos)));
47769+
47770+ /* first unit to be moved */
47771+ switch (coord->between) {
47772+ case AFTER_UNIT:
47773+ coord->unit_pos++;
47774+ case BEFORE_UNIT:
47775+ coord->between = AT_UNIT;
47776+ break;
47777+ case AT_UNIT:
47778+ assert("vs-331", !old_nr_units && !coord->unit_pos);
47779+ break;
47780+ default:
47781+ impossible("vs-330", "coord is set improperly");
47782+ }
47783+
47784+ /* prepare space for new units */
47785+ memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
47786+ ext + coord->unit_pos,
47787+ (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
47788+
47789+ /* copy new data from kernel space */
47790+ assert("vs-556", data->user == 0);
47791+ memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
47792+
47793+ /* after paste @coord is set to first of pasted units */
47794+ assert("vs-332", coord_is_existing_unit(coord));
47795+ assert("vs-333",
47796+ !memcmp(data->data, extent_by_coord(coord),
47797+ (unsigned)data->length));
47798+ return 0;
47799+}
47800+
47801+/* item_plugin->b.can_shift */
47802+int
47803+can_shift_extent(unsigned free_space, coord_t * source,
47804+ znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
47805+ unsigned *size, unsigned want)
47806+{
47807+ *size = item_length_by_coord(source);
47808+ if (*size > free_space)
47809+ /* never split a unit of extent item */
47810+ *size = free_space - free_space % sizeof(reiser4_extent);
47811+
47812+ /* we can shift *size bytes, calculate how many we want to shift */
47813+ if (*size > want * sizeof(reiser4_extent))
47814+ *size = want * sizeof(reiser4_extent);
47815+
47816+ if (*size % sizeof(reiser4_extent) != 0)
47817+ impossible("vs-119", "Wrong extent size: %i %zd", *size,
47818+ sizeof(reiser4_extent));
47819+ return *size / sizeof(reiser4_extent);
47820+
47821+}
47822+
47823+/* item_plugin->b.copy_units */
47824+void
47825+copy_units_extent(coord_t * target, coord_t * source,
47826+ unsigned from, unsigned count,
47827+ shift_direction where_is_free_space, unsigned free_space)
47828+{
47829+ char *from_ext, *to_ext;
47830+
47831+ assert("vs-217", free_space == count * sizeof(reiser4_extent));
47832+
47833+ from_ext = item_body_by_coord(source);
47834+ to_ext = item_body_by_coord(target);
47835+
47836+ if (where_is_free_space == SHIFT_LEFT) {
47837+ assert("vs-215", from == 0);
47838+
47839+ /* At this moment, item length was already updated in the item
47840+ header by shifting code, hence nr_units_extent() will
47841+ return "new" number of units---one we obtain after copying
47842+ units.
47843+ */
47844+ to_ext +=
47845+ (nr_units_extent(target) - count) * sizeof(reiser4_extent);
47846+ } else {
47847+ reiser4_key key;
47848+ coord_t coord;
47849+
47850+ assert("vs-216",
47851+ from + count == coord_last_unit_pos(source) + 1);
47852+
47853+ from_ext += item_length_by_coord(source) - free_space;
47854+
47855+ /* new units are inserted before first unit in an item,
47856+ therefore, we have to update item key */
47857+ coord = *source;
47858+ coord.unit_pos = from;
47859+ unit_key_extent(&coord, &key);
47860+
47861+ node_plugin_by_node(target->node)->update_item_key(target, &key,
47862+ NULL /*info */);
47863+ }
47864+
47865+ memcpy(to_ext, from_ext, free_space);
47866+}
47867+
47868+/* item_plugin->b.create_hook
47869+ @arg is znode of leaf node for which we need to update right delimiting key */
47870+int create_hook_extent(const coord_t * coord, void *arg)
47871+{
47872+ coord_t *child_coord;
47873+ znode *node;
47874+ reiser4_key key;
47875+ reiser4_tree *tree;
47876+
47877+ if (!arg)
47878+ return 0;
47879+
47880+ child_coord = arg;
47881+ tree = znode_get_tree(coord->node);
47882+
47883+ assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
47884+
47885+ write_lock_tree(tree);
47886+ write_lock_dk(tree);
47887+ /* find the node on the left whose right delimiting key has to
47888+ be updated */
47889+ if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
47890+ assert("vs-411", znode_is_left_connected(child_coord->node));
47891+ node = child_coord->node->left;
47892+ } else {
47893+ assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
47894+ node = child_coord->node;
47895+ assert("nikita-3314", node != NULL);
47896+ }
47897+
47898+ if (node != NULL) {
47899+ znode_set_rd_key(node, item_key_by_coord(coord, &key));
47900+
47901+ assert("nikita-3282", check_sibling_list(node));
47902+ /* break sibling links */
47903+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
47904+ ON_DEBUG(node->right->left_version =
47905+ atomic_inc_return(&delim_key_version);
47906+ node->right_version =
47907+ atomic_inc_return(&delim_key_version););
47908+
47909+ node->right->left = NULL;
47910+ node->right = NULL;
47911+ }
47912+ }
47913+ write_unlock_dk(tree);
47914+ write_unlock_tree(tree);
47915+ return 0;
47916+}
47917+
47918+#define ITEM_TAIL_KILLED 0
47919+#define ITEM_HEAD_KILLED 1
47920+#define ITEM_KILLED 2
47921+
47922+/* item_plugin->b.kill_hook
47923+ this is called when @count units starting from @from-th one are going to be removed
47924+ */
47925+int
47926+kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
47927+ struct carry_kill_data *kdata)
47928+{
47929+ reiser4_extent *ext;
47930+ reiser4_block_nr start, length;
47931+ const reiser4_key *pfrom_key, *pto_key;
47932+ struct inode *inode;
47933+ reiser4_tree *tree;
47934+ pgoff_t from_off, to_off, offset, skip;
47935+ int retval;
47936+
47937+ /* these are located in memory kmalloc-ed by kill_node_content */
47938+ reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
47939+ coord_t *dup, *next;
47940+
47941+ assert("zam-811", znode_is_write_locked(coord->node));
47942+ assert("nikita-3315", kdata != NULL);
47943+ assert("vs-34", kdata->buf != NULL);
47944+
47945+ /* map structures to kdata->buf */
47946+ min_item_key = (reiser4_key *) (kdata->buf);
47947+ max_item_key = min_item_key + 1;
47948+ from_key = max_item_key + 1;
47949+ to_key = from_key + 1;
47950+ key = to_key + 1;
47951+ dup = (coord_t *) (key + 1);
47952+ next = dup + 1;
47953+
47954+ item_key_by_coord(coord, min_item_key);
47955+ max_item_key_by_coord(coord, max_item_key);
47956+
47957+ if (kdata->params.from_key) {
47958+ pfrom_key = kdata->params.from_key;
47959+ pto_key = kdata->params.to_key;
47960+ } else {
47961+ assert("vs-1549", from == coord->unit_pos);
47962+ unit_key_by_coord(coord, from_key);
47963+ pfrom_key = from_key;
47964+
47965+ coord_dup(dup, coord);
47966+ dup->unit_pos = from + count - 1;
47967+ max_unit_key_by_coord(dup, to_key);
47968+ pto_key = to_key;
47969+ }
47970+
47971+ if (!keylt(pto_key, max_item_key)) {
47972+ if (!keygt(pfrom_key, min_item_key)) {
47973+ znode *left, *right;
47974+
47975+ /* item is to be removed completely */
47976+ assert("nikita-3316", kdata->left != NULL
47977+ && kdata->right != NULL);
47978+
47979+ left = kdata->left->node;
47980+ right = kdata->right->node;
47981+
47982+ tree = current_tree;
47983+ /* we have to do two things:
47984+ *
47985+ * 1. link left and right formatted neighbors of
47986+ * extent being removed, and
47987+ *
47988+ * 2. update their delimiting keys.
47989+ *
47990+ * atomicity of these operations is protected by
47991+ * taking dk-lock and tree-lock.
47992+ */
47993+ /* if neighbors of item being removed are znodes -
47994+ * link them */
47995+ write_lock_tree(tree);
47996+ write_lock_dk(tree);
47997+ link_left_and_right(left, right);
47998+ if (left) {
47999+ /* update right delimiting key of left
48000+ * neighbor of extent item */
48001+ /*coord_t next;
48002+ reiser4_key key; */
48003+
48004+ coord_dup(next, coord);
48005+
48006+ if (coord_next_item(next))
48007+ *key = *znode_get_rd_key(coord->node);
48008+ else
48009+ item_key_by_coord(next, key);
48010+ znode_set_rd_key(left, key);
48011+ }
48012+ write_unlock_dk(tree);
48013+ write_unlock_tree(tree);
48014+
48015+ from_off =
48016+ get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
48017+ to_off =
48018+ (get_key_offset(max_item_key) +
48019+ 1) >> PAGE_CACHE_SHIFT;
48020+ retval = ITEM_KILLED;
48021+ } else {
48022+ /* tail of item is to be removed */
48023+ from_off =
48024+ (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
48025+ 1) >> PAGE_CACHE_SHIFT;
48026+ to_off =
48027+ (get_key_offset(max_item_key) +
48028+ 1) >> PAGE_CACHE_SHIFT;
48029+ retval = ITEM_TAIL_KILLED;
48030+ }
48031+ } else {
48032+ /* head of item is to be removed */
48033+ assert("vs-1571", keyeq(pfrom_key, min_item_key));
48034+ assert("vs-1572",
48035+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
48036+ 0);
48037+ assert("vs-1573",
48038+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
48039+ 1)) == 0);
48040+
48041+ if (kdata->left->node) {
48042+ /* update right delimiting key of left neighbor of extent item */
48043+ /*reiser4_key key; */
48044+
48045+ *key = *pto_key;
48046+ set_key_offset(key, get_key_offset(pto_key) + 1);
48047+
48048+ write_lock_dk(current_tree);
48049+ znode_set_rd_key(kdata->left->node, key);
48050+ write_unlock_dk(current_tree);
48051+ }
48052+
48053+ from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
48054+ to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
48055+ retval = ITEM_HEAD_KILLED;
48056+ }
48057+
48058+ inode = kdata->inode;
48059+ assert("vs-1545", inode != NULL);
48060+ if (inode != NULL)
48061+ /* take care of pages and jnodes corresponding to part of item being killed */
48062+ reiser4_invalidate_pages(inode->i_mapping, from_off,
48063+ to_off - from_off,
48064+ kdata->params.truncate);
48065+
48066+ ext = extent_item(coord) + from;
48067+ offset =
48068+ (get_key_offset(min_item_key) +
48069+ reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
48070+
48071+ assert("vs-1551", from_off >= offset);
48072+ assert("vs-1552", from_off - offset <= extent_get_width(ext));
48073+ skip = from_off - offset;
48074+ offset = from_off;
48075+
48076+ while (offset < to_off) {
48077+ length = extent_get_width(ext) - skip;
48078+ if (state_of_extent(ext) == HOLE_EXTENT) {
48079+ skip = 0;
48080+ offset += length;
48081+ ext++;
48082+ continue;
48083+ }
48084+
48085+ if (offset + length > to_off) {
48086+ length = to_off - offset;
48087+ }
48088+
48089+ DQUOT_FREE_BLOCK_NODIRTY(inode, length);
48090+
48091+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
48092+ /* return fake-allocated space of the jnodes of this unallocated extent */
48093+ fake_allocated2free(length, 0 /* unformatted */ );
48094+
48095+ skip = 0;
48096+ offset += length;
48097+ ext++;
48098+ continue;
48099+ }
48100+
48101+ assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
48102+
48103+ if (length != 0) {
48104+ start = extent_get_start(ext) + skip;
48105+
48106+ /* the BA_DEFER bit is turned on because blocks which get freed
48107+ are not safe to reuse immediately */
48108+ reiser4_dealloc_blocks(&start, &length,
48109+ 0 /* not used */ ,
48110+ BA_DEFER
48111+ /* unformatted with defer */ );
48112+ }
48113+ skip = 0;
48114+ offset += length;
48115+ ext++;
48116+ }
48117+ return retval;
48118+}
48119+
48120+/* item_plugin->b.kill_units */
48121+int
48122+kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48123+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
48124+ reiser4_key * new_first)
48125+{
48126+ reiser4_extent *ext;
48127+ reiser4_key item_key;
48128+ pos_in_node_t count;
48129+ reiser4_key from_key, to_key;
48130+ const reiser4_key *pfrom_key, *pto_key;
48131+ loff_t off;
48132+ int result;
48133+
48134+ assert("vs-1541",
48135+ ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
48136+ || (kdata->params.from_key != NULL
48137+ && kdata->params.to_key != NULL)));
48138+
48139+ if (kdata->params.from_key) {
48140+ pfrom_key = kdata->params.from_key;
48141+ pto_key = kdata->params.to_key;
48142+ } else {
48143+ coord_t dup;
48144+
48145+ /* calculate key range of kill */
48146+ assert("vs-1549", from == coord->unit_pos);
48147+ unit_key_by_coord(coord, &from_key);
48148+ pfrom_key = &from_key;
48149+
48150+ coord_dup(&dup, coord);
48151+ dup.unit_pos = to;
48152+ max_unit_key_by_coord(&dup, &to_key);
48153+ pto_key = &to_key;
48154+ }
48155+
48156+ item_key_by_coord(coord, &item_key);
48157+
48158+#if REISER4_DEBUG
48159+ {
48160+ reiser4_key max_item_key;
48161+
48162+ max_item_key_by_coord(coord, &max_item_key);
48163+
48164+ if (new_first) {
48165+ /* head of item is to be cut */
48166+ assert("vs-1542", keyeq(pfrom_key, &item_key));
48167+ assert("vs-1538", keylt(pto_key, &max_item_key));
48168+ } else {
48169+ /* tail of item is to be cut */
48170+ assert("vs-1540", keygt(pfrom_key, &item_key));
48171+ assert("vs-1543", !keylt(pto_key, &max_item_key));
48172+ }
48173+ }
48174+#endif
48175+
48176+ if (smallest_removed)
48177+ *smallest_removed = *pfrom_key;
48178+
48179+ if (new_first) {
48180+ /* item head is cut; the item key will change. The new key is calculated here */
48181+ assert("vs-1556",
48182+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
48183+ (PAGE_CACHE_SIZE - 1));
48184+ *new_first = *pto_key;
48185+ set_key_offset(new_first, get_key_offset(new_first) + 1);
48186+ }
48187+
48188+ count = to - from + 1;
48189+ result = kill_hook_extent(coord, from, count, kdata);
48190+ if (result == ITEM_TAIL_KILLED) {
48191+ assert("vs-1553",
48192+ get_key_offset(pfrom_key) >=
48193+ get_key_offset(&item_key) +
48194+ reiser4_extent_size(coord, from));
48195+ off =
48196+ get_key_offset(pfrom_key) -
48197+ (get_key_offset(&item_key) +
48198+ reiser4_extent_size(coord, from));
48199+ if (off) {
48200+ /* unit @from is to be cut partially. Its width decreases */
48201+ ext = extent_item(coord) + from;
48202+ extent_set_width(ext,
48203+ (off + PAGE_CACHE_SIZE -
48204+ 1) >> PAGE_CACHE_SHIFT);
48205+ count--;
48206+ }
48207+ } else {
48208+ __u64 max_to_offset;
48209+ __u64 rest;
48210+
48211+ assert("vs-1575", result == ITEM_HEAD_KILLED);
48212+ assert("", from == 0);
48213+ assert("",
48214+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
48215+ 1)) == 0);
48216+ assert("",
48217+ get_key_offset(pto_key) + 1 >
48218+ get_key_offset(&item_key) +
48219+ reiser4_extent_size(coord, to));
48220+ max_to_offset =
48221+ get_key_offset(&item_key) +
48222+ reiser4_extent_size(coord, to + 1) - 1;
48223+ assert("", get_key_offset(pto_key) <= max_to_offset);
48224+
48225+ rest =
48226+ (max_to_offset -
48227+ get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
48228+ if (rest) {
48229+ /* unit @to is to be cut partially */
48230+ ext = extent_item(coord) + to;
48231+
48232+ assert("", extent_get_width(ext) > rest);
48233+
48234+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
48235+ extent_set_start(ext,
48236+ extent_get_start(ext) +
48237+ (extent_get_width(ext) -
48238+ rest));
48239+
48240+ extent_set_width(ext, rest);
48241+ count--;
48242+ }
48243+ }
48244+ return count * sizeof(reiser4_extent);
48245+}
48246+
48247+/* item_plugin->b.cut_units
48248+ this is too similar to kill_units_extent */
48249+int
48250+cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48251+ struct carry_cut_data *cdata, reiser4_key * smallest_removed,
48252+ reiser4_key * new_first)
48253+{
48254+ reiser4_extent *ext;
48255+ reiser4_key item_key;
48256+ pos_in_node_t count;
48257+ reiser4_key from_key, to_key;
48258+ const reiser4_key *pfrom_key, *pto_key;
48259+ loff_t off;
48260+
48261+ assert("vs-1541",
48262+ ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
48263+ || (cdata->params.from_key != NULL
48264+ && cdata->params.to_key != NULL)));
48265+
48266+ if (cdata->params.from_key) {
48267+ pfrom_key = cdata->params.from_key;
48268+ pto_key = cdata->params.to_key;
48269+ } else {
48270+ coord_t dup;
48271+
48272+ /* calculate key range of kill */
48273+ coord_dup(&dup, coord);
48274+ dup.unit_pos = from;
48275+ unit_key_by_coord(&dup, &from_key);
48276+
48277+ dup.unit_pos = to;
48278+ max_unit_key_by_coord(&dup, &to_key);
48279+
48280+ pfrom_key = &from_key;
48281+ pto_key = &to_key;
48282+ }
48283+
48284+ assert("vs-1555",
48285+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
48286+ assert("vs-1556",
48287+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
48288+ (PAGE_CACHE_SIZE - 1));
48289+
48290+ item_key_by_coord(coord, &item_key);
48291+
48292+#if REISER4_DEBUG
48293+ {
48294+ reiser4_key max_item_key;
48295+
48296+ assert("vs-1584",
48297+ get_key_locality(pfrom_key) ==
48298+ get_key_locality(&item_key));
48299+ assert("vs-1585",
48300+ get_key_type(pfrom_key) == get_key_type(&item_key));
48301+ assert("vs-1586",
48302+ get_key_objectid(pfrom_key) ==
48303+ get_key_objectid(&item_key));
48304+ assert("vs-1587",
48305+ get_key_ordering(pfrom_key) ==
48306+ get_key_ordering(&item_key));
48307+
48308+ max_item_key_by_coord(coord, &max_item_key);
48309+
48310+ if (new_first != NULL) {
48311+ /* head of item is to be cut */
48312+ assert("vs-1542", keyeq(pfrom_key, &item_key));
48313+ assert("vs-1538", keylt(pto_key, &max_item_key));
48314+ } else {
48315+ /* tail of item is to be cut */
48316+ assert("vs-1540", keygt(pfrom_key, &item_key));
48317+ assert("vs-1543", keyeq(pto_key, &max_item_key));
48318+ }
48319+ }
48320+#endif
48321+
48322+ if (smallest_removed)
48323+ *smallest_removed = *pfrom_key;
48324+
48325+ if (new_first) {
48326+ /* item head is cut; the item key will change. The new key is calculated here */
48327+ *new_first = *pto_key;
48328+ set_key_offset(new_first, get_key_offset(new_first) + 1);
48329+ }
48330+
48331+ count = to - from + 1;
48332+
48333+ assert("vs-1553",
48334+ get_key_offset(pfrom_key) >=
48335+ get_key_offset(&item_key) + reiser4_extent_size(coord, from));
48336+ off =
48337+ get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
48338+ reiser4_extent_size(coord, from));
48339+ if (off) {
48340+ /* tail of unit @from is to be cut partially. Its width decreases */
48341+ assert("vs-1582", new_first == NULL);
48342+ ext = extent_item(coord) + from;
48343+ extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
48344+ count--;
48345+ }
48346+
48347+ assert("vs-1554",
48348+ get_key_offset(pto_key) <=
48349+ get_key_offset(&item_key) +
48350+ reiser4_extent_size(coord, to + 1) - 1);
48351+ off =
48352+ (get_key_offset(&item_key) +
48353+ reiser4_extent_size(coord, to + 1) - 1) -
48354+ get_key_offset(pto_key);
48355+ if (off) {
48356+ /* @to_key is smaller than the max key of unit @to. Unit @to is not removed;
48357+ its start increases and its width decreases. */
48358+ assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
48359+ ext = extent_item(coord) + to;
48360+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
48361+ extent_set_start(ext,
48362+ extent_get_start(ext) +
48363+ (extent_get_width(ext) -
48364+ (off >> PAGE_CACHE_SHIFT)));
48365+
48366+ extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
48367+ count--;
48368+ }
48369+ return count * sizeof(reiser4_extent);
48370+}
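When the cut range merely clips a unit, the unit survives with adjusted geometry: a clipped tail keeps its start and shrinks its width, while a clipped head of an allocated extent also advances its start past the removed blocks, exactly as in the two partial-cut branches above. A standalone model of both adjustments under the page-aligned assumptions asserted earlier (names are illustrative):

#include <stdio.h>
#include <stdint.h>

#define MODEL_PAGE_SHIFT 12

struct extent {
        uint64_t start, width;                  /* in blocks */
};

/* keep the first 'keep_bytes' of 'e': the tail of the unit is cut */
static void cut_tail(struct extent *e, uint64_t keep_bytes)
{
        e->width = keep_bytes >> MODEL_PAGE_SHIFT;   /* start is unchanged */
}

/* keep the last 'keep_bytes' of 'e': the head of the unit is cut */
static void cut_head(struct extent *e, uint64_t keep_bytes)
{
        uint64_t keep = keep_bytes >> MODEL_PAGE_SHIFT;

        e->start += e->width - keep;            /* skip the removed blocks */
        e->width = keep;
}

int main(void)
{
        struct extent a = { 100, 8 }, b = { 200, 8 };

        cut_tail(&a, (uint64_t)3 << MODEL_PAGE_SHIFT);   /* keep first 3 blocks */
        cut_head(&b, (uint64_t)2 << MODEL_PAGE_SHIFT);   /* keep last 2 blocks */
        printf("a: %llu+%llu  b: %llu+%llu\n",
               (unsigned long long)a.start, (unsigned long long)a.width,
               (unsigned long long)b.start, (unsigned long long)b.width);
        return 0;                               /* a: 100+3  b: 206+2 */
}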
48371+
48372+/* item_plugin->b.unit_key */
48373+reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
48374+{
48375+ assert("vs-300", coord_is_existing_unit(coord));
48376+
48377+ item_key_by_coord(coord, key);
48378+ set_key_offset(key,
48379+ (get_key_offset(key) +
48380+ reiser4_extent_size(coord, coord->unit_pos)));
48381+
48382+ return key;
48383+}
48384+
48385+/* item_plugin->b.max_unit_key */
48386+reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
48387+{
48388+ assert("vs-300", coord_is_existing_unit(coord));
48389+
48390+ item_key_by_coord(coord, key);
48391+ set_key_offset(key,
48392+ (get_key_offset(key) +
48393+ reiser4_extent_size(coord, coord->unit_pos + 1) - 1));
48394+ return key;
48395+}
48396+
48397+/* item_plugin->b.estimate
48398+ item_plugin->b.item_data_by_flow */
48399+
48400+#if REISER4_DEBUG
48401+
48402+/* item_plugin->b.check
48403+ used for debugging; every item should implement here the most complete
48404+ possible check of the consistency of the item that its author can
48405+ construct
48406+*/
48407+int reiser4_check_extent(const coord_t * coord /* coord of item to check */,
48408+ const char **error /* where to store error message */)
48409+{
48410+ reiser4_extent *ext, *first;
48411+ unsigned i, j;
48412+ reiser4_block_nr start, width, blk_cnt;
48413+ unsigned num_units;
48414+ reiser4_tree *tree;
48415+ oid_t oid;
48416+ reiser4_key key;
48417+ coord_t scan;
48418+
48419+ assert("vs-933", REISER4_DEBUG);
48420+
48421+ if (znode_get_level(coord->node) != TWIG_LEVEL) {
48422+ *error = "Extent on the wrong level";
48423+ return -1;
48424+ }
48425+ if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
48426+ *error = "Wrong item size";
48427+ return -1;
48428+ }
48429+ ext = first = extent_item(coord);
48430+ blk_cnt = reiser4_block_count(reiser4_get_current_sb());
48431+ num_units = coord_num_units(coord);
48432+ tree = znode_get_tree(coord->node);
48433+ item_key_by_coord(coord, &key);
48434+ oid = get_key_objectid(&key);
48435+ coord_dup(&scan, coord);
48436+
48437+ for (i = 0; i < num_units; ++i, ++ext) {
48438+ __u64 index;
48439+
48440+ scan.unit_pos = i;
48441+ index = extent_unit_index(&scan);
48442+
48443+#if 0
48444+ /* check that all jnodes are present for the unallocated
48445+ * extent */
48446+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
48447+ for (j = 0; j < extent_get_width(ext); j++) {
48448+ jnode *node;
48449+
48450+ node = jlookup(tree, oid, index + j);
48451+ if (node == NULL) {
48452+ print_coord("scan", &scan, 0);
48453+ *error = "Jnode missing";
48454+ return -1;
48455+ }
48456+ jput(node);
48457+ }
48458+ }
48459+#endif
48460+
48461+ start = extent_get_start(ext);
48462+ if (start < 2)
48463+ continue;
48464+ /* this is an allocated extent */
48465+ width = extent_get_width(ext);
48466+ if (start >= blk_cnt) {
48467+ *error = "Start too large";
48468+ return -1;
48469+ }
48470+ if (start + width > blk_cnt) {
48471+ *error = "End too large";
48472+ return -1;
48473+ }
48474+ /* make sure that this extent does not overlap with other
48475+ allocated extents */
48476+ for (j = 0; j < i; j++) {
48477+ if (state_of_extent(first + j) != ALLOCATED_EXTENT)
48478+ continue;
48479+ if (!
48480+ ((extent_get_start(ext) >=
48481+ extent_get_start(first + j) +
48482+ extent_get_width(first + j))
48483+ || (extent_get_start(ext) +
48484+ extent_get_width(ext) <=
48485+ extent_get_start(first + j)))) {
48486+ *error = "Extent overlaps with others";
48487+ return -1;
48488+ }
48489+ }
48490+
48491+ }
48492+
48493+ return 0;
48494+}
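The overlap test above is the negation of the disjointness condition: ranges [s1, s1+w1) and [s2, s2+w2) are disjoint exactly when one ends at or before the other begins. In standalone form:

#include <assert.h>
#include <stdint.h>

/* 1 if block ranges [s1, s1+w1) and [s2, s2+w2) share any block */
static int extents_overlap(uint64_t s1, uint64_t w1, uint64_t s2, uint64_t w2)
{
        return !(s1 >= s2 + w2 || s1 + w1 <= s2);
}

int main(void)
{
        assert(!extents_overlap(10, 5, 15, 5));  /* touching, not overlapping */
        assert(extents_overlap(10, 5, 14, 5));   /* share block 14 */
        return 0;
}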
48495+
48496+#endif /* REISER4_DEBUG */
48497+
48498+/*
48499+ Local variables:
48500+ c-indentation-style: "K&R"
48501+ mode-name: "LC"
48502+ c-basic-offset: 8
48503+ tab-width: 8
48504+ fill-column: 120
48505+ scroll-step: 1
48506+ End:
48507+*/
48508diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/internal.c linux-2.6.23/fs/reiser4/plugin/item/internal.c
48509--- linux-2.6.23.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 03:00:00.000000000 +0300
48510+++ linux-2.6.23/fs/reiser4/plugin/item/internal.c 2007-12-04 16:49:30.000000000 +0300
48511@@ -0,0 +1,396 @@
48512+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48513+
48514+/* Implementation of internal-item plugin methods. */
48515+
48516+#include "../../forward.h"
48517+#include "../../debug.h"
48518+#include "../../dformat.h"
48519+#include "../../key.h"
48520+#include "../../coord.h"
48521+#include "internal.h"
48522+#include "item.h"
48523+#include "../node/node.h"
48524+#include "../plugin.h"
48525+#include "../../jnode.h"
48526+#include "../../znode.h"
48527+#include "../../tree_walk.h"
48528+#include "../../tree_mod.h"
48529+#include "../../tree.h"
48530+#include "../../super.h"
48531+#include "../../block_alloc.h"
48532+
48533+/* see internal.h for explanation */
48534+
48535+/* plugin->u.item.b.mergeable */
48536+int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
48537+ const coord_t * p2 UNUSED_ARG /* second item */ )
48538+{
48539+ /* internal items are not mergeable */
48540+ return 0;
48541+}
48542+
48543+/* ->lookup() method for internal items */
48544+lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
48545+ lookup_bias bias UNUSED_ARG /* lookup bias */ ,
48546+ coord_t * coord /* coord of item */ )
48547+{
48548+ reiser4_key ukey;
48549+
48550+ switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
48551+ default:
48552+ impossible("", "keycmp()?!");
48553+ case LESS_THAN:
48554+ /* FIXME-VS: AFTER_ITEM used to be here. But with new coord
48555+ item plugin can not be taken using coord set this way */
48556+ assert("vs-681", coord->unit_pos == 0);
48557+ coord->between = AFTER_UNIT;
48558+ case EQUAL_TO:
48559+ return CBK_COORD_FOUND;
48560+ case GREATER_THAN:
48561+ return CBK_COORD_NOTFOUND;
48562+ }
48563+}
48564+
48565+/* return body of internal item at @coord */
48566+static internal_item_layout *internal_at(const coord_t * coord /* coord of
48567+ * item */ )
48568+{
48569+ assert("nikita-607", coord != NULL);
48570+ assert("nikita-1650",
48571+ item_plugin_by_coord(coord) ==
48572+ item_plugin_by_id(NODE_POINTER_ID));
48573+ return (internal_item_layout *) item_body_by_coord(coord);
48574+}
48575+
48576+void reiser4_update_internal(const coord_t * coord,
48577+ const reiser4_block_nr * blocknr)
48578+{
48579+ internal_item_layout *item = internal_at(coord);
48580+ assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
48581+
48582+ put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
48583+}
48584+
48585+/* return child block number stored in the internal item at @coord */
48586+static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
48587+{
48588+ assert("nikita-608", coord != NULL);
48589+ return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
48590+}
48591+
48592+/* get znode pointed to by internal @item */
48593+static znode *znode_at(const coord_t * item /* coord of item */ ,
48594+ znode * parent /* parent node */ )
48595+{
48596+ return child_znode(item, parent, 1, 0);
48597+}
48598+
48599+/* store pointer from internal item into "block". Implementation of
48600+ ->down_link() method */
48601+void down_link_internal(const coord_t * coord /* coord of item */ ,
48602+ const reiser4_key * key UNUSED_ARG /* key to get
48603+ * pointer for */ ,
48604+ reiser4_block_nr * block /* resulting block number */ )
48605+{
48606+ ON_DEBUG(reiser4_key item_key);
48607+
48608+ assert("nikita-609", coord != NULL);
48609+ assert("nikita-611", block != NULL);
48610+ assert("nikita-612", (key == NULL) ||
48611+ /* twig horrors */
48612+ (znode_get_level(coord->node) == TWIG_LEVEL)
48613+ || keyle(item_key_by_coord(coord, &item_key), key));
48614+
48615+ *block = pointer_at(coord);
48616+ assert("nikita-2960", reiser4_blocknr_is_sane(block));
48617+}
48618+
48619+/* Get the child's block number, or 0 if the block is unallocated. */
48620+int
48621+utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
48622+ reiser4_block_nr * block)
48623+{
48624+ assert("jmacd-2059", coord != NULL);
48625+
48626+ *block = pointer_at(coord);
48627+ assert("nikita-2961", reiser4_blocknr_is_sane(block));
48628+
48629+ if (reiser4_blocknr_is_fake(block)) {
48630+ *block = 0;
48631+ }
48632+
48633+ return 0;
48634+}
48635+
48636+/* Return the child. */
48637+int
48638+utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
48639+ jnode ** childp)
48640+{
48641+ reiser4_block_nr block = pointer_at(coord);
48642+ znode *child;
48643+
48644+ assert("jmacd-2059", childp != NULL);
48645+ assert("nikita-2962", reiser4_blocknr_is_sane(&block));
48646+
48647+ child = zlook(znode_get_tree(coord->node), &block);
48648+
48649+ if (IS_ERR(child)) {
48650+ return PTR_ERR(child);
48651+ }
48652+
48653+ *childp = ZJNODE(child);
48654+
48655+ return 0;
48656+}
48657+
48658+#if REISER4_DEBUG
48659+
48660+static void check_link(znode * left, znode * right)
48661+{
48662+ znode *scan;
48663+
48664+ for (scan = left; scan != right; scan = scan->right) {
48665+ if (ZF_ISSET(scan, JNODE_RIP))
48666+ break;
48667+ if (znode_is_right_connected(scan) && scan->right != NULL) {
48668+ if (ZF_ISSET(scan->right, JNODE_RIP))
48669+ break;
48670+ assert("nikita-3285",
48671+ znode_is_left_connected(scan->right));
48672+ assert("nikita-3265",
48673+ ergo(scan != left,
48674+ ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
48675+ assert("nikita-3284", scan->right->left == scan);
48676+ } else
48677+ break;
48678+ }
48679+}
48680+
48681+int check__internal(const coord_t * coord, const char **error)
48682+{
48683+ reiser4_block_nr blk;
48684+ znode *child;
48685+ coord_t cpy;
48686+
48687+ blk = pointer_at(coord);
48688+ if (!reiser4_blocknr_is_sane(&blk)) {
48689+ *error = "Invalid pointer";
48690+ return -1;
48691+ }
48692+ coord_dup(&cpy, coord);
48693+ child = znode_at(&cpy, cpy.node);
48694+ if (child != NULL) {
48695+ znode *left_child;
48696+ znode *right_child;
48697+
48698+ left_child = right_child = NULL;
48699+
48700+ assert("nikita-3256", znode_invariant(child));
48701+ if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
48702+ left_child = znode_at(&cpy, cpy.node);
48703+ if (left_child != NULL) {
48704+ read_lock_tree(znode_get_tree(child));
48705+ check_link(left_child, child);
48706+ read_unlock_tree(znode_get_tree(child));
48707+ zput(left_child);
48708+ }
48709+ }
48710+ coord_dup(&cpy, coord);
48711+ if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
48712+ right_child = znode_at(&cpy, cpy.node);
48713+ if (right_child != NULL) {
48714+ read_lock_tree(znode_get_tree(child));
48715+ check_link(child, right_child);
48716+ read_unlock_tree(znode_get_tree(child));
48717+ zput(right_child);
48718+ }
48719+ }
48720+ zput(child);
48721+ }
48722+ return 0;
48723+}
48724+
48725+#endif /* REISER4_DEBUG */
48726+
48727+/* return true only if this item really points to "block" */
48728+/* Audited by: green(2002.06.14) */
48729+int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
48730+ const reiser4_block_nr * block /* block number to
48731+ * check */ )
48732+{
48733+ assert("nikita-613", coord != NULL);
48734+ assert("nikita-614", block != NULL);
48735+
48736+ return pointer_at(coord) == *block;
48737+}
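+/*
+ * Editor's note: a minimal sketch (not part of the original patch) showing
+ * how the two methods above pair up during tree traversal: ->down_link()
+ * yields the child block an internal item points to, and ->has_pointer_to()
+ * re-checks that linkage. The helper name and assert label are hypothetical.
+ * Kept under #if 0 because it is illustrative only.
+ */
+#if 0
+static int follow_down_link_sketch(const coord_t *coord,
+				   const reiser4_key *key,
+				   reiser4_block_nr *child_blk)
+{
+	/* read the child block number stored in the internal item */
+	down_link_internal(coord, key, child_blk);
+	/* the item must agree that it points to that very block */
+	assert("editor-1", has_pointer_to_internal(coord, child_blk));
+	return reiser4_blocknr_is_sane(child_blk);
+}
+#endif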
48738+
48739+/* hook called by the ->create_item() method of the node plugin after a new
48740+   internal item has just been created.
48741+
48742+   This is the point where the pointer to the new node is inserted into the
48743+   tree. Initialize the parent pointer in the child znode, and insert the
48744+   child into the sibling list and the slum.
48745+*/
48746+int create_hook_internal(const coord_t * item /* coord of item */ ,
48747+ void *arg /* child's left neighbor, if any */ )
48748+{
48749+ znode *child;
48750+ __u64 child_ptr;
48751+
48752+ assert("nikita-1252", item != NULL);
48753+ assert("nikita-1253", item->node != NULL);
48754+ assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
48755+ assert("nikita-1450", item->unit_pos == 0);
48756+
48757+	/*
48758+	 * While preparing for item insertion, build_child_ptr_data() points
48759+	 * the data to be inserted at the jnode's block number, which is in
48760+	 * CPU byte order. The node's create_item() simply copies that data,
48761+	 * so we end up with a child pointer in CPU byte order. Convert the
48762+	 * content of the internal item to little-endian byte order.
48763+	 */
48764+ child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
48765+ reiser4_update_internal(item, &child_ptr);
48766+
48767+ child = znode_at(item, item->node);
48768+ if (child != NULL && !IS_ERR(child)) {
48769+ znode *left;
48770+ int result = 0;
48771+ reiser4_tree *tree;
48772+
48773+ left = arg;
48774+ tree = znode_get_tree(item->node);
48775+ write_lock_tree(tree);
48776+ write_lock_dk(tree);
48777+ assert("nikita-1400", (child->in_parent.node == NULL)
48778+ || (znode_above_root(child->in_parent.node)));
48779+ ++item->node->c_count;
48780+ coord_to_parent_coord(item, &child->in_parent);
48781+ sibling_list_insert_nolock(child, left);
48782+
48783+ assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
48784+ ZF_CLR(child, JNODE_ORPHAN);
48785+
48786+ if ((left != NULL) && !keyeq(znode_get_rd_key(left),
48787+ znode_get_rd_key(child))) {
48788+ znode_set_rd_key(child, znode_get_rd_key(left));
48789+ }
48790+ write_unlock_dk(tree);
48791+ write_unlock_tree(tree);
48792+ zput(child);
48793+ return result;
48794+ } else {
48795+ if (child == NULL)
48796+ child = ERR_PTR(-EIO);
48797+ return PTR_ERR(child);
48798+ }
48799+}
48800+
48801+/* hook called by the ->cut_and_kill() method of the node plugin just before
48802+   an internal item is removed.
48803+
48804+   This is the point where an empty node is removed from the tree. Clear the
48805+   parent pointer in the child, and mark the node for pending deletion.
48806+
48807+   The node will actually be deleted later, in several stages:
48808+
48809+   . when the last lock on this node is released, the node is removed from
48810+   the sibling list and its lock is invalidated
48811+
48812+   . when the last reference to this node is dropped, the bitmap is updated
48813+   and the node is actually removed from memory.
48814+
48815+*/
48816+int kill_hook_internal(const coord_t * item /* coord of item */ ,
48817+ pos_in_node_t from UNUSED_ARG /* start unit */ ,
48818+ pos_in_node_t count UNUSED_ARG /* stop unit */ ,
48819+ struct carry_kill_data *p UNUSED_ARG)
48820+{
48821+ znode *child;
48822+
48823+ assert("nikita-1222", item != NULL);
48824+ assert("nikita-1224", from == 0);
48825+ assert("nikita-1225", count == 1);
48826+
48827+ child = znode_at(item, item->node);
48828+ if (IS_ERR(child))
48829+ return PTR_ERR(child);
48830+ else if (node_is_empty(child)) {
48831+ reiser4_tree *tree;
48832+
48833+ assert("nikita-1397", znode_is_write_locked(child));
48834+ assert("nikita-1398", child->c_count == 0);
48835+ assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
48836+
48837+ tree = znode_get_tree(item->node);
48838+ write_lock_tree(tree);
48839+ init_parent_coord(&child->in_parent, NULL);
48840+ --item->node->c_count;
48841+ write_unlock_tree(tree);
48842+ zput(child);
48843+ return 0;
48844+ } else {
48845+ warning("nikita-1223",
48846+			"Cowardly refusing to remove link to non-empty node");
48847+ zput(child);
48848+ return RETERR(-EIO);
48849+ }
48850+}
48851+
48852+/* hook called by the ->shift() node plugin method when an internal item has
48853+   just been moved from one node to another.
48854+
48855+   Update the parent pointer in the child and the c_counts in the old and new
48856+   parents.
48857+*/
48858+int shift_hook_internal(const coord_t * item /* coord of item */ ,
48859+ unsigned from UNUSED_ARG /* start unit */ ,
48860+ unsigned count UNUSED_ARG /* stop unit */ ,
48861+ znode * old_node /* old parent */ )
48862+{
48863+ znode *child;
48864+ znode *new_node;
48865+ reiser4_tree *tree;
48866+
48867+ assert("nikita-1276", item != NULL);
48868+ assert("nikita-1277", from == 0);
48869+ assert("nikita-1278", count == 1);
48870+ assert("nikita-1451", item->unit_pos == 0);
48871+
48872+ new_node = item->node;
48873+ assert("nikita-2132", new_node != old_node);
48874+ tree = znode_get_tree(item->node);
48875+ child = child_znode(item, old_node, 1, 0);
48876+ if (child == NULL)
48877+ return 0;
48878+ if (!IS_ERR(child)) {
48879+ write_lock_tree(tree);
48880+ ++new_node->c_count;
48881+ assert("nikita-1395", znode_parent(child) == old_node);
48882+ assert("nikita-1396", old_node->c_count > 0);
48883+ coord_to_parent_coord(item, &child->in_parent);
48884+ assert("nikita-1781", znode_parent(child) == new_node);
48885+ assert("nikita-1782",
48886+ check_tree_pointer(item, child) == NS_FOUND);
48887+ --old_node->c_count;
48888+ write_unlock_tree(tree);
48889+ zput(child);
48890+ return 0;
48891+ } else
48892+ return PTR_ERR(child);
48893+}
48894+
48895+/* plugin->u.item.b.max_key_inside - not defined */
48896+
48897+/* plugin->u.item.b.nr_units - item.c:single_unit */
48898+
48899+/* Make Linus happy.
48900+ Local variables:
48901+ c-indentation-style: "K&R"
48902+ mode-name: "LC"
48903+ c-basic-offset: 8
48904+ tab-width: 8
48905+ fill-column: 120
48906+ End:
48907+*/
48908diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/internal.h linux-2.6.23/fs/reiser4/plugin/item/internal.h
48909--- linux-2.6.23.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 03:00:00.000000000 +0300
48910+++ linux-2.6.23/fs/reiser4/plugin/item/internal.h 2007-12-04 16:49:30.000000000 +0300
48911@@ -0,0 +1,57 @@
48912+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48913+/* An internal item contains a down-link to the child of an internal/twig
48914+   node in the tree. It is internal items that are actually used during
48915+   tree traversal. */
48916+
48917+#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
48918+#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
48919+
48920+#include "../../forward.h"
48921+#include "../../dformat.h"
48922+
48923+/* on-disk layout of internal item */
48924+typedef struct internal_item_layout {
48925+ /* 0 */ reiser4_dblock_nr pointer;
48926+ /* 4 */
48927+} internal_item_layout;
48928+
48929+struct cut_list;
48930+
48931+int mergeable_internal(const coord_t * p1, const coord_t * p2);
48932+lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
48933+ coord_t * coord);
48934+/* store pointer from internal item into "block". Implementation of
48935+ ->down_link() method */
48936+extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
48937+ reiser4_block_nr * block);
48938+extern int has_pointer_to_internal(const coord_t * coord,
48939+ const reiser4_block_nr * block);
48940+extern int create_hook_internal(const coord_t * item, void *arg);
48941+extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
48942+ pos_in_node_t count, struct carry_kill_data *);
48943+extern int shift_hook_internal(const coord_t * item, unsigned from,
48944+ unsigned count, znode * old_node);
48945+extern void reiser4_print_internal(const char *prefix, coord_t * coord);
48946+
48947+extern int utmost_child_internal(const coord_t * coord, sideof side,
48948+ jnode ** child);
48949+int utmost_child_real_block_internal(const coord_t * coord, sideof side,
48950+ reiser4_block_nr * block);
48951+
48952+extern void reiser4_update_internal(const coord_t * coord,
48953+ const reiser4_block_nr * blocknr);
48954+/* FIXME: reiserfs has check_internal */
48955+extern int check__internal(const coord_t * coord, const char **error);
48956+
48957+/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
48958+#endif
48959+
48960+/* Make Linus happy.
48961+ Local variables:
48962+ c-indentation-style: "K&R"
48963+ mode-name: "LC"
48964+ c-basic-offset: 8
48965+ tab-width: 8
48966+ fill-column: 120
48967+ End:
48968+*/
48969diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/item.c linux-2.6.23/fs/reiser4/plugin/item/item.c
48970--- linux-2.6.23.orig/fs/reiser4/plugin/item/item.c 1970-01-01 03:00:00.000000000 +0300
48971+++ linux-2.6.23/fs/reiser4/plugin/item/item.c 2007-12-04 16:49:30.000000000 +0300
48972@@ -0,0 +1,719 @@
48973+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48974+
48975+/* definition of item plugins. */
48976+
48977+#include "../../forward.h"
48978+#include "../../debug.h"
48979+#include "../../key.h"
48980+#include "../../coord.h"
48981+#include "../plugin_header.h"
48982+#include "sde.h"
48983+#include "internal.h"
48984+#include "item.h"
48985+#include "static_stat.h"
48986+#include "../plugin.h"
48987+#include "../../znode.h"
48988+#include "../../tree.h"
48989+#include "../../context.h"
48990+#include "ctail.h"
48991+
48992+/* compute and cache the offset of the item body (slow path of item_body_by_coord()) */
48993+void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
48994+{
48995+ assert("nikita-324", coord != NULL);
48996+ assert("nikita-325", coord->node != NULL);
48997+ assert("nikita-326", znode_is_loaded(coord->node));
48998+ assert("nikita-3200", coord->offset == INVALID_OFFSET);
48999+
49000+ coord->offset =
49001+ node_plugin_by_node(coord->node)->item_by_coord(coord) -
49002+ zdata(coord->node);
49003+ ON_DEBUG(coord->body_v = coord->node->times_locked);
49004+}
49005+
49006+void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
49007+{
49008+ return zdata(coord->node) + coord->offset;
49009+}
49010+
49011+#if REISER4_DEBUG
49012+
49013+int item_body_is_valid(const coord_t * coord)
49014+{
49015+ return
49016+ coord->offset ==
49017+ node_plugin_by_node(coord->node)->item_by_coord(coord) -
49018+ zdata(coord->node);
49019+}
49020+
49021+#endif
49022+
49023+/* return length of item at @coord */
49024+pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
49025+{
49026+ int len;
49027+
49028+ assert("nikita-327", coord != NULL);
49029+ assert("nikita-328", coord->node != NULL);
49030+ assert("nikita-329", znode_is_loaded(coord->node));
49031+
49032+ len = node_plugin_by_node(coord->node)->length_by_coord(coord);
49033+ return len;
49034+}
49035+
49036+void obtain_item_plugin(const coord_t * coord)
49037+{
49038+ assert("nikita-330", coord != NULL);
49039+ assert("nikita-331", coord->node != NULL);
49040+ assert("nikita-332", znode_is_loaded(coord->node));
49041+
49042+ coord_set_iplug((coord_t *) coord,
49043+ node_plugin_by_node(coord->node)->
49044+ plugin_by_coord(coord));
49045+ assert("nikita-2479",
49046+ coord_iplug(coord) ==
49047+ node_plugin_by_node(coord->node)->plugin_by_coord(coord));
49048+}
49049+
49050+/* return id of item */
49051+/* Audited by: green(2002.06.15) */
49052+item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
49053+{
49054+ assert("vs-539", coord != NULL);
49055+ assert("vs-538", coord->node != NULL);
49056+ assert("vs-537", znode_is_loaded(coord->node));
49057+ assert("vs-536", item_plugin_by_coord(coord) != NULL);
49058+ assert("vs-540",
49059+ item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
49060+
49061+ return item_id_by_plugin(item_plugin_by_coord(coord));
49062+}
49063+
49064+/* return key of item at @coord */
49065+/* Audited by: green(2002.06.15) */
49066+reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
49067+ reiser4_key * key /* result */ )
49068+{
49069+ assert("nikita-338", coord != NULL);
49070+ assert("nikita-339", coord->node != NULL);
49071+ assert("nikita-340", znode_is_loaded(coord->node));
49072+
49073+ return node_plugin_by_node(coord->node)->key_at(coord, key);
49074+}
49075+
49076+/* this returns max key in the item */
49077+reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
49078+ reiser4_key * key /* result */ )
49079+{
49080+ coord_t last;
49081+
49082+ assert("nikita-338", coord != NULL);
49083+ assert("nikita-339", coord->node != NULL);
49084+ assert("nikita-340", znode_is_loaded(coord->node));
49085+
49086+	/* make coord point to the last unit of the item */
49087+ coord_dup(&last, coord);
49088+ last.unit_pos = coord_num_units(&last) - 1;
49089+ assert("vs-1560", coord_is_existing_unit(&last));
49090+
49091+ max_unit_key_by_coord(&last, key);
49092+ return key;
49093+}
49094+
49095+/* return key of unit at @coord */
49096+reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
49097+ reiser4_key * key /* result */ )
49098+{
49099+ assert("nikita-772", coord != NULL);
49100+ assert("nikita-774", coord->node != NULL);
49101+ assert("nikita-775", znode_is_loaded(coord->node));
49102+
49103+ if (item_plugin_by_coord(coord)->b.unit_key != NULL)
49104+ return item_plugin_by_coord(coord)->b.unit_key(coord, key);
49105+ else
49106+ return item_key_by_coord(coord, key);
49107+}
49108+
49109+/* return the biggest key contained in the unit @coord */
49110+reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
49111+ reiser4_key * key /* result */ )
49112+{
49113+ assert("nikita-772", coord != NULL);
49114+ assert("nikita-774", coord->node != NULL);
49115+ assert("nikita-775", znode_is_loaded(coord->node));
49116+
49117+ if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
49118+ return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
49119+ else
49120+ return unit_key_by_coord(coord, key);
49121+}
49122+
49123+/* ->max_key_inside() method for items consisting of exactly one key (like
49124+ stat-data) */
49125+static reiser4_key *max_key_inside_single_key(const coord_t *
49126+ coord /* coord of item */ ,
49127+ reiser4_key *
49128+ result /* resulting key */ )
49129+{
49130+ assert("nikita-604", coord != NULL);
49131+
49132+ /* coord -> key is starting key of this item and it has to be already
49133+ filled in */
49134+ return unit_key_by_coord(coord, result);
49135+}
49136+
49137+/* ->nr_units() method for items that always consist of exactly one unit */
49138+pos_in_node_t
49139+nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
49140+{
49141+ return 1;
49142+}
49143+
49144+static int
49145+paste_no_paste(coord_t * coord UNUSED_ARG,
49146+ reiser4_item_data * data UNUSED_ARG,
49147+ carry_plugin_info * info UNUSED_ARG)
49148+{
49149+ return 0;
49150+}
49151+
49152+/* default ->fast_paste() method */
49153+static int
49154+agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
49155+{
49156+ return 1;
49157+}
49158+
49159+int item_can_contain_key(const coord_t * item /* coord of item */ ,
49160+ const reiser4_key * key /* key to check */ ,
49161+ const reiser4_item_data * data /* parameters of item
49162+ * being created */ )
49163+{
49164+ item_plugin *iplug;
49165+ reiser4_key min_key_in_item;
49166+ reiser4_key max_key_in_item;
49167+
49168+ assert("nikita-1658", item != NULL);
49169+ assert("nikita-1659", key != NULL);
49170+
49171+ iplug = item_plugin_by_coord(item);
49172+ if (iplug->b.can_contain_key != NULL)
49173+ return iplug->b.can_contain_key(item, key, data);
49174+ else {
49175+ assert("nikita-1681", iplug->b.max_key_inside != NULL);
49176+ item_key_by_coord(item, &min_key_in_item);
49177+ iplug->b.max_key_inside(item, &max_key_in_item);
49178+
49179+ /* can contain key if
49180+ min_key_in_item <= key &&
49181+ key <= max_key_in_item
49182+ */
49183+ return keyle(&min_key_in_item, key)
49184+ && keyle(key, &max_key_in_item);
49185+ }
49186+}
49187+
49188+/* ->mergeable() method for non-mergeable items */
49189+static int
49190+not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
49191+{
49192+ return 0;
49193+}
49194+
49195+/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */
49196+int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
49197+ const coord_t * i2 /* coord of second item */ )
49198+{
49199+ item_plugin *iplug;
49200+ reiser4_key k1;
49201+ reiser4_key k2;
49202+
49203+ assert("nikita-1336", i1 != NULL);
49204+ assert("nikita-1337", i2 != NULL);
49205+
49206+ iplug = item_plugin_by_coord(i1);
49207+ assert("nikita-1338", iplug != NULL);
49208+
49209+ /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
49210+ shifting code when nodes are in "suspended" state. */
49211+ assert("nikita-1663",
49212+ keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
49213+
49214+ if (iplug->b.mergeable != NULL) {
49215+ return iplug->b.mergeable(i1, i2);
49216+ } else if (iplug->b.max_key_inside != NULL) {
49217+ iplug->b.max_key_inside(i1, &k1);
49218+ item_key_by_coord(i2, &k2);
49219+
49220+ /* mergeable if ->max_key_inside() >= key of i2; */
49221+ return keyge(iplug->b.max_key_inside(i1, &k1),
49222+ item_key_by_coord(i2, &k2));
49223+ } else {
49224+ item_key_by_coord(i1, &k1);
49225+ item_key_by_coord(i2, &k2);
49226+
49227+ return
49228+ (get_key_locality(&k1) == get_key_locality(&k2)) &&
49229+ (get_key_objectid(&k1) == get_key_objectid(&k2))
49230+ && (iplug == item_plugin_by_coord(i2));
49231+ }
49232+}
49233+
49234+int item_is_extent(const coord_t * item)
49235+{
49236+ assert("vs-482", coord_is_existing_item(item));
49237+ return item_id_by_coord(item) == EXTENT_POINTER_ID;
49238+}
49239+
49240+int item_is_tail(const coord_t * item)
49241+{
49242+ assert("vs-482", coord_is_existing_item(item));
49243+ return item_id_by_coord(item) == FORMATTING_ID;
49244+}
49245+
49246+#if REISER4_DEBUG
49247+
49248+int item_is_statdata(const coord_t * item)
49249+{
49250+ assert("vs-516", coord_is_existing_item(item));
49251+ return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE);
49252+}
49253+
49254+int item_is_ctail(const coord_t * item)
49255+{
49256+ assert("edward-xx", coord_is_existing_item(item));
49257+ return item_id_by_coord(item) == CTAIL_ID;
49258+}
49259+
49260+#endif /* REISER4_DEBUG */
49261+
49262+static int change_item(struct inode *inode,
49263+ reiser4_plugin * plugin,
49264+ pset_member memb)
49265+{
49266+ /* cannot change constituent item (sd, or dir_item) */
49267+ return RETERR(-EINVAL);
49268+}
49269+
49270+static reiser4_plugin_ops item_plugin_ops = {
49271+ .init = NULL,
49272+ .load = NULL,
49273+ .save_len = NULL,
49274+ .save = NULL,
49275+ .change = change_item
49276+};
49277+
49278+item_plugin item_plugins[LAST_ITEM_ID] = {
49279+ [STATIC_STAT_DATA_ID] = {
49280+ .h = {
49281+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49282+ .id = STATIC_STAT_DATA_ID,
49283+ .groups = (1 << STAT_DATA_ITEM_TYPE),
49284+ .pops = &item_plugin_ops,
49285+ .label = "sd",
49286+ .desc = "stat-data",
49287+ .linkage = {NULL, NULL}
49288+ },
49289+ .b = {
49290+ .max_key_inside = max_key_inside_single_key,
49291+ .can_contain_key = NULL,
49292+ .mergeable = not_mergeable,
49293+ .nr_units = nr_units_single_unit,
49294+ .lookup = NULL,
49295+ .init = NULL,
49296+ .paste = paste_no_paste,
49297+ .fast_paste = NULL,
49298+ .can_shift = NULL,
49299+ .copy_units = NULL,
49300+ .create_hook = NULL,
49301+ .kill_hook = NULL,
49302+ .shift_hook = NULL,
49303+ .cut_units = NULL,
49304+ .kill_units = NULL,
49305+ .unit_key = NULL,
49306+ .max_unit_key = NULL,
49307+ .estimate = NULL,
49308+ .item_data_by_flow = NULL,
49309+#if REISER4_DEBUG
49310+ .check = NULL
49311+#endif
49312+ },
49313+ .f = {
49314+ .utmost_child = NULL,
49315+ .utmost_child_real_block = NULL,
49316+ .update = NULL,
49317+ .scan = NULL,
49318+ .convert = NULL
49319+ },
49320+ .s = {
49321+ .sd = {
49322+ .init_inode = init_inode_static_sd,
49323+ .save_len = save_len_static_sd,
49324+ .save = save_static_sd
49325+ }
49326+ }
49327+ },
49328+ [SIMPLE_DIR_ENTRY_ID] = {
49329+ .h = {
49330+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49331+ .id = SIMPLE_DIR_ENTRY_ID,
49332+ .groups = (1 << DIR_ENTRY_ITEM_TYPE),
49333+ .pops = &item_plugin_ops,
49334+ .label = "de",
49335+ .desc = "directory entry",
49336+ .linkage = {NULL, NULL}
49337+ },
49338+ .b = {
49339+ .max_key_inside = max_key_inside_single_key,
49340+ .can_contain_key = NULL,
49341+ .mergeable = NULL,
49342+ .nr_units = nr_units_single_unit,
49343+ .lookup = NULL,
49344+ .init = NULL,
49345+ .paste = NULL,
49346+ .fast_paste = NULL,
49347+ .can_shift = NULL,
49348+ .copy_units = NULL,
49349+ .create_hook = NULL,
49350+ .kill_hook = NULL,
49351+ .shift_hook = NULL,
49352+ .cut_units = NULL,
49353+ .kill_units = NULL,
49354+ .unit_key = NULL,
49355+ .max_unit_key = NULL,
49356+ .estimate = NULL,
49357+ .item_data_by_flow = NULL,
49358+#if REISER4_DEBUG
49359+ .check = NULL
49360+#endif
49361+ },
49362+ .f = {
49363+ .utmost_child = NULL,
49364+ .utmost_child_real_block = NULL,
49365+ .update = NULL,
49366+ .scan = NULL,
49367+ .convert = NULL
49368+ },
49369+ .s = {
49370+ .dir = {
49371+ .extract_key = extract_key_de,
49372+ .update_key = update_key_de,
49373+ .extract_name = extract_name_de,
49374+ .extract_file_type = extract_file_type_de,
49375+ .add_entry = add_entry_de,
49376+ .rem_entry = rem_entry_de,
49377+ .max_name_len = max_name_len_de
49378+ }
49379+ }
49380+ },
49381+ [COMPOUND_DIR_ID] = {
49382+ .h = {
49383+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49384+ .id = COMPOUND_DIR_ID,
49385+ .groups = (1 << DIR_ENTRY_ITEM_TYPE),
49386+ .pops = &item_plugin_ops,
49387+ .label = "cde",
49388+ .desc = "compressed directory entry",
49389+ .linkage = {NULL, NULL}
49390+ },
49391+ .b = {
49392+ .max_key_inside = max_key_inside_cde,
49393+ .can_contain_key = can_contain_key_cde,
49394+ .mergeable = mergeable_cde,
49395+ .nr_units = nr_units_cde,
49396+ .lookup = lookup_cde,
49397+ .init = init_cde,
49398+ .paste = paste_cde,
49399+ .fast_paste = agree_to_fast_op,
49400+ .can_shift = can_shift_cde,
49401+ .copy_units = copy_units_cde,
49402+ .create_hook = NULL,
49403+ .kill_hook = NULL,
49404+ .shift_hook = NULL,
49405+ .cut_units = cut_units_cde,
49406+ .kill_units = kill_units_cde,
49407+ .unit_key = unit_key_cde,
49408+ .max_unit_key = unit_key_cde,
49409+ .estimate = estimate_cde,
49410+ .item_data_by_flow = NULL,
49411+#if REISER4_DEBUG
49412+ .check = reiser4_check_cde
49413+#endif
49414+ },
49415+ .f = {
49416+ .utmost_child = NULL,
49417+ .utmost_child_real_block = NULL,
49418+ .update = NULL,
49419+ .scan = NULL,
49420+ .convert = NULL
49421+ },
49422+ .s = {
49423+ .dir = {
49424+ .extract_key = extract_key_cde,
49425+ .update_key = update_key_cde,
49426+ .extract_name = extract_name_cde,
49427+ .extract_file_type = extract_file_type_de,
49428+ .add_entry = add_entry_cde,
49429+ .rem_entry = rem_entry_cde,
49430+ .max_name_len = max_name_len_cde
49431+ }
49432+ }
49433+ },
49434+ [NODE_POINTER_ID] = {
49435+ .h = {
49436+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49437+ .id = NODE_POINTER_ID,
49438+ .groups = (1 << INTERNAL_ITEM_TYPE),
49439+ .pops = NULL,
49440+ .label = "internal",
49441+ .desc = "internal item",
49442+ .linkage = {NULL, NULL}
49443+ },
49444+ .b = {
49445+ .max_key_inside = NULL,
49446+ .can_contain_key = NULL,
49447+ .mergeable = mergeable_internal,
49448+ .nr_units = nr_units_single_unit,
49449+ .lookup = lookup_internal,
49450+ .init = NULL,
49451+ .paste = NULL,
49452+ .fast_paste = NULL,
49453+ .can_shift = NULL,
49454+ .copy_units = NULL,
49455+ .create_hook = create_hook_internal,
49456+ .kill_hook = kill_hook_internal,
49457+ .shift_hook = shift_hook_internal,
49458+ .cut_units = NULL,
49459+ .kill_units = NULL,
49460+ .unit_key = NULL,
49461+ .max_unit_key = NULL,
49462+ .estimate = NULL,
49463+ .item_data_by_flow = NULL,
49464+#if REISER4_DEBUG
49465+ .check = check__internal
49466+#endif
49467+ },
49468+ .f = {
49469+ .utmost_child = utmost_child_internal,
49470+ .utmost_child_real_block =
49471+ utmost_child_real_block_internal,
49472+ .update = reiser4_update_internal,
49473+ .scan = NULL,
49474+ .convert = NULL
49475+ },
49476+ .s = {
49477+ .internal = {
49478+ .down_link = down_link_internal,
49479+ .has_pointer_to = has_pointer_to_internal
49480+ }
49481+ }
49482+ },
49483+ [EXTENT_POINTER_ID] = {
49484+ .h = {
49485+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49486+ .id = EXTENT_POINTER_ID,
49487+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49488+ .pops = NULL,
49489+ .label = "extent",
49490+ .desc = "extent item",
49491+ .linkage = {NULL, NULL}
49492+ },
49493+ .b = {
49494+ .max_key_inside = max_key_inside_extent,
49495+ .can_contain_key = can_contain_key_extent,
49496+ .mergeable = mergeable_extent,
49497+ .nr_units = nr_units_extent,
49498+ .lookup = lookup_extent,
49499+ .init = NULL,
49500+ .paste = paste_extent,
49501+ .fast_paste = agree_to_fast_op,
49502+ .can_shift = can_shift_extent,
49503+ .create_hook = create_hook_extent,
49504+ .copy_units = copy_units_extent,
49505+ .kill_hook = kill_hook_extent,
49506+ .shift_hook = NULL,
49507+ .cut_units = cut_units_extent,
49508+ .kill_units = kill_units_extent,
49509+ .unit_key = unit_key_extent,
49510+ .max_unit_key = max_unit_key_extent,
49511+ .estimate = NULL,
49512+ .item_data_by_flow = NULL,
49513+#if REISER4_DEBUG
49514+ .check = reiser4_check_extent
49515+#endif
49516+ },
49517+ .f = {
49518+ .utmost_child = utmost_child_extent,
49519+ .utmost_child_real_block =
49520+ utmost_child_real_block_extent,
49521+ .update = NULL,
49522+ .scan = reiser4_scan_extent,
49523+ .convert = NULL,
49524+ .key_by_offset = key_by_offset_extent
49525+ },
49526+ .s = {
49527+ .file = {
49528+ .write = reiser4_write_extent,
49529+ .read = reiser4_read_extent,
49530+ .readpage = reiser4_readpage_extent,
49531+ .get_block = get_block_address_extent,
49532+ .append_key = append_key_extent,
49533+ .init_coord_extension =
49534+ init_coord_extension_extent
49535+ }
49536+ }
49537+ },
49538+ [FORMATTING_ID] = {
49539+ .h = {
49540+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49541+ .id = FORMATTING_ID,
49542+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49543+ .pops = NULL,
49544+ .label = "body",
49545+ .desc = "body (or tail?) item",
49546+ .linkage = {NULL, NULL}
49547+ },
49548+ .b = {
49549+ .max_key_inside = max_key_inside_tail,
49550+ .can_contain_key = can_contain_key_tail,
49551+ .mergeable = mergeable_tail,
49552+ .nr_units = nr_units_tail,
49553+ .lookup = lookup_tail,
49554+ .init = NULL,
49555+ .paste = paste_tail,
49556+ .fast_paste = agree_to_fast_op,
49557+ .can_shift = can_shift_tail,
49558+ .create_hook = NULL,
49559+ .copy_units = copy_units_tail,
49560+ .kill_hook = kill_hook_tail,
49561+ .shift_hook = NULL,
49562+ .cut_units = cut_units_tail,
49563+ .kill_units = kill_units_tail,
49564+ .unit_key = unit_key_tail,
49565+ .max_unit_key = unit_key_tail,
49566+ .estimate = NULL,
49567+ .item_data_by_flow = NULL,
49568+#if REISER4_DEBUG
49569+ .check = NULL
49570+#endif
49571+ },
49572+ .f = {
49573+ .utmost_child = NULL,
49574+ .utmost_child_real_block = NULL,
49575+ .update = NULL,
49576+ .scan = NULL,
49577+ .convert = NULL
49578+ },
49579+ .s = {
49580+ .file = {
49581+ .write = reiser4_write_tail,
49582+ .read = reiser4_read_tail,
49583+ .readpage = readpage_tail,
49584+ .get_block = get_block_address_tail,
49585+ .append_key = append_key_tail,
49586+ .init_coord_extension =
49587+ init_coord_extension_tail
49588+ }
49589+ }
49590+ },
49591+ [CTAIL_ID] = {
49592+ .h = {
49593+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49594+ .id = CTAIL_ID,
49595+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49596+ .pops = NULL,
49597+ .label = "ctail",
49598+ .desc = "cryptcompress tail item",
49599+ .linkage = {NULL, NULL}
49600+ },
49601+ .b = {
49602+ .max_key_inside = max_key_inside_tail,
49603+ .can_contain_key = can_contain_key_ctail,
49604+ .mergeable = mergeable_ctail,
49605+ .nr_units = nr_units_ctail,
49606+ .lookup = NULL,
49607+ .init = init_ctail,
49608+ .paste = paste_ctail,
49609+ .fast_paste = agree_to_fast_op,
49610+ .can_shift = can_shift_ctail,
49611+ .create_hook = create_hook_ctail,
49612+ .copy_units = copy_units_ctail,
49613+ .kill_hook = kill_hook_ctail,
49614+ .shift_hook = shift_hook_ctail,
49615+ .cut_units = cut_units_ctail,
49616+ .kill_units = kill_units_ctail,
49617+ .unit_key = unit_key_tail,
49618+ .max_unit_key = unit_key_tail,
49619+ .estimate = estimate_ctail,
49620+ .item_data_by_flow = NULL,
49621+#if REISER4_DEBUG
49622+ .check = check_ctail
49623+#endif
49624+ },
49625+ .f = {
49626+ .utmost_child = utmost_child_ctail,
49627+ /* FIXME-EDWARD: write this */
49628+ .utmost_child_real_block = NULL,
49629+ .update = NULL,
49630+ .scan = scan_ctail,
49631+ .convert = convert_ctail
49632+ },
49633+ .s = {
49634+ .file = {
49635+ .write = NULL,
49636+ .read = read_ctail,
49637+ .readpage = readpage_ctail,
49638+ .get_block = get_block_address_tail,
49639+ .append_key = append_key_ctail,
49640+ .init_coord_extension =
49641+ init_coord_extension_tail
49642+ }
49643+ }
49644+ },
49645+ [BLACK_BOX_ID] = {
49646+ .h = {
49647+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49648+ .id = BLACK_BOX_ID,
49649+ .groups = (1 << OTHER_ITEM_TYPE),
49650+ .pops = NULL,
49651+ .label = "blackbox",
49652+ .desc = "black box item",
49653+ .linkage = {NULL, NULL}
49654+ },
49655+ .b = {
49656+ .max_key_inside = NULL,
49657+ .can_contain_key = NULL,
49658+ .mergeable = not_mergeable,
49659+ .nr_units = nr_units_single_unit,
49660+			/* no need for a ->lookup method */
49661+ .lookup = NULL,
49662+ .init = NULL,
49663+ .paste = NULL,
49664+ .fast_paste = NULL,
49665+ .can_shift = NULL,
49666+ .copy_units = NULL,
49667+ .create_hook = NULL,
49668+ .kill_hook = NULL,
49669+ .shift_hook = NULL,
49670+ .cut_units = NULL,
49671+ .kill_units = NULL,
49672+ .unit_key = NULL,
49673+ .max_unit_key = NULL,
49674+ .estimate = NULL,
49675+ .item_data_by_flow = NULL,
49676+#if REISER4_DEBUG
49677+ .check = NULL
49678+#endif
49679+ }
49680+ }
49681+};
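+/*
+ * Editor's note: a minimal sketch (not part of the original patch) of how
+ * the table above is reached in practice: item_plugin_by_coord() (see
+ * item.h) maps a coord to its slot in item_plugins[], and methods are then
+ * dispatched through the ->b/->f/->s operation vectors. The helper name is
+ * hypothetical. Kept under #if 0 because it is illustrative only.
+ */
+#if 0
+static pos_in_node_t item_unit_count_sketch(const coord_t *coord)
+{
+	item_plugin *iplug = item_plugin_by_coord(coord);
+
+	/* solid (single-unit) items all share nr_units_single_unit();
+	   the is_solid_item() macro tests for exactly that */
+	if (is_solid_item(iplug))
+		return 1;
+	return iplug->b.nr_units(coord);
+}
+#endif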
49682+
49683+/* Make Linus happy.
49684+ Local variables:
49685+ c-indentation-style: "K&R"
49686+ mode-name: "LC"
49687+ c-basic-offset: 8
49688+ tab-width: 8
49689+ fill-column: 120
49690+ End:
49691+*/
49692diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/item.h linux-2.6.23/fs/reiser4/plugin/item/item.h
49693--- linux-2.6.23.orig/fs/reiser4/plugin/item/item.h 1970-01-01 03:00:00.000000000 +0300
49694+++ linux-2.6.23/fs/reiser4/plugin/item/item.h 2007-12-04 16:49:30.000000000 +0300
49695@@ -0,0 +1,397 @@
49696+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49697+
49698+/* first read balance.c comments before reading this */
49699+
49700+/* An item_plugin implements all of the operations required for
49701+ balancing that are item specific. */
49702+
49703+/* an item plugin also implements other operations that are specific to that
49704+ item. These go into the item specific operations portion of the item
49705+ handler, and all of the item specific portions of the item handler are put
49706+ into a union. */
49707+
49708+#if !defined( __REISER4_ITEM_H__ )
49709+#define __REISER4_ITEM_H__
49710+
49711+#include "../../forward.h"
49712+#include "../plugin_header.h"
49713+#include "../../dformat.h"
49714+#include "../../seal.h"
49715+#include "../../plugin/file/file.h"
49716+
49717+#include <linux/fs.h> /* for struct file, struct inode */
49718+#include <linux/mm.h> /* for struct page */
49719+#include <linux/dcache.h> /* for struct dentry */
49720+
49721+typedef enum {
49722+ STAT_DATA_ITEM_TYPE,
49723+ DIR_ENTRY_ITEM_TYPE,
49724+ INTERNAL_ITEM_TYPE,
49725+ UNIX_FILE_METADATA_ITEM_TYPE,
49726+ OTHER_ITEM_TYPE
49727+} item_type_id;
49728+
49729+/* this is the part of each item plugin that all items are expected to
49730+ support or at least explicitly fail to support by setting the
49731+ pointer to null. */
49732+struct balance_ops {
49733+ /* operations called by balancing
49734+
49735+ It is interesting to consider that some of these item
49736+ operations could be given sources or targets that are not
49737+ really items in nodes. This could be ok/useful.
49738+
49739+ */
49740+	/* maximal key that can _possibly_ be occupied by this item
49741+
49742+	   When inserting, the node's ->lookup() method (called by
49743+	   coord_by_key()) reaches an item after binary search, and
49744+	   the ->max_key_inside() item plugin method is used to determine
49745+	   whether the new data should be pasted into the existing item
49746+	   (new_key<=max_key_inside()) or a new item has to be created
49747+	   (new_key>max_key_inside()).
49748+
49749+	   For items that occupy exactly one key (like stat-data)
49750+	   this method should return this key. For items that can
49751+	   grow indefinitely (extent, directory item) this should
49752+	   return reiser4_max_key().
49753+
49754+	   For example, for an extent with the key
49755+
49756+	   (LOCALITY,4,OBJID,STARTING-OFFSET) and a length of BLK blocks,
49757+
49758+	   ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff).
49759+	   (A hedged sketch of this paste-or-insert decision follows this struct.) */
49760+ reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
49761+
49762+ /* true if item @coord can merge data at @key. */
49763+ int (*can_contain_key) (const coord_t *, const reiser4_key *,
49764+ const reiser4_item_data *);
49765+ /* mergeable() - check items for mergeability
49766+
49767+ Optional method. Returns true if two items can be merged.
49768+
49769+ */
49770+ int (*mergeable) (const coord_t *, const coord_t *);
49771+
49772+ /* number of atomic things in an item.
49773+ NOTE FOR CONTRIBUTORS: use a generic method
49774+ nr_units_single_unit() for solid (atomic) items, as
49775+ tree operations use it as a criterion of solidness
49776+ (see is_solid_item macro) */
49777+ pos_in_node_t(*nr_units) (const coord_t *);
49778+
49779+	/* search within the item for a unit, and return a pointer to
49780+	   it. This can be used to calculate how many bytes to shrink
49781+	   an item, by using pointer arithmetic and comparing to the
49782+	   start of the item body, provided the item's data are
49783+	   contiguous in the node; if the item's data are not
49784+	   contiguous in the node, all sorts of other things are
49785+	   probably going to break as well. */
49786+ lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
49787+	/* method called by node_plugin->create_item() to initialise new
49788+	   item */
49789+ int (*init) (coord_t * target, coord_t * from,
49790+ reiser4_item_data * data);
49791+ /* method called (e.g., by reiser4_resize_item()) to place new data
49792+ into item when it grows */
49793+ int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
49794+	/* return true if a paste into @coord is allowed to skip
49795+	   carry, that is, if such a paste would not require any
49796+	   changes at the parent level
49797+	   */
49798+ int (*fast_paste) (const coord_t *);
49799+	/* how many units of @source, but not more than @want, can be
49800+	   shifted into the @target node. If pend == append, we try to
49801+	   append the first units of @source to the last item of @target.
49802+	   If pend == prepend, we try to "prepend" the last units of
49803+	   @source to the first item of @target. The @target node has
49804+	   @free_space bytes of free space. The total size of those units
49805+	   is returned via @size.
49806+
49807+	   @target is not NULL if shifting to a mergeable item, and NULL
49808+	   if a new item will be created during shifting.
49809+	   */
49810+ int (*can_shift) (unsigned free_space, coord_t *,
49811+ znode *, shift_direction, unsigned *size,
49812+ unsigned want);
49813+
49814+	/* starting from the @from-th unit of item @source, append or
49815+	   prepend @count units to @target. @target has already been
49816+	   expanded by @free_space bytes, which must be exactly what is
49817+	   needed for those units in @target. If @where_is_free_space
49818+	   == SHIFT_LEFT, the free space is at the end of the @target
49819+	   item; otherwise it is at the beginning of it. */
49820+ void (*copy_units) (coord_t *, coord_t *,
49821+ unsigned from, unsigned count,
49822+ shift_direction where_is_free_space,
49823+ unsigned free_space);
49824+
49825+ int (*create_hook) (const coord_t *, void *);
49826+ /* do whatever is necessary to do when @count units starting
49827+ from @from-th one are removed from the tree */
49828+	/* FIXME-VS: this used to be here so that, in particular,
49829+	   extents and items of internal type could free the blocks they
49830+	   point to at the same time as the items are removed from the
49831+	   tree. Problems start, however, when dealloc_block fails for
49832+	   some reason: the item gets removed, but the blocks it pointed
49833+	   to are not freed. It is not clear how to fix this for items of
49834+	   internal type, because the need to remove an internal item may
49835+	   appear in the middle of balancing, and there is no way to
49836+	   undo the changes made. OTOH, if the space allocator involves
49837+	   balancing to perform dealloc_block, this will probably
49838+	   break balancing due to deadlock issues.
49839+	   */
49840+ int (*kill_hook) (const coord_t *, pos_in_node_t from,
49841+ pos_in_node_t count, struct carry_kill_data *);
49842+ int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
49843+ znode * _node);
49844+
49845+	/* unit @*from contains @from_key; unit @*to contains @to_key. Cut all keys between @from_key and @to_key,
49846+	   including the boundaries. When units are cut from the beginning of the item, move the space that gets
49847+	   freed to the head of the item. When units are cut from the end of the item, move the freed space to the
49848+	   item end. When units are cut from the middle of the item, move the freed space to the item head. Return
49849+	   the amount of space that got freed. Save the smallest removed key in @smallest_removed if it is not 0,
49850+	   and the new first item key in @new_first_key if it is not 0. */
49851+ int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
49852+ struct carry_cut_data *,
49853+ reiser4_key * smallest_removed,
49854+ reiser4_key * new_first_key);
49855+
49856+ /* like cut_units, except that these units are removed from the
49857+ tree, not only from a node */
49858+ int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
49859+ struct carry_kill_data *,
49860+ reiser4_key * smallest_removed,
49861+ reiser4_key * new_first);
49862+
49863+	/* if @key_of_coord == 1, the key of the coord is returned;
49864+	   otherwise the key of the unit is returned. If @coord is not
49865+	   set to a certain unit, ERR_PTR(-ENOENT) is returned */
49866+ reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
49867+ reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
49868+	/* estimate how much space is needed to paste @data into the item
49869+	   at @coord. If @coord==0, estimate insertion; otherwise estimate
49870+	   pasting
49871+	   */
49872+ int (*estimate) (const coord_t *, const reiser4_item_data *);
49873+
49874+ /* converts flow @f to item data. @coord == 0 on insert */
49875+ int (*item_data_by_flow) (const coord_t *, const flow_t *,
49876+ reiser4_item_data *);
49877+
49878+ /*void (*show) (struct seq_file *, coord_t *); */
49879+
49880+#if REISER4_DEBUG
49881+ /* used for debugging, every item should have here the most
49882+ complete possible check of the consistency of the item that
49883+ the inventor can construct */
49884+ int (*check) (const coord_t *, const char **error);
49885+#endif
49886+
49887+};
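+/*
+ * Editor's note: a minimal sketch (not part of the original patch) of the
+ * paste-or-insert decision that the ->max_key_inside() documentation above
+ * describes. The helper name is hypothetical; keyle() is the key comparison
+ * helper used throughout item.c. Kept under #if 0 because it is
+ * illustrative only.
+ */
+#if 0
+static int should_paste_sketch(item_plugin *iplug, const coord_t *coord,
+			       const reiser4_key *new_key)
+{
+	reiser4_key max;
+
+	/* items with no ->max_key_inside() bound force creation of a new item */
+	if (iplug->b.max_key_inside == NULL)
+		return 0;
+	iplug->b.max_key_inside(coord, &max);
+	/* paste while the new key still falls inside the item's key range */
+	return keyle(new_key, &max);
+}
+#endif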
49888+
49889+struct flush_ops {
49890+ /* return the right or left child of @coord, only if it is in memory */
49891+ int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
49892+
49893+ /* return whether the right or left child of @coord has a non-fake
49894+ block number. */
49895+ int (*utmost_child_real_block) (const coord_t *, sideof side,
49896+ reiser4_block_nr *);
49897+ /* relocate child at @coord to the @block */
49898+ void (*update) (const coord_t *, const reiser4_block_nr *);
49899+	/* count unformatted nodes per item for the leaf relocation policy, etc. */
49900+ int (*scan) (flush_scan * scan);
49901+ /* convert item by flush */
49902+ int (*convert) (flush_pos_t * pos);
49903+ /* backward mapping from jnode offset to a key. */
49904+ int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
49905+};
49906+
49907+/* operations specific to the directory item */
49908+struct dir_entry_iops {
49909+ /* extract stat-data key from directory entry at @coord and place it
49910+ into @key. */
49911+ int (*extract_key) (const coord_t *, reiser4_key * key);
49912+ /* update object key in item. */
49913+ int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
49914+ /* extract name from directory entry at @coord and return it */
49915+ char *(*extract_name) (const coord_t *, char *buf);
49916+ /* extract file type (DT_* stuff) from directory entry at @coord and
49917+ return it */
49918+ unsigned (*extract_file_type) (const coord_t *);
49919+ int (*add_entry) (struct inode * dir,
49920+ coord_t *, lock_handle *,
49921+ const struct dentry * name,
49922+ reiser4_dir_entry_desc * entry);
49923+ int (*rem_entry) (struct inode * dir, const struct qstr * name,
49924+ coord_t *, lock_handle *,
49925+ reiser4_dir_entry_desc * entry);
49926+ int (*max_name_len) (const struct inode * dir);
49927+};
49928+
49929+/* operations specific to the items that regular (unix) file metadata are built of */
49930+struct file_iops {
49931+ int (*write) (struct file *, const char __user *, size_t, loff_t *pos);
49932+ int (*read) (struct file *, flow_t *, hint_t *);
49933+ int (*readpage) (void *, struct page *);
49934+ int (*get_block) (const coord_t *, sector_t, sector_t *);
49935+	/*
49936+	 * return the key of the first byte that is not addressed by the item
49937+	 * @coord is set to.
49938+	 * For example, for an extent item with the key
49939+	 *
49940+	 * (LOCALITY,4,OBJID,STARTING-OFFSET) and a length of BLK blocks,
49941+	 *
49942+	 * ->append_key is (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size).
49943+	 *
49944+	 * (A hedged sketch of this contract follows this struct.)
49945+	 */
49946+ reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
49947+
49948+ void (*init_coord_extension) (uf_coord_t *, loff_t);
49949+};
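+/*
+ * Editor's note: a hedged sketch (not part of the original patch) of the
+ * ->append_key() contract documented above, written out for an extent-like
+ * item. The helper name and the explicit blk/block_size parameters are
+ * hypothetical (real implementations such as append_key_extent derive them
+ * from the item itself), and the get_key_offset()/set_key_offset() accessors
+ * are assumed from key.h. Kept under #if 0 because it is illustrative only.
+ */
+#if 0
+static reiser4_key *append_key_sketch(const coord_t *coord, reiser4_key *key,
+				       __u64 blk, __u64 block_size)
+{
+	item_key_by_coord(coord, key);
+	/* first byte not addressed: STARTING-OFFSET + BLK * block_size */
+	set_key_offset(key, get_key_offset(key) + blk * block_size);
+	return key;
+}
+#endif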
49950+
49951+/* operations specific to items of stat data type */
49952+struct sd_iops {
49953+ int (*init_inode) (struct inode * inode, char *sd, int len);
49954+ int (*save_len) (struct inode * inode);
49955+ int (*save) (struct inode * inode, char **area);
49956+};
49957+
49958+/* operations specific to internal item */
49959+struct internal_iops {
49960+	/* all a tree traversal wants to know from an internal item is
49961+	   where to go next. */
49962+ void (*down_link) (const coord_t * coord,
49963+ const reiser4_key * key, reiser4_block_nr * block);
49964+ /* check that given internal item contains given pointer. */
49965+ int (*has_pointer_to) (const coord_t * coord,
49966+ const reiser4_block_nr * block);
49967+};
49968+
49969+struct item_plugin {
49970+ /* generic fields */
49971+ plugin_header h;
49972+ /* methods common for all item types */
49973+ struct balance_ops b; /* balance operations */
49974+	struct flush_ops f;	/* flush operates on items via these methods */
49975+
49976+ /* methods specific to particular type of item */
49977+ union {
49978+ struct dir_entry_iops dir;
49979+ struct file_iops file;
49980+ struct sd_iops sd;
49981+ struct internal_iops internal;
49982+ } s;
49983+};
49984+
49985+#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit)
49986+
49987+static inline item_id item_id_by_plugin(item_plugin * plugin)
49988+{
49989+ return plugin->h.id;
49990+}
49991+
49992+static inline char get_iplugid(item_plugin * iplug)
49993+{
49994+ assert("nikita-2838", iplug != NULL);
49995+ assert("nikita-2839", iplug->h.id < 0xff);
49996+ return (char)item_id_by_plugin(iplug);
49997+}
49998+
49999+extern unsigned long znode_times_locked(const znode * z);
50000+
50001+static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
50002+{
50003+ assert("nikita-2837", coord != NULL);
50004+ assert("nikita-2838", iplug != NULL);
50005+ coord->iplugid = get_iplugid(iplug);
50006+ ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
50007+}
50008+
50009+static inline item_plugin *coord_iplug(const coord_t * coord)
50010+{
50011+ assert("nikita-2833", coord != NULL);
50012+ assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
50013+ assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
50014+ return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
50015+ coord->iplugid);
50016+}
50017+
50018+extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
50019+ const reiser4_item_data *);
50020+extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
50021+extern int item_is_extent(const coord_t *);
50022+extern int item_is_tail(const coord_t *);
50023+extern int item_is_statdata(const coord_t * item);
50024+extern int item_is_ctail(const coord_t *);
50025+
50026+extern pos_in_node_t item_length_by_coord(const coord_t * coord);
50027+extern pos_in_node_t nr_units_single_unit(const coord_t * coord);
50028+extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
50029+extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
50030+extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
50031+extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
50032+extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
50033+ reiser4_key * key);
50034+extern void obtain_item_plugin(const coord_t * coord);
50035+
50036+#if defined(REISER4_DEBUG)
50037+extern int znode_is_loaded(const znode * node);
50038+#endif
50039+
50040+/* return plugin of item at @coord */
50041+static inline item_plugin *item_plugin_by_coord(const coord_t *
50042+ coord /* coord to query */ )
50043+{
50044+ assert("nikita-330", coord != NULL);
50045+ assert("nikita-331", coord->node != NULL);
50046+ assert("nikita-332", znode_is_loaded(coord->node));
50047+
50048+ if (unlikely(!coord_is_iplug_set(coord)))
50049+ obtain_item_plugin(coord);
50050+ return coord_iplug(coord);
50051+}
50052+
50053+/* this returns true if item is of internal type */
50054+static inline int item_is_internal(const coord_t * item)
50055+{
50056+ assert("vs-483", coord_is_existing_item(item));
50057+ return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE);
50058+}
50059+
50060+extern void item_body_by_coord_hard(coord_t * coord);
50061+extern void *item_body_by_coord_easy(const coord_t * coord);
50062+#if REISER4_DEBUG
50063+extern int item_body_is_valid(const coord_t * coord);
50064+#endif
50065+
50066+/* return pointer to item body */
50067+static inline void *item_body_by_coord(const coord_t *
50068+ coord /* coord to query */ )
50069+{
50070+ assert("nikita-324", coord != NULL);
50071+ assert("nikita-325", coord->node != NULL);
50072+ assert("nikita-326", znode_is_loaded(coord->node));
50073+
50074+ if (coord->offset == INVALID_OFFSET)
50075+ item_body_by_coord_hard((coord_t *) coord);
50076+ assert("nikita-3201", item_body_is_valid(coord));
50077+ assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
50078+ return item_body_by_coord_easy(coord);
50079+}
50080+
50081+/* __REISER4_ITEM_H__ */
50082+#endif
50083+/* Make Linus happy.
50084+ Local variables:
50085+ c-indentation-style: "K&R"
50086+ mode-name: "LC"
50087+ c-basic-offset: 8
50088+ tab-width: 8
50089+ fill-column: 120
50090+ scroll-step: 1
50091+ End:
50092+*/
50093diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/Makefile linux-2.6.23/fs/reiser4/plugin/item/Makefile
50094--- linux-2.6.23.orig/fs/reiser4/plugin/item/Makefile 1970-01-01 03:00:00.000000000 +0300
50095+++ linux-2.6.23/fs/reiser4/plugin/item/Makefile 2007-12-04 16:49:30.000000000 +0300
50096@@ -0,0 +1,18 @@
50097+obj-$(CONFIG_REISER4_FS) += item_plugins.o
50098+
50099+item_plugins-objs := \
50100+ item.o \
50101+ static_stat.o \
50102+ sde.o \
50103+ cde.o \
50104+ blackbox.o \
50105+ internal.o \
50106+ tail.o \
50107+ ctail.o \
50108+ extent.o \
50109+ extent_item_ops.o \
50110+ extent_file_ops.o \
50111+ extent_flush_ops.o
50112+
50113+
50114+
50115diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/sde.c linux-2.6.23/fs/reiser4/plugin/item/sde.c
50116--- linux-2.6.23.orig/fs/reiser4/plugin/item/sde.c 1970-01-01 03:00:00.000000000 +0300
50117+++ linux-2.6.23/fs/reiser4/plugin/item/sde.c 2007-12-04 16:49:30.000000000 +0300
50118@@ -0,0 +1,190 @@
50119+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50120+
50121+/* Directory entry implementation */
50122+#include "../../forward.h"
50123+#include "../../debug.h"
50124+#include "../../dformat.h"
50125+#include "../../kassign.h"
50126+#include "../../coord.h"
50127+#include "sde.h"
50128+#include "item.h"
50129+#include "../plugin.h"
50130+#include "../../znode.h"
50131+#include "../../carry.h"
50132+#include "../../tree.h"
50133+#include "../../inode.h"
50134+
50135+#include <linux/fs.h> /* for struct inode */
50136+#include <linux/dcache.h> /* for struct dentry */
50137+#include <linux/quotaops.h>
50138+
50139+/* ->extract_key() method of simple directory item plugin. */
50140+int extract_key_de(const coord_t * coord /* coord of item */ ,
50141+ reiser4_key * key /* resulting key */ )
50142+{
50143+ directory_entry_format *dent;
50144+
50145+ assert("nikita-1458", coord != NULL);
50146+ assert("nikita-1459", key != NULL);
50147+
50148+ dent = (directory_entry_format *) item_body_by_coord(coord);
50149+ assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
50150+ return extract_key_from_id(&dent->id, key);
50151+}
50152+
50153+int
50154+update_key_de(const coord_t * coord, const reiser4_key * key,
50155+ lock_handle * lh UNUSED_ARG)
50156+{
50157+ directory_entry_format *dent;
50158+ obj_key_id obj_id;
50159+ int result;
50160+
50161+ assert("nikita-2342", coord != NULL);
50162+ assert("nikita-2343", key != NULL);
50163+
50164+ dent = (directory_entry_format *) item_body_by_coord(coord);
50165+ result = build_obj_key_id(key, &obj_id);
50166+ if (result == 0) {
50167+ dent->id = obj_id;
50168+ znode_make_dirty(coord->node);
50169+ }
50170+ return 0;
50171+}
50172+
50173+char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
50174+ char *buf)
50175+{
50176+ reiser4_key key;
50177+
50178+ unit_key_by_coord(coord, &key);
50179+ if (get_key_type(&key) != KEY_FILE_NAME_MINOR)
50180+ reiser4_print_address("oops", znode_get_block(coord->node));
50181+ if (!is_longname_key(&key)) {
50182+ if (is_dot_key(&key))
50183+ return (char *)".";
50184+ else
50185+ return extract_name_from_key(&key, buf);
50186+ } else
50187+ return (char *)dent->name;
50188+}
50189+
50190+/* ->extract_name() method of simple directory item plugin. */
50191+char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
50192+{
50193+ directory_entry_format *dent;
50194+
50195+ assert("nikita-1460", coord != NULL);
50196+
50197+ dent = (directory_entry_format *) item_body_by_coord(coord);
50198+ return extract_dent_name(coord, dent, buf);
50199+}
50200+
50201+/* ->extract_file_type() method of simple directory item plugin. */
50202+unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
50203+ * item */ )
50204+{
50205+ assert("nikita-1764", coord != NULL);
50206+ /* we don't store file type in the directory entry yet.
50207+
50208+ But see comments at kassign.h:obj_key_id
50209+ */
50210+ return DT_UNKNOWN;
50211+}
50212+
50213+int add_entry_de(struct inode *dir /* directory of item */ ,
50214+ coord_t * coord /* coord of item */ ,
50215+ lock_handle * lh /* insertion lock handle */ ,
50216+ const struct dentry *de /* name to add */ ,
50217+ reiser4_dir_entry_desc * entry /* parameters of new directory
50218+ * entry */ )
50219+{
50220+ reiser4_item_data data;
50221+ directory_entry_format *dent;
50222+ int result;
50223+ const char *name;
50224+ int len;
50225+ int longname;
50226+
50227+ name = de->d_name.name;
50228+ len = de->d_name.len;
50229+ assert("nikita-1163", strlen(name) == len);
50230+
50231+ longname = is_longname(name, len);
50232+
50233+ data.length = sizeof *dent;
50234+ if (longname)
50235+ data.length += len + 1;
50236+ data.data = NULL;
50237+ data.user = 0;
50238+ data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
50239+
50240+ /* NOTE-NIKITA quota plugin */
50241+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
50242+ return -EDQUOT;
50243+
50244+ result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
50245+ if (result != 0)
50246+ return result;
50247+
50248+ dent = (directory_entry_format *) item_body_by_coord(coord);
50249+ build_inode_key_id(entry->obj, &dent->id);
50250+ if (longname) {
50251+ memcpy(dent->name, name, len);
50252+ put_unaligned(0, &dent->name[len]);
50253+ }
50254+ return 0;
50255+}
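+/*
+ * Editor's note: a minimal sketch (not part of the original patch) of the
+ * on-disk size rule add_entry_de() uses above. Short names are encoded in
+ * the entry's key, so only the obj_key_id is stored; long names additionally
+ * store the NUL-terminated name in the flexible name[] array. The helper
+ * name is hypothetical. Kept under #if 0 because it is illustrative only.
+ */
+#if 0
+static int de_size_sketch(const char *name, int len)
+{
+	int size = sizeof(directory_entry_format);
+
+	if (is_longname(name, len))
+		size += len + 1;	/* name body plus trailing NUL */
+	return size;
+}
+#endif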
50256+
50257+int rem_entry_de(struct inode *dir /* directory of item */ ,
50258+ const struct qstr *name UNUSED_ARG,
50259+ coord_t * coord /* coord of item */ ,
50260+ lock_handle * lh UNUSED_ARG /* lock handle for
50261+ * removal */ ,
50262+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
50263+ * directory entry
50264+ * being removed */ )
50265+{
50266+ coord_t shadow;
50267+ int result;
50268+ int length;
50269+
50270+ length = item_length_by_coord(coord);
50271+ if (inode_get_bytes(dir) < length) {
50272+		warning("nikita-2627", "Dir is broken: %llu: %llu",
50273+ (unsigned long long)get_inode_oid(dir),
50274+ inode_get_bytes(dir));
50275+
50276+ return RETERR(-EIO);
50277+ }
50278+
50279+ /* cut_node() is supposed to take pointers to _different_
50280+ coords, because it will modify them without respect to
50281+ possible aliasing. To work around this, create temporary copy
50282+ of @coord.
50283+ */
50284+ coord_dup(&shadow, coord);
50285+ result =
50286+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
50287+ if (result == 0) {
50288+ /* NOTE-NIKITA quota plugin */
50289+ DQUOT_FREE_SPACE_NODIRTY(dir, length);
50290+ }
50291+ return result;
50292+}
50293+
50294+int max_name_len_de(const struct inode *dir)
50295+{
50296+ return reiser4_tree_by_inode(dir)->nplug->max_item_size() -
50297+ sizeof(directory_entry_format) - 2;
50298+}
50299+
50300+/* Make Linus happy.
50301+ Local variables:
50302+ c-indentation-style: "K&R"
50303+ mode-name: "LC"
50304+ c-basic-offset: 8
50305+ tab-width: 8
50306+ fill-column: 120
50307+ End:
50308+*/
50309diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/sde.h linux-2.6.23/fs/reiser4/plugin/item/sde.h
50310--- linux-2.6.23.orig/fs/reiser4/plugin/item/sde.h 1970-01-01 03:00:00.000000000 +0300
50311+++ linux-2.6.23/fs/reiser4/plugin/item/sde.h 2007-12-04 16:49:30.000000000 +0300
50312@@ -0,0 +1,66 @@
50313+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50314+
50315+/* Directory entry. */
50316+
50317+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
50318+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
50319+
50320+#include "../../forward.h"
50321+#include "../../dformat.h"
50322+#include "../../kassign.h"
50323+#include "../../key.h"
50324+
50325+#include <linux/fs.h>
50326+#include <linux/dcache.h> /* for struct dentry */
50327+
50328+typedef struct directory_entry_format {
50329+	/* key of the object's stat-data. It's not necessary to store the
50330+	   whole key here, because it's always a stat-data key, so the minor
50331+	   packing locality and offset can be omitted. But this relies on a
50332+	   particular key allocation scheme for stat-data, so, for
50333+	   extensibility's sake, the whole key can be stored here.
50334+
50335+ We store key as array of bytes, because we don't want 8-byte
50336+ alignment of dir entries.
50337+ */
50338+ obj_key_id id;
50339+ /* file name. Null terminated string. */
50340+ d8 name[0];
50341+} directory_entry_format;
50342+
50343+void print_de(const char *prefix, coord_t * coord);
50344+int extract_key_de(const coord_t * coord, reiser4_key * key);
50345+int update_key_de(const coord_t * coord, const reiser4_key * key,
50346+ lock_handle * lh);
50347+char *extract_name_de(const coord_t * coord, char *buf);
50348+unsigned extract_file_type_de(const coord_t * coord);
50349+int add_entry_de(struct inode *dir, coord_t * coord,
50350+ lock_handle * lh, const struct dentry *name,
50351+ reiser4_dir_entry_desc * entry);
50352+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
50353+ lock_handle * lh, reiser4_dir_entry_desc * entry);
50354+int max_name_len_de(const struct inode *dir);
50355+
50356+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
50357+
50358+char *extract_dent_name(const coord_t * coord,
50359+ directory_entry_format * dent, char *buf);
50360+
50361+#if REISER4_LARGE_KEY
50362+#define DE_NAME_BUF_LEN (24)
50363+#else
50364+#define DE_NAME_BUF_LEN (16)
50365+#endif
50366+
50367+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
50368+#endif
50369+
50370+/* Make Linus happy.
50371+ Local variables:
50372+ c-indentation-style: "K&R"
50373+ mode-name: "LC"
50374+ c-basic-offset: 8
50375+ tab-width: 8
50376+ fill-column: 120
50377+ End:
50378+*/
50379diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/static_stat.c linux-2.6.23/fs/reiser4/plugin/item/static_stat.c
50380--- linux-2.6.23.orig/fs/reiser4/plugin/item/static_stat.c 1970-01-01 03:00:00.000000000 +0300
50381+++ linux-2.6.23/fs/reiser4/plugin/item/static_stat.c 2007-12-04 16:49:30.000000000 +0300
50382@@ -0,0 +1,1107 @@
50383+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50384+
50385+/* stat data manipulation. */
50386+
50387+#include "../../forward.h"
50388+#include "../../super.h"
50389+#include "../../vfs_ops.h"
50390+#include "../../inode.h"
50391+#include "../../debug.h"
50392+#include "../../dformat.h"
50393+#include "../object.h"
50394+#include "../plugin.h"
50395+#include "../plugin_header.h"
50396+#include "static_stat.h"
50397+#include "item.h"
50398+
50399+#include <linux/types.h>
50400+#include <linux/fs.h>
50401+
50402+/* see static_stat.h for explanation */
50403+
50404+/* helper function used while we are dumping/loading inode/plugin state
50405+ to/from the stat-data. */
50406+
50407+static void move_on(int *length /* space remaining in stat-data */ ,
50408+ char **area /* current coord in stat data */ ,
50409+ int size_of /* how many bytes to move forward */ )
50410+{
50411+ assert("nikita-615", length != NULL);
50412+ assert("nikita-616", area != NULL);
50413+
50414+ *length -= size_of;
50415+ *area += size_of;
50416+
50417+ assert("nikita-617", *length >= 0);
50418+}
50419+
50420+/* helper function used while loading inode/plugin state from stat-data.
50421+ Complain if there is less space in stat-data than was expected.
50422+ Can only happen on disk corruption. */
50423+static int not_enough_space(struct inode *inode /* object being processed */ ,
50424+ const char *where /* error message */ )
50425+{
50426+ assert("nikita-618", inode != NULL);
50427+
50428+ warning("nikita-619", "Not enough space in %llu while loading %s",
50429+ (unsigned long long)get_inode_oid(inode), where);
50430+
50431+ return RETERR(-EINVAL);
50432+}
50433+
50434+/* helper function used while loading inode/plugin state from
50435+ stat-data. Call it if invalid plugin id was found. */
50436+static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
50437+ struct inode *inode /* object being processed */ )
50438+{
50439+ warning("nikita-620", "Unknown plugin %i in %llu",
50440+ id, (unsigned long long)get_inode_oid(inode));
50441+
50442+ return RETERR(-EINVAL);
50443+}
50444+
50445+/* this is installed as ->init_inode() method of
50446+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
50447+ Copies data from on-disk stat-data format into inode.
50448+ Handles stat-data extensions. */
50449+/* was sd_load */
50450+int init_inode_static_sd(struct inode *inode /* object being processed */ ,
50451+ char *sd /* stat-data body */ ,
50452+ int len /* length of stat-data */ )
50453+{
50454+ int result;
50455+ int bit;
50456+ int chunk;
50457+ __u16 mask;
50458+ __u64 bigmask;
50459+ reiser4_stat_data_base *sd_base;
50460+ reiser4_inode *state;
50461+
50462+ assert("nikita-625", inode != NULL);
50463+ assert("nikita-626", sd != NULL);
50464+
50465+ result = 0;
50466+ sd_base = (reiser4_stat_data_base *) sd;
50467+ state = reiser4_inode_data(inode);
50468+ mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
50469+ bigmask = mask;
50470+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
50471+
50472+ move_on(&len, &sd, sizeof *sd_base);
50473+ for (bit = 0, chunk = 0;
50474+ mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
50475+ ++bit, mask >>= 1) {
50476+ if (((bit + 1) % 16) != 0) {
50477+ /* handle extension */
50478+ sd_ext_plugin *sdplug;
50479+
50480+ if (bit >= LAST_SD_EXTENSION) {
50481+ warning("vpf-1904",
50482+ "No such extension %i in inode %llu",
50483+ bit,
50484+ (unsigned long long)
50485+ get_inode_oid(inode));
50486+
50487+ result = RETERR(-EINVAL);
50488+ break;
50489+ }
50490+
50491+ sdplug = sd_ext_plugin_by_id(bit);
50492+ if (sdplug == NULL) {
50493+ warning("nikita-627",
50494+ "No such extension %i in inode %llu",
50495+ bit,
50496+ (unsigned long long)
50497+ get_inode_oid(inode));
50498+
50499+ result = RETERR(-EINVAL);
50500+ break;
50501+ }
50502+ if (mask & 1) {
50503+ assert("nikita-628", sdplug->present);
50504+ /* alignment is not supported in node layout
50505+ plugin yet.
50506+ result = align( inode, &len, &sd,
50507+ sdplug -> alignment );
50508+ if( result != 0 )
50509+ return result; */
50510+ result = sdplug->present(inode, &sd, &len);
50511+ } else if (sdplug->absent != NULL)
50512+ result = sdplug->absent(inode);
50513+ if (result)
50514+ break;
50515+ /* else, we are looking at the last bit in 16-bit
50516+ portion of bitmask */
50517+ } else if (mask & 1) {
50518+ /* next portion of bitmask */
50519+ if (len < (int)sizeof(d16)) {
50520+ warning("nikita-629",
50521+ "No space for bitmap in inode %llu",
50522+ (unsigned long long)
50523+ get_inode_oid(inode));
50524+
50525+ result = RETERR(-EINVAL);
50526+ break;
50527+ }
50528+ mask = le16_to_cpu(get_unaligned((d16 *)sd));
50529+ bigmask <<= 16;
50530+ bigmask |= mask;
50531+ move_on(&len, &sd, sizeof(d16));
50532+ ++chunk;
50533+ if (chunk == 3) {
50534+ if (!(mask & 0x8000)) {
50535+ /* clear last bit */
50536+ mask &= ~0x8000;
50537+ continue;
50538+ }
50539+ /* too much */
50540+ warning("nikita-630",
50541+ "Too many extensions in %llu",
50542+ (unsigned long long)
50543+ get_inode_oid(inode));
50544+
50545+ result = RETERR(-EINVAL);
50546+ break;
50547+ }
50548+ } else
50549+ /* bitmask exhausted */
50550+ break;
50551+ }
50552+ state->extmask = bigmask;
50553+ /* common initialisations */
50554+ if (len - (bit / 16 * sizeof(d16)) > 0) {
50555+ /* alignment in save_len_static_sd() is taken into account
50556+ -edward */
50557+ warning("nikita-631", "unused space in inode %llu",
50558+ (unsigned long long)get_inode_oid(inode));
50559+ }
50560+
50561+ return result;
50562+}
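+
+/* Worked example of the mask chaining above (an illustration, adding no
+   new semantics): if sd_base->extmask reads 0x8023, bits 0, 1 and 5
+   (LIGHT_WEIGHT_STAT, UNIX_STAT, FLAGS_STAT) are present, and bit 15 --
+   the last bit of the 16-bit chunk -- says that another d16 word of mask
+   follows in the stat-data body. At most 3 extra words may follow (the
+   chunk == 3 check), so @bigmask accumulates up to 64 extension bits. */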
50563+
50564+/* estimates size of stat-data required to store inode.
50565+ Installed as ->save_len() method of
50566+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
50567+/* was sd_len */
50568+int save_len_static_sd(struct inode *inode /* object being processed */ )
50569+{
50570+ unsigned int result;
50571+ __u64 mask;
50572+ int bit;
50573+
50574+ assert("nikita-632", inode != NULL);
50575+
50576+ result = sizeof(reiser4_stat_data_base);
50577+ mask = reiser4_inode_data(inode)->extmask;
50578+ for (bit = 0; mask != 0; ++bit, mask >>= 1) {
50579+ if (mask & 1) {
50580+ sd_ext_plugin *sdplug;
50581+
50582+ sdplug = sd_ext_plugin_by_id(bit);
50583+ assert("nikita-633", sdplug != NULL);
50584+			/* no alignment support
50585+ result +=
50586+ round_up( result, sdplug -> alignment ) - result; */
50587+ result += sdplug->save_len(inode);
50588+ }
50589+ }
50590+ result += bit / 16 * sizeof(d16);
50591+ return result;
50592+}
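+
+/* Size arithmetic sketch (byte counts taken from the offset comments in
+   static_stat.h): for an inode with only LIGHT_WEIGHT_STAT and UNIX_STAT
+   set, the loop ends with bit == 2, so bit / 16 == 0 extra mask words and
+   result = 2 (base) + 14 (light-weight) + 28 (unix) = 44 bytes. */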
50593+
50594+/* saves inode into stat-data.
50595+ Installed as ->save() method of
50596+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
50597+/* was sd_save */
50598+int save_static_sd(struct inode *inode /* object being processed */ ,
50599+ char **area /* where to save stat-data */ )
50600+{
50601+ int result;
50602+ __u64 emask;
50603+ int bit;
50604+ unsigned int len;
50605+ reiser4_stat_data_base *sd_base;
50606+
50607+ assert("nikita-634", inode != NULL);
50608+ assert("nikita-635", area != NULL);
50609+
50610+ result = 0;
50611+ emask = reiser4_inode_data(inode)->extmask;
50612+ sd_base = (reiser4_stat_data_base *) * area;
50613+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
50614+ /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/
50615+
50616+ *area += sizeof *sd_base;
50617+ len = 0xffffffffu;
50618+ for (bit = 0; emask != 0; ++bit, emask >>= 1) {
50619+ if (emask & 1) {
50620+ if ((bit + 1) % 16 != 0) {
50621+ sd_ext_plugin *sdplug;
50622+ sdplug = sd_ext_plugin_by_id(bit);
50623+ assert("nikita-636", sdplug != NULL);
50624+ /* no alignment support yet
50625+ align( inode, &len, area,
50626+ sdplug -> alignment ); */
50627+ result = sdplug->save(inode, area);
50628+ if (result)
50629+ break;
50630+ } else {
50631+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
50632+ (d16 *)(*area));
50633+ /*cputod16((unsigned)(emask & 0xffff),
50634+ (d16 *) * area);*/
50635+ *area += sizeof(d16);
50636+ }
50637+ }
50638+ }
50639+ return result;
50640+}
50641+
50642+/* stat-data extension handling functions. */
50643+
50644+static int present_lw_sd(struct inode *inode /* object being processed */ ,
50645+ char **area /* position in stat-data */ ,
50646+ int *len /* remaining length */ )
50647+{
50648+ if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
50649+ reiser4_light_weight_stat *sd_lw;
50650+
50651+ sd_lw = (reiser4_light_weight_stat *) * area;
50652+
50653+ inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
50654+ inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
50655+ inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
50656+ if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
50657+ inode->i_mode &= ~S_IFIFO;
50658+			warning("", "partially converted file encountered");
50659+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
50660+ }
50661+ move_on(len, area, sizeof *sd_lw);
50662+ return 0;
50663+ } else
50664+ return not_enough_space(inode, "lw sd");
50665+}
50666+
50667+static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being
50668+ * processed */ )
50669+{
50670+ return sizeof(reiser4_light_weight_stat);
50671+}
50672+
50673+static int save_lw_sd(struct inode *inode /* object being processed */ ,
50674+ char **area /* position in stat-data */ )
50675+{
50676+ reiser4_light_weight_stat *sd;
50677+ mode_t delta;
50678+
50679+ assert("nikita-2705", inode != NULL);
50680+ assert("nikita-2706", area != NULL);
50681+ assert("nikita-2707", *area != NULL);
50682+
50683+ sd = (reiser4_light_weight_stat *) * area;
50684+
50685+ delta = (reiser4_inode_get_flag(inode,
50686+ REISER4_PART_MIXED) ? S_IFIFO : 0);
50687+ put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
50688+ put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
50689+ put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
50690+ *area += sizeof *sd;
50691+ return 0;
50692+}
50693+
50694+static int present_unix_sd(struct inode *inode /* object being processed */ ,
50695+ char **area /* position in stat-data */ ,
50696+ int *len /* remaining length */ )
50697+{
50698+ assert("nikita-637", inode != NULL);
50699+ assert("nikita-638", area != NULL);
50700+ assert("nikita-639", *area != NULL);
50701+ assert("nikita-640", len != NULL);
50702+ assert("nikita-641", *len > 0);
50703+
50704+ if (*len >= (int)sizeof(reiser4_unix_stat)) {
50705+ reiser4_unix_stat *sd;
50706+
50707+ sd = (reiser4_unix_stat *) * area;
50708+
50709+ inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
50710+ inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
50711+ inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
50712+ inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
50713+ inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
50714+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
50715+ inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
50716+ else
50717+ inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
50718+ move_on(len, area, sizeof *sd);
50719+ return 0;
50720+ } else
50721+ return not_enough_space(inode, "unix sd");
50722+}
50723+
50724+static int absent_unix_sd(struct inode *inode /* object being processed */ )
50725+{
50726+ inode->i_uid = get_super_private(inode->i_sb)->default_uid;
50727+ inode->i_gid = get_super_private(inode->i_sb)->default_gid;
50728+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
50729+ inode_set_bytes(inode, inode->i_size);
50730+ /* mark inode as lightweight, so that caller (lookup_common) will
50731+ complete initialisation by copying [ug]id from a parent. */
50732+ reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
50733+ return 0;
50734+}
50735+
50736+/* Audited by: green(2002.06.14) */
50737+static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being
50738+ * processed */ )
50739+{
50740+ return sizeof(reiser4_unix_stat);
50741+}
50742+
50743+static int save_unix_sd(struct inode *inode /* object being processed */ ,
50744+ char **area /* position in stat-data */ )
50745+{
50746+ reiser4_unix_stat *sd;
50747+
50748+ assert("nikita-642", inode != NULL);
50749+ assert("nikita-643", area != NULL);
50750+ assert("nikita-644", *area != NULL);
50751+
50752+ sd = (reiser4_unix_stat *) * area;
50753+ put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
50754+ put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
50755+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
50756+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
50757+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
50758+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
50759+ put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
50760+ else
50761+ put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
50762+ *area += sizeof *sd;
50763+ return 0;
50764+}
50765+
50766+static int
50767+present_large_times_sd(struct inode *inode /* object being processed */ ,
50768+ char **area /* position in stat-data */ ,
50769+ int *len /* remaining length */ )
50770+{
50771+ if (*len >= (int)sizeof(reiser4_large_times_stat)) {
50772+ reiser4_large_times_stat *sd_lt;
50773+
50774+ sd_lt = (reiser4_large_times_stat *) * area;
50775+
50776+ inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
50777+ inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
50778+ inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
50779+
50780+ move_on(len, area, sizeof *sd_lt);
50781+ return 0;
50782+ } else
50783+ return not_enough_space(inode, "large times sd");
50784+}
50785+
50786+static int
50787+save_len_large_times_sd(struct inode *inode UNUSED_ARG
50788+ /* object being processed */ )
50789+{
50790+ return sizeof(reiser4_large_times_stat);
50791+}
50792+
50793+static int
50794+save_large_times_sd(struct inode *inode /* object being processed */ ,
50795+ char **area /* position in stat-data */ )
50796+{
50797+ reiser4_large_times_stat *sd;
50798+
50799+ assert("nikita-2817", inode != NULL);
50800+ assert("nikita-2818", area != NULL);
50801+ assert("nikita-2819", *area != NULL);
50802+
50803+ sd = (reiser4_large_times_stat *) * area;
50804+
50805+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
50806+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
50807+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
50808+
50809+ *area += sizeof *sd;
50810+ return 0;
50811+}
50812+
50813+/* symlink stat data extension */
50814+
50815+/* allocate memory for symlink target and attach it to inode->i_private */
50816+static int
50817+symlink_target_to_inode(struct inode *inode, const char *target, int len)
50818+{
50819+ assert("vs-845", inode->i_private == NULL);
50820+ assert("vs-846", !reiser4_inode_get_flag(inode,
50821+ REISER4_GENERIC_PTR_USED));
50822+ /* FIXME-VS: this is prone to deadlock. Not more than other similar
50823+ places, though */
50824+ inode->i_private = kmalloc((size_t) len + 1,
50825+ reiser4_ctx_gfp_mask_get());
50826+ if (!inode->i_private)
50827+ return RETERR(-ENOMEM);
50828+
50829+ memcpy((char *)(inode->i_private), target, (size_t) len);
50830+ ((char *)(inode->i_private))[len] = 0;
50831+ reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
50832+ return 0;
50833+}
50834+
50835+/* this is called on read_inode. There is nothing to do here, actually,
50836+   apart from some sanity checks */
50837+static int present_symlink_sd(struct inode *inode, char **area, int *len)
50838+{
50839+ int result;
50840+ int length;
50841+ reiser4_symlink_stat *sd;
50842+
50843+ length = (int)inode->i_size;
50844+ /*
50845+	 * *len is the number of bytes in the stat data item from *area to the end
50846+	 * of the item. It must be no less than the symlink size + 1 for the ending 0
50847+ */
50848+ if (length > *len)
50849+ return not_enough_space(inode, "symlink");
50850+
50851+ if (*(*area + length) != 0) {
50852+ warning("vs-840", "Symlink is not zero terminated");
50853+ return RETERR(-EIO);
50854+ }
50855+
50856+ sd = (reiser4_symlink_stat *) * area;
50857+ result = symlink_target_to_inode(inode, sd->body, length);
50858+
50859+ move_on(len, area, length + 1);
50860+ return result;
50861+}
50862+
50863+static int save_len_symlink_sd(struct inode *inode)
50864+{
50865+ return inode->i_size + 1;
50866+}
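+
+/* Example: a symlink whose target is "/tmp/a.txt" has i_size == 10, so its
+   stat-data extension takes 11 bytes -- the body plus the terminating 0
+   that present_symlink_sd() checks for. */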
50867+
50868+/* this is called when stat data is created or updated. On update there is
50869+   nothing to do except advance @area */
50870+static int save_symlink_sd(struct inode *inode, char **area)
50871+{
50872+ int result;
50873+ int length;
50874+ reiser4_symlink_stat *sd;
50875+
50876+ length = (int)inode->i_size;
50877+ /* inode->i_size must be set already */
50878+ assert("vs-841", length);
50879+
50880+ result = 0;
50881+ sd = (reiser4_symlink_stat *) * area;
50882+ if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
50883+ const char *target;
50884+
50885+ target = (const char *)(inode->i_private);
50886+ inode->i_private = NULL;
50887+
50888+ result = symlink_target_to_inode(inode, target, length);
50889+
50890+ /* copy symlink to stat data */
50891+ memcpy(sd->body, target, (size_t) length);
50892+ (*area)[length] = 0;
50893+ } else {
50894+ /* there is nothing to do in update but move area */
50895+ assert("vs-844",
50896+ !memcmp(inode->i_private, sd->body,
50897+ (size_t) length + 1));
50898+ }
50899+
50900+ *area += (length + 1);
50901+ return result;
50902+}
50903+
50904+static int present_flags_sd(struct inode *inode /* object being processed */ ,
50905+ char **area /* position in stat-data */ ,
50906+ int *len /* remaining length */ )
50907+{
50908+ assert("nikita-645", inode != NULL);
50909+ assert("nikita-646", area != NULL);
50910+ assert("nikita-647", *area != NULL);
50911+ assert("nikita-648", len != NULL);
50912+ assert("nikita-649", *len > 0);
50913+
50914+ if (*len >= (int)sizeof(reiser4_flags_stat)) {
50915+ reiser4_flags_stat *sd;
50916+
50917+ sd = (reiser4_flags_stat *) * area;
50918+ inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
50919+ move_on(len, area, sizeof *sd);
50920+ return 0;
50921+ } else
50922+ return not_enough_space(inode, "generation and attrs");
50923+}
50924+
50925+/* Audited by: green(2002.06.14) */
50926+static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being
50927+ * processed */ )
50928+{
50929+ return sizeof(reiser4_flags_stat);
50930+}
50931+
50932+static int save_flags_sd(struct inode *inode /* object being processed */ ,
50933+ char **area /* position in stat-data */ )
50934+{
50935+ reiser4_flags_stat *sd;
50936+
50937+ assert("nikita-650", inode != NULL);
50938+ assert("nikita-651", area != NULL);
50939+ assert("nikita-652", *area != NULL);
50940+
50941+ sd = (reiser4_flags_stat *) * area;
50942+ put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
50943+ *area += sizeof *sd;
50944+ return 0;
50945+}
50946+
50947+static int absent_plugin_sd(struct inode *inode);
50948+static int present_plugin_sd(struct inode *inode /* object being processed */ ,
50949+ char **area /* position in stat-data */ ,
50950+ int *len /* remaining length */,
50951+ int is_pset /* 1 if plugin set, 0 if heir set. */)
50952+{
50953+ reiser4_plugin_stat *sd;
50954+ reiser4_plugin *plugin;
50955+ reiser4_inode *info;
50956+ int i;
50957+ __u16 mask;
50958+ int result;
50959+ int num_of_plugins;
50960+
50961+ assert("nikita-653", inode != NULL);
50962+ assert("nikita-654", area != NULL);
50963+ assert("nikita-655", *area != NULL);
50964+ assert("nikita-656", len != NULL);
50965+ assert("nikita-657", *len > 0);
50966+
50967+ if (*len < (int)sizeof(reiser4_plugin_stat))
50968+ return not_enough_space(inode, "plugin");
50969+
50970+ sd = (reiser4_plugin_stat *) * area;
50971+ info = reiser4_inode_data(inode);
50972+
50973+ mask = 0;
50974+ num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
50975+ move_on(len, area, sizeof *sd);
50976+ result = 0;
50977+ for (i = 0; i < num_of_plugins; ++i) {
50978+ reiser4_plugin_slot *slot;
50979+ reiser4_plugin_type type;
50980+ pset_member memb;
50981+
50982+ slot = (reiser4_plugin_slot *) * area;
50983+ if (*len < (int)sizeof *slot)
50984+ return not_enough_space(inode, "additional plugin");
50985+
50986+ memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
50987+ type = aset_member_to_type_unsafe(memb);
50988+
50989+ if (type == REISER4_PLUGIN_TYPES) {
50990+ warning("nikita-3502",
50991+ "wrong %s member (%i) for %llu", is_pset ?
50992+ "pset" : "hset", memb,
50993+ (unsigned long long)get_inode_oid(inode));
50994+ return RETERR(-EINVAL);
50995+ }
50996+ plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode),
50997+ type, &slot->id);
50998+ if (plugin == NULL)
50999+ return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
51000+
51001+ /* plugin is loaded into inode, mark this into inode's
51002+ bitmask of loaded non-standard plugins */
51003+ if (!(mask & (1 << memb))) {
51004+ mask |= (1 << memb);
51005+ } else {
51006+ warning("nikita-658", "duplicate plugin for %llu",
51007+ (unsigned long long)get_inode_oid(inode));
51008+ return RETERR(-EINVAL);
51009+ }
51010+ move_on(len, area, sizeof *slot);
51011+ /* load plugin data, if any */
51012+ if (plugin->h.pops != NULL && plugin->h.pops->load)
51013+ result = plugin->h.pops->load(inode, plugin, area, len);
51014+ else
51015+ result = aset_set_unsafe(is_pset ? &info->pset :
51016+ &info->hset, memb, plugin);
51017+ if (result)
51018+ return result;
51019+ }
51020+ if (is_pset) {
51021+ /* if object plugin wasn't loaded from stat-data, guess it by
51022+ mode bits */
51023+ plugin = file_plugin_to_plugin(inode_file_plugin(inode));
51024+ if (plugin == NULL)
51025+ result = absent_plugin_sd(inode);
51026+ info->plugin_mask = mask;
51027+ } else
51028+ info->heir_mask = mask;
51029+
51030+ return result;
51031+}
51032+
51033+static int present_pset_sd(struct inode *inode, char **area, int *len) {
51034+ return present_plugin_sd(inode, area, len, 1 /* pset */);
51035+}
51036+
51037+/* Determine object plugin for @inode based on i_mode.
51038+
51039+ Many objects in reiser4 file system are controlled by standard object
51040+ plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
51041+
51042+ For such files we don't explicitly store plugin id in object stat
51043+   data. Rather, the required plugin is guessed from the mode bits, where the file "type"
51044+ is encoded (see stat(2)).
51045+*/
51046+static int
51047+guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
51048+{
51049+ int fplug_id;
51050+ int dplug_id;
51051+ reiser4_inode *info;
51052+
51053+ assert("nikita-736", inode != NULL);
51054+
51055+ dplug_id = fplug_id = -1;
51056+
51057+ switch (inode->i_mode & S_IFMT) {
51058+ case S_IFSOCK:
51059+ case S_IFBLK:
51060+ case S_IFCHR:
51061+ case S_IFIFO:
51062+ fplug_id = SPECIAL_FILE_PLUGIN_ID;
51063+ break;
51064+ case S_IFLNK:
51065+ fplug_id = SYMLINK_FILE_PLUGIN_ID;
51066+ break;
51067+ case S_IFDIR:
51068+ fplug_id = DIRECTORY_FILE_PLUGIN_ID;
51069+ dplug_id = HASHED_DIR_PLUGIN_ID;
51070+ break;
51071+ default:
51072+ warning("nikita-737", "wrong file mode: %o", inode->i_mode);
51073+ return RETERR(-EIO);
51074+ case S_IFREG:
51075+ fplug_id = UNIX_FILE_PLUGIN_ID;
51076+ break;
51077+ }
51078+ info = reiser4_inode_data(inode);
51079+ set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ?
51080+ plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL);
51081+ set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ?
51082+ plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL);
51083+ return 0;
51084+}
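+
+/* The resulting mapping, restated for clarity: S_IFREG -> UNIX_FILE_PLUGIN_ID,
+   S_IFLNK -> SYMLINK_FILE_PLUGIN_ID, S_IFDIR -> DIRECTORY_FILE_PLUGIN_ID plus
+   HASHED_DIR_PLUGIN_ID as the directory plugin, and device/fifo/socket modes
+   -> SPECIAL_FILE_PLUGIN_ID. Any other mode is treated as corruption. */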
51085+
51086+/* Audited by: green(2002.06.14) */
51087+static int absent_plugin_sd(struct inode *inode /* object being processed */ )
51088+{
51089+ int result;
51090+
51091+ assert("nikita-659", inode != NULL);
51092+
51093+ result = guess_plugin_by_mode(inode);
51094+ /* if mode was wrong, guess_plugin_by_mode() returns "regular file",
51095+ but setup_inode_ops() will call make_bad_inode().
51096+	   Another, more logical but a bit more complex solution is to add
51097+ "bad-file plugin". */
51098+ /* FIXME-VS: activate was called here */
51099+ return result;
51100+}
51101+
51102+/* helper function for save_len_plugin_sd(): calculate how much space
51103+   is required to save the state of a given plugin */
51104+/* Audited by: green(2002.06.14) */
51105+static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
51106+ struct inode *inode /* object being processed */ ,
51107+ pset_member memb,
51108+ int len, int is_pset)
51109+{
51110+ reiser4_inode *info;
51111+ assert("nikita-661", inode != NULL);
51112+
51113+ if (plugin == NULL)
51114+ return len;
51115+
51116+ info = reiser4_inode_data(inode);
51117+ if (is_pset ?
51118+ info->plugin_mask & (1 << memb) :
51119+ info->heir_mask & (1 << memb)) {
51120+ len += sizeof(reiser4_plugin_slot);
51121+ if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
51122+ /* non-standard plugin, call method */
51123+ /* commented as it is incompatible with alignment
51124+ * policy in save_plug() -edward */
51125+ /* len = round_up(len, plugin->h.pops->alignment); */
51126+ len += plugin->h.pops->save_len(inode, plugin);
51127+ }
51128+ }
51129+ return len;
51130+}
51131+
51132+/* calculate how much space is required to save state of all plugins,
51133+ associated with inode */
51134+static int save_len_plugin_sd(struct inode *inode /* object being processed */,
51135+ int is_pset)
51136+{
51137+ int len;
51138+ int last;
51139+ reiser4_inode *state;
51140+ pset_member memb;
51141+
51142+ assert("nikita-663", inode != NULL);
51143+
51144+ state = reiser4_inode_data(inode);
51145+
51146+ /* common case: no non-standard plugins */
51147+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
51148+ return 0;
51149+ len = sizeof(reiser4_plugin_stat);
51150+ last = PSET_LAST;
51151+
51152+ for (memb = 0; memb < last; ++memb) {
51153+ len = len_for(aset_get(is_pset ? state->pset : state->hset, memb),
51154+ inode, memb, len, is_pset);
51155+ }
51156+ assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
51157+ return len;
51158+}
51159+
51160+static int save_len_pset_sd(struct inode *inode) {
51161+ return save_len_plugin_sd(inode, 1 /* pset */);
51162+}
51163+
51164+/* helper function for save_plugin_sd(): save a plugin associated with
51165+   the inode */
51166+static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
51167+ struct inode *inode /* object being processed */ ,
51168+ int memb /* what element of pset is saved */ ,
51169+ char **area /* position in stat-data */ ,
51170+		     int *count /* incremented if the plugin was actually saved */,
51171+ int is_pset /* 1 for plugin set, 0 for heir set */)
51172+{
51173+ reiser4_plugin_slot *slot;
51174+ int fake_len;
51175+ int result;
51176+
51177+ assert("nikita-665", inode != NULL);
51178+ assert("nikita-666", area != NULL);
51179+ assert("nikita-667", *area != NULL);
51180+
51181+ if (plugin == NULL)
51182+ return 0;
51183+
51184+ if (is_pset ?
51185+ !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) :
51186+ !(reiser4_inode_data(inode)->heir_mask & (1 << memb)))
51187+ return 0;
51188+ slot = (reiser4_plugin_slot *) * area;
51189+ put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
51190+ put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
51191+ fake_len = (int)0xffff;
51192+ move_on(&fake_len, area, sizeof *slot);
51193+ ++*count;
51194+ result = 0;
51195+ if (plugin->h.pops != NULL) {
51196+ if (plugin->h.pops->save != NULL)
51197+ result = plugin->h.pops->save(inode, plugin, area);
51198+ }
51199+ return result;
51200+}
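+
+/* Note on the fake_len trick above (inferred from the code, not documented
+   elsewhere): move_on() decrements the remaining length and asserts that it
+   stays non-negative. When saving, the space was already reserved according
+   to save_len_plugin_sd(), so a large dummy length (0xffff) lets move_on()
+   be reused purely to advance *area. */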
51201+
51202+/* save state of all non-standard plugins associated with inode */
51203+static int save_plugin_sd(struct inode *inode /* object being processed */ ,
51204+ char **area /* position in stat-data */,
51205+ int is_pset /* 1 for pset, 0 for hset */)
51206+{
51207+ int fake_len;
51208+ int result = 0;
51209+ int num_of_plugins;
51210+ reiser4_plugin_stat *sd;
51211+ reiser4_inode *state;
51212+ pset_member memb;
51213+
51214+ assert("nikita-669", inode != NULL);
51215+ assert("nikita-670", area != NULL);
51216+ assert("nikita-671", *area != NULL);
51217+
51218+ state = reiser4_inode_data(inode);
51219+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
51220+ return 0;
51221+ sd = (reiser4_plugin_stat *) * area;
51222+ fake_len = (int)0xffff;
51223+ move_on(&fake_len, area, sizeof *sd);
51224+
51225+ num_of_plugins = 0;
51226+ for (memb = 0; memb < PSET_LAST; ++memb) {
51227+ result = save_plug(aset_get(is_pset ? state->pset : state->hset,
51228+ memb),
51229+ inode, memb, area, &num_of_plugins, is_pset);
51230+ if (result != 0)
51231+ break;
51232+ }
51233+
51234+ put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
51235+ return result;
51236+}
51237+
51238+static int save_pset_sd(struct inode *inode, char **area) {
51239+ return save_plugin_sd(inode, area, 1 /* pset */);
51240+}
51241+
51242+static int present_hset_sd(struct inode *inode, char **area, int *len) {
51243+ return present_plugin_sd(inode, area, len, 0 /* hset */);
51244+}
51245+
51246+static int save_len_hset_sd(struct inode *inode) {
51247+	return save_len_plugin_sd(inode, 0 /* hset */);
51248+}
51249+
51250+static int save_hset_sd(struct inode *inode, char **area) {
51251+ return save_plugin_sd(inode, area, 0 /* hset */);
51252+}
51253+
51254+/* helper function for crypto_sd_present(), crypto_sd_save.
51255+ Extract crypto info from stat-data and attach it to inode */
51256+static int extract_crypto_info (struct inode * inode,
51257+ reiser4_crypto_stat * sd)
51258+{
51259+ struct reiser4_crypto_info * info;
51260+ assert("edward-11", !inode_crypto_info(inode));
51261+ assert("edward-1413",
51262+ !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
51263+ /* create and attach a crypto-stat without secret key loaded */
51264+ info = reiser4_alloc_crypto_info(inode);
51265+ if (IS_ERR(info))
51266+ return PTR_ERR(info);
51267+ info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
51268+ memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
51269+ reiser4_attach_crypto_info(inode, info);
51270+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
51271+ return 0;
51272+}
51273+
51274+/* crypto stat-data extension */
51275+
51276+static int present_crypto_sd(struct inode *inode, char **area, int *len)
51277+{
51278+ int result;
51279+ reiser4_crypto_stat *sd;
51280+ digest_plugin *dplug = inode_digest_plugin(inode);
51281+
51282+ assert("edward-06", dplug != NULL);
51283+ assert("edward-684", dplug->fipsize);
51284+ assert("edward-07", area != NULL);
51285+ assert("edward-08", *area != NULL);
51286+ assert("edward-09", len != NULL);
51287+ assert("edward-10", *len > 0);
51288+
51289+ if (*len < (int)sizeof(reiser4_crypto_stat)) {
51290+ return not_enough_space(inode, "crypto-sd");
51291+ }
51292+	/* *len is the number of bytes in the stat data item from *area to the
51293+	   end of the item. It must be no less than the size of this extension */
51294+ assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
51295+
51296+ sd = (reiser4_crypto_stat *) * area;
51297+ result = extract_crypto_info(inode, sd);
51298+ move_on(len, area, sizeof(*sd) + dplug->fipsize);
51299+
51300+ return result;
51301+}
51302+
51303+static int save_len_crypto_sd(struct inode *inode)
51304+{
51305+ return sizeof(reiser4_crypto_stat) +
51306+ inode_digest_plugin(inode)->fipsize;
51307+}
51308+
51309+static int save_crypto_sd(struct inode *inode, char **area)
51310+{
51311+ int result = 0;
51312+ reiser4_crypto_stat *sd;
51313+ struct reiser4_crypto_info * info = inode_crypto_info(inode);
51314+ digest_plugin *dplug = inode_digest_plugin(inode);
51315+
51316+ assert("edward-12", dplug != NULL);
51317+ assert("edward-13", area != NULL);
51318+ assert("edward-14", *area != NULL);
51319+ assert("edward-15", info != NULL);
51320+ assert("edward-1414", info->keyid != NULL);
51321+ assert("edward-1415", info->keysize != 0);
51322+ assert("edward-76", reiser4_inode_data(inode) != NULL);
51323+
51324+ if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
51325+ /* file is just created */
51326+ sd = (reiser4_crypto_stat *) *area;
51327+ /* copy everything but private key to the disk stat-data */
51328+ put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
51329+ memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
51330+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
51331+ }
51332+ *area += (sizeof(*sd) + dplug->fipsize);
51333+ return result;
51334+}
51335+
51336+static int eio(struct inode *inode, char **area, int *len)
51337+{
51338+ return RETERR(-EIO);
51339+}
51340+
51341+sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
51342+ [LIGHT_WEIGHT_STAT] = {
51343+ .h = {
51344+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51345+ .id = LIGHT_WEIGHT_STAT,
51346+ .pops = NULL,
51347+ .label = "light-weight sd",
51348+ .desc = "sd for light-weight files",
51349+ .linkage = {NULL,NULL}
51350+ },
51351+ .present = present_lw_sd,
51352+ .absent = NULL,
51353+ .save_len = save_len_lw_sd,
51354+ .save = save_lw_sd,
51355+ .alignment = 8
51356+ },
51357+ [UNIX_STAT] = {
51358+ .h = {
51359+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51360+ .id = UNIX_STAT,
51361+ .pops = NULL,
51362+ .label = "unix-sd",
51363+ .desc = "unix stat-data fields",
51364+ .linkage = {NULL,NULL}
51365+ },
51366+ .present = present_unix_sd,
51367+ .absent = absent_unix_sd,
51368+ .save_len = save_len_unix_sd,
51369+ .save = save_unix_sd,
51370+ .alignment = 8
51371+ },
51372+ [LARGE_TIMES_STAT] = {
51373+ .h = {
51374+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51375+ .id = LARGE_TIMES_STAT,
51376+ .pops = NULL,
51377+ .label = "64time-sd",
51378+ .desc = "nanosecond resolution for times",
51379+ .linkage = {NULL,NULL}
51380+ },
51381+ .present = present_large_times_sd,
51382+ .absent = NULL,
51383+ .save_len = save_len_large_times_sd,
51384+ .save = save_large_times_sd,
51385+ .alignment = 8
51386+ },
51387+ [SYMLINK_STAT] = {
51388+ /* stat data of symlink has this extension */
51389+ .h = {
51390+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51391+ .id = SYMLINK_STAT,
51392+ .pops = NULL,
51393+ .label = "symlink-sd",
51394+ .desc =
51395+ "stat data is appended with symlink name",
51396+ .linkage = {NULL,NULL}
51397+ },
51398+ .present = present_symlink_sd,
51399+ .absent = NULL,
51400+ .save_len = save_len_symlink_sd,
51401+ .save = save_symlink_sd,
51402+ .alignment = 8
51403+ },
51404+ [PLUGIN_STAT] = {
51405+ .h = {
51406+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51407+ .id = PLUGIN_STAT,
51408+ .pops = NULL,
51409+ .label = "plugin-sd",
51410+ .desc = "plugin stat-data fields",
51411+ .linkage = {NULL,NULL}
51412+ },
51413+ .present = present_pset_sd,
51414+ .absent = absent_plugin_sd,
51415+ .save_len = save_len_pset_sd,
51416+ .save = save_pset_sd,
51417+ .alignment = 8
51418+ },
51419+ [HEIR_STAT] = {
51420+ .h = {
51421+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51422+ .id = HEIR_STAT,
51423+ .pops = NULL,
51424+ .label = "heir-plugin-sd",
51425+ .desc = "heir plugin stat-data fields",
51426+ .linkage = {NULL,NULL}
51427+ },
51428+ .present = present_hset_sd,
51429+ .absent = NULL,
51430+ .save_len = save_len_hset_sd,
51431+ .save = save_hset_sd,
51432+ .alignment = 8
51433+ },
51434+ [FLAGS_STAT] = {
51435+ .h = {
51436+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51437+ .id = FLAGS_STAT,
51438+ .pops = NULL,
51439+ .label = "flags-sd",
51440+ .desc = "inode bit flags",
51441+ .linkage = {NULL, NULL}
51442+ },
51443+ .present = present_flags_sd,
51444+ .absent = NULL,
51445+ .save_len = save_len_flags_sd,
51446+ .save = save_flags_sd,
51447+ .alignment = 8
51448+ },
51449+ [CAPABILITIES_STAT] = {
51450+ .h = {
51451+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51452+ .id = CAPABILITIES_STAT,
51453+ .pops = NULL,
51454+ .label = "capabilities-sd",
51455+ .desc = "capabilities",
51456+ .linkage = {NULL, NULL}
51457+ },
51458+ .present = eio,
51459+ .absent = NULL,
51460+ .save_len = save_len_flags_sd,
51461+ .save = save_flags_sd,
51462+ .alignment = 8
51463+ },
51464+ [CRYPTO_STAT] = {
51465+ .h = {
51466+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51467+ .id = CRYPTO_STAT,
51468+ .pops = NULL,
51469+ .label = "crypto-sd",
51470+ .desc = "secret key size and id",
51471+ .linkage = {NULL, NULL}
51472+ },
51473+ .present = present_crypto_sd,
51474+ .absent = NULL,
51475+ .save_len = save_len_crypto_sd,
51476+ .save = save_crypto_sd,
51477+ .alignment = 8
51478+ }
51479+};
51480+
51481+/* Make Linus happy.
51482+ Local variables:
51483+ c-indentation-style: "K&R"
51484+ mode-name: "LC"
51485+ c-basic-offset: 8
51486+ tab-width: 8
51487+ fill-column: 120
51488+ End:
51489+*/
51490diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/static_stat.h linux-2.6.23/fs/reiser4/plugin/item/static_stat.h
51491--- linux-2.6.23.orig/fs/reiser4/plugin/item/static_stat.h 1970-01-01 03:00:00.000000000 +0300
51492+++ linux-2.6.23/fs/reiser4/plugin/item/static_stat.h 2007-12-04 16:49:30.000000000 +0300
51493@@ -0,0 +1,224 @@
51494+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51495+
51496+/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
51497+
51498+In the common case, where a file has no more than the fields needed by the
51499+stat() syscall, it is more compact to store those fields in this
51500+struct.
51501+
51502+If this item does not exist, then all stats are dynamically resolved.
51503+At the moment, we either resolve all stats dynamically or all of them
51504+statically. If you think this is not fully optimal, and the rest of
51505+reiser4 is working, then fix it...:-)
51506+
51507+*/
51508+
51509+#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
51510+#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
51511+
51512+#include "../../forward.h"
51513+#include "../../dformat.h"
51514+
51515+#include <linux/fs.h> /* for struct inode */
51516+
51517+/* Stat data layout: goals and implementation.
51518+
51519+ We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
51520+ them, including not having semantic metadata attached to them.
51521+
51522+ There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
51523+ want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
51524+ sized structure because the statically sized structure knows without recording it what the names and lengths of the
51525+ attributes are.
51526+
51527+ This leads to a natural compromise, which is to special case those files which have simply the standard unix file
51528+ attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
51529+ file in their use of file attributes.
51530+
51531+ Yet this compromise deserves to be compromised a little.
51532+
51533+ We accommodate the case where you have no more than the standard unix file attributes by using an "extension
51534+   bitmask": each bit in it indicates the presence or absence of a particular stat data extension (see sd_ext_bits enum).
51535+
51536+   If the first bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited
51537+   from the parent directory (such as uid and gid) or initialised to some sane values.
51538+
51539+ To capitalize on existing code infrastructure, extensions are
51540+ implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
51541+ Each stat-data extension plugin implements four methods:
51542+
51543+ ->present() called by sd_load() when this extension is found in stat-data
51544+ ->absent() called by sd_load() when this extension is not found in stat-data
51545+ ->save_len() called by sd_len() to calculate total length of stat-data
51546+ ->save() called by sd_save() to store extension data into stat-data
51547+
51548+ Implementation is in fs/reiser4/plugin/item/static_stat.c
51549+*/
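+
+/* A minimal sketch of an extension built on the four methods above (the
+   names are invented here purely for illustration; the real, complete
+   entries live in the sd_ext_plugins[] array in
+   fs/reiser4/plugin/item/static_stat.c and also carry an .h header and an
+   .alignment field):
+
+	[FOO_STAT] = {
+		.present  = present_foo_sd,   parse bytes at *area into inode
+		.absent   = absent_foo_sd,    set defaults when the bit is clear
+		.save_len = save_len_foo_sd,  report bytes needed on disk
+		.save     = save_foo_sd,      write the bytes, advance *area
+	},
+*/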
51550+
51551+/* stat-data extension. Please order this by presumed frequency of use */
51552+typedef enum {
51553+ /* support for light-weight files */
51554+ LIGHT_WEIGHT_STAT,
51555+ /* data required to implement unix stat(2) call. Layout is in
51556+ reiser4_unix_stat. If this is not present, file is light-weight */
51557+ UNIX_STAT,
51558+ /* this contains additional set of 32bit [anc]time fields to implement
51559+	   nanosecond resolution. Layout is in reiser4_large_times_stat. Use
51560+	   of this extension is governed by the 32bittimes mount option. */
51561+ LARGE_TIMES_STAT,
51562+ /* stat data has link name included */
51563+ SYMLINK_STAT,
51564+ /* on-disk slots of non-standard plugins for main plugin table
51565+ (@reiser4_inode->pset), that is, plugins that cannot be deduced
51566+ from file mode bits), for example, aggregation, interpolation etc. */
51567+ PLUGIN_STAT,
51568+ /* this extension contains persistent inode flags. These flags are
51569+	   single bits: immutable, append-only, etc. Layout is in
51570+ reiser4_flags_stat. */
51571+ FLAGS_STAT,
51572+ /* this extension contains capabilities sets, associated with this
51573+ file. Layout is in reiser4_capabilities_stat */
51574+ CAPABILITIES_STAT,
51575+ /* this extension contains size and public id of the secret key.
51576+ Layout is in reiser4_crypto_stat */
51577+ CRYPTO_STAT,
51578+ /* on-disk slots of non-default plugins for inheritance, which
51579+ are extracted to special plugin table (@reiser4_inode->hset).
51580+ By default, children of the object will inherit plugins from
51581+ its main plugin table (pset). */
51582+ HEIR_STAT,
51583+ LAST_SD_EXTENSION,
51584+ /*
51585+ * init_inode_static_sd() iterates over extension mask until all
51586+	 * non-zero bits are processed. This means that neither the ->present()
51587+	 * nor the ->absent() method will be called for stat-data extensions that
51588+	 * come after the last present extension. But for some basic extensions
51589+	 * we want either ->absent() or ->present() to be called, because these
51590+ * extensions set up something in inode even when they are not
51591+ * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
51592+ * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
51593+ * ->present(), or ->absent() method will be called, independently of
51594+ * what other extensions are present.
51595+ */
51596+ LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT
51597+} sd_ext_bits;
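+
+/* Concrete consequence (this follows from the loop in
+   init_inode_static_sd()): a light-weight file whose UNIX_STAT bit is clear
+   still gets absent_unix_sd() called -- setting default uid/gid and the
+   REISER4_LIGHT_WEIGHT flag -- because UNIX_STAT precedes
+   LAST_IMPORTANT_SD_EXTENSION (== PLUGIN_STAT). */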
51598+
51599+/* minimal stat-data. This makes it possible to support light-weight files. */
51600+typedef struct reiser4_stat_data_base {
51601+ /* 0 */ __le16 extmask;
51602+ /* 2 */
51603+} PACKED reiser4_stat_data_base;
51604+
51605+typedef struct reiser4_light_weight_stat {
51606+ /* 0 */ __le16 mode;
51607+ /* 2 */ __le32 nlink;
51608+ /* 6 */ __le64 size;
51609+ /* size in bytes */
51610+ /* 14 */
51611+} PACKED reiser4_light_weight_stat;
51612+
51613+typedef struct reiser4_unix_stat {
51614+ /* owner id */
51615+ /* 0 */ __le32 uid;
51616+ /* group id */
51617+ /* 4 */ __le32 gid;
51618+ /* access time */
51619+ /* 8 */ __le32 atime;
51620+ /* modification time */
51621+ /* 12 */ __le32 mtime;
51622+ /* change time */
51623+ /* 16 */ __le32 ctime;
51624+ union {
51625+ /* minor:major for device files */
51626+ /* 20 */ __le64 rdev;
51627+ /* bytes used by file */
51628+ /* 20 */ __le64 bytes;
51629+ } u;
51630+ /* 28 */
51631+} PACKED reiser4_unix_stat;
51632+
51633+/* symlink stored as part of inode */
51634+typedef struct reiser4_symlink_stat {
51635+ char body[0];
51636+} PACKED reiser4_symlink_stat;
51637+
51638+typedef struct reiser4_plugin_slot {
51639+ /* 0 */ __le16 pset_memb;
51640+ /* 2 */ __le16 id;
51641+ /* 4 *//* here plugin stores its persistent state */
51642+} PACKED reiser4_plugin_slot;
51643+
51644+/* stat-data extension for files with non-standard plugin. */
51645+typedef struct reiser4_plugin_stat {
51646+ /* number of additional plugins, associated with this object */
51647+ /* 0 */ __le16 plugins_no;
51648+ /* 2 */ reiser4_plugin_slot slot[0];
51649+ /* 2 */
51650+} PACKED reiser4_plugin_stat;
51651+
51652+/* stat-data extension for inode flags. Currently it is just fixed-width 32
51653+ * bit mask. If need arise, this can be replaced with variable width
51654+ * bitmask. */
51655+typedef struct reiser4_flags_stat {
51656+ /* 0 */ __le32 flags;
51657+ /* 4 */
51658+} PACKED reiser4_flags_stat;
51659+
51660+typedef struct reiser4_capabilities_stat {
51661+ /* 0 */ __le32 effective;
51662+	/* 4 */ __le32 permitted;
51663+	/* 8 */
51664+} PACKED reiser4_capabilities_stat;
51665+
51666+typedef struct reiser4_cluster_stat {
51667+/* this defines the cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster_shift */
51668+ /* 0 */ d8 cluster_shift;
51669+ /* 1 */
51670+} PACKED reiser4_cluster_stat;
51671+
51672+typedef struct reiser4_crypto_stat {
51673+ /* secret key size, bits */
51674+ /* 0 */ d16 keysize;
51675+ /* secret key id */
51676+ /* 2 */ d8 keyid[0];
51677+ /* 2 */
51678+} PACKED reiser4_crypto_stat;
51679+
51680+typedef struct reiser4_large_times_stat {
51681+ /* access time */
51682+ /* 0 */ d32 atime;
51683+ /* modification time */
51684+ /* 4 */ d32 mtime;
51685+ /* change time */
51686+ /* 8 */ d32 ctime;
51687+ /* 12 */
51688+} PACKED reiser4_large_times_stat;
51689+
51690+/* this structure is filled by sd_item_stat */
51691+typedef struct sd_stat {
51692+ int dirs;
51693+ int files;
51694+ int others;
51695+} sd_stat;
51696+
51697+/* plugin->item.common.* */
51698+extern void print_sd(const char *prefix, coord_t * coord);
51699+extern void item_stat_static_sd(const coord_t * coord, void *vp);
51700+
51701+/* plugin->item.s.sd.* */
51702+extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
51703+extern int save_len_static_sd(struct inode *inode);
51704+extern int save_static_sd(struct inode *inode, char **area);
51705+
51706+/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
51707+#endif
51708+
51709+/* Make Linus happy.
51710+ Local variables:
51711+ c-indentation-style: "K&R"
51712+ mode-name: "LC"
51713+ c-basic-offset: 8
51714+ tab-width: 8
51715+ fill-column: 120
51716+ End:
51717+*/
51718diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/tail.c linux-2.6.23/fs/reiser4/plugin/item/tail.c
51719--- linux-2.6.23.orig/fs/reiser4/plugin/item/tail.c 1970-01-01 03:00:00.000000000 +0300
51720+++ linux-2.6.23/fs/reiser4/plugin/item/tail.c 2007-12-04 23:04:00.738308094 +0300
51721@@ -0,0 +1,809 @@
51722+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51723+
51724+#include "item.h"
51725+#include "../../inode.h"
51726+#include "../../page_cache.h"
51727+#include "../../carry.h"
51728+#include "../../vfs_ops.h"
51729+
51730+#include <linux/quotaops.h>
51731+#include <asm/uaccess.h>
51732+#include <linux/swap.h>
51733+#include <linux/writeback.h>
51734+
51735+/* plugin->u.item.b.max_key_inside */
51736+reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
51737+{
51738+ item_key_by_coord(coord, key);
51739+ set_key_offset(key, get_key_offset(reiser4_max_key()));
51740+ return key;
51741+}
51742+
51743+/* plugin->u.item.b.can_contain_key */
51744+int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
51745+ const reiser4_item_data *data)
51746+{
51747+ reiser4_key item_key;
51748+
51749+ if (item_plugin_by_coord(coord) != data->iplug)
51750+ return 0;
51751+
51752+ item_key_by_coord(coord, &item_key);
51753+ if (get_key_locality(key) != get_key_locality(&item_key) ||
51754+ get_key_objectid(key) != get_key_objectid(&item_key))
51755+ return 0;
51756+
51757+ return 1;
51758+}
51759+
51760+/* plugin->u.item.b.mergeable
51761+ first item is of tail type */
51762+/* Audited by: green(2002.06.14) */
51763+int mergeable_tail(const coord_t *p1, const coord_t *p2)
51764+{
51765+ reiser4_key key1, key2;
51766+
51767+ assert("vs-535", plugin_of_group(item_plugin_by_coord(p1),
51768+ UNIX_FILE_METADATA_ITEM_TYPE));
51769+ assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
51770+
51771+ if (item_id_by_coord(p2) != FORMATTING_ID) {
51772+ /* second item is of another type */
51773+ return 0;
51774+ }
51775+
51776+ item_key_by_coord(p1, &key1);
51777+ item_key_by_coord(p2, &key2);
51778+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
51779+ get_key_objectid(&key1) != get_key_objectid(&key2)
51780+ || get_key_type(&key1) != get_key_type(&key2)) {
51781+ /* items of different objects */
51782+ return 0;
51783+ }
51784+ if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
51785+ /* not adjacent items */
51786+ return 0;
51787+ }
51788+ return 1;
51789+}
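+
+/* Adjacency example: a tail item keyed at offset 0 with 100 units can merge
+   with one keyed at offset exactly 100 in the same file; offset 99 or 101
+   would fail the get_key_offset(&key1) + nr_units_tail(p1) check above. */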
51790+
51791+/* plugin->u.item.b.print
51792+ plugin->u.item.b.check */
51793+
51794+/* plugin->u.item.b.nr_units */
51795+pos_in_node_t nr_units_tail(const coord_t * coord)
51796+{
51797+ return item_length_by_coord(coord);
51798+}
51799+
51800+/* plugin->u.item.b.lookup */
51801+lookup_result
51802+lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
51803+{
51804+ reiser4_key item_key;
51805+ __u64 lookuped, offset;
51806+ unsigned nr_units;
51807+
51808+ item_key_by_coord(coord, &item_key);
51809+ offset = get_key_offset(item_key_by_coord(coord, &item_key));
51810+ nr_units = nr_units_tail(coord);
51811+
51812+ /* key we are looking for must be greater than key of item @coord */
51813+ assert("vs-416", keygt(key, &item_key));
51814+
51815+ /* offset we are looking for */
51816+ lookuped = get_key_offset(key);
51817+
51818+ if (lookuped >= offset && lookuped < offset + nr_units) {
51819+ /* byte we are looking for is in this item */
51820+ coord->unit_pos = lookuped - offset;
51821+ coord->between = AT_UNIT;
51822+ return CBK_COORD_FOUND;
51823+ }
51824+
51825+ /* set coord after last unit */
51826+ coord->unit_pos = nr_units - 1;
51827+ coord->between = AFTER_UNIT;
51828+ return bias ==
51829+ FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
51830+}
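+
+/* Lookup arithmetic example: for an item keyed at offset 4096 holding 200
+   units, a search for key offset 4150 lands inside the item and sets
+   unit_pos = 4150 - 4096 = 54; offset 4296 falls past the end, so the coord
+   is set AFTER_UNIT and the bias decides the return value. */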
51831+
51832+/* plugin->u.item.b.paste */
51833+int
51834+paste_tail(coord_t *coord, reiser4_item_data *data,
51835+ carry_plugin_info *info UNUSED_ARG)
51836+{
51837+ unsigned old_item_length;
51838+ char *item;
51839+
51840+ /* length the item had before resizing has been performed */
51841+ old_item_length = item_length_by_coord(coord) - data->length;
51842+
51843+ /* tail items never get pasted in the middle */
51844+ assert("vs-363",
51845+ (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
51846+ (coord->unit_pos == old_item_length - 1 &&
51847+ coord->between == AFTER_UNIT) ||
51848+ (coord->unit_pos == 0 && old_item_length == 0
51849+ && coord->between == AT_UNIT));
51850+
51851+ item = item_body_by_coord(coord);
51852+ if (coord->unit_pos == 0)
51853+ /* make space for pasted data when pasting at the beginning of
51854+ the item */
51855+ memmove(item + data->length, item, old_item_length);
51856+
51857+ if (coord->between == AFTER_UNIT)
51858+ coord->unit_pos++;
51859+
51860+ if (data->data) {
51861+ assert("vs-554", data->user == 0 || data->user == 1);
51862+ if (data->user) {
51863+ assert("nikita-3035", reiser4_schedulable());
51864+ /* copy from user space */
51865+ if (__copy_from_user(item + coord->unit_pos,
51866+ (const char __user *)data->data,
51867+ (unsigned)data->length))
51868+ return RETERR(-EFAULT);
51869+ } else
51870+ /* copy from kernel space */
51871+ memcpy(item + coord->unit_pos, data->data,
51872+ (unsigned)data->length);
51873+ } else {
51874+ memset(item + coord->unit_pos, 0, (unsigned)data->length);
51875+ }
51876+ return 0;
51877+}
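+
+/* Paste example: when inserting 10 bytes at the head of an item that
+   previously held 20 (the node plugin has already grown it to 30), the
+   memmove above shifts the old 20 bytes up by data->length, and the new
+   bytes are copied into units 0..9 -- from user space via
+   __copy_from_user() if data->user is set, otherwise with memcpy(). */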
51878+
51879+/* plugin->u.item.b.fast_paste */
51880+
51881+/* plugin->u.item.b.can_shift
51882+ number of units is returned via return value, number of bytes via @size. For
51883+ tail items they coincide */
51884+int
51885+can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
51886+ znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
51887+ unsigned *size, unsigned want)
51888+{
51889+	/* make sure that we do not want to shift more than we have */
51890+ assert("vs-364", want > 0
51891+ && want <= (unsigned)item_length_by_coord(source));
51892+
51893+ *size = min(want, free_space);
51894+ return *size;
51895+}
51896+
51897+/* plugin->u.item.b.copy_units */
51898+void
51899+copy_units_tail(coord_t * target, coord_t * source,
51900+ unsigned from, unsigned count,
51901+ shift_direction where_is_free_space,
51902+ unsigned free_space UNUSED_ARG)
51903+{
51904+ /* make sure that item @target is expanded already */
51905+ assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
51906+ assert("vs-370", free_space >= count);
51907+
51908+ if (where_is_free_space == SHIFT_LEFT) {
51909+		/* append the first @count bytes of @source to item @target */
51910+ assert("vs-365", from == 0);
51911+
51912+ memcpy((char *)item_body_by_coord(target) +
51913+ item_length_by_coord(target) - count,
51914+ (char *)item_body_by_coord(source), count);
51915+ } else {
51916+ /* target item is moved to right already */
51917+ reiser4_key key;
51918+
51919+ assert("vs-367",
51920+ (unsigned)item_length_by_coord(source) == from + count);
51921+
51922+ memcpy((char *)item_body_by_coord(target),
51923+ (char *)item_body_by_coord(source) + from, count);
51924+
51925+ /* new units are inserted before first unit in an item,
51926+ therefore, we have to update item key */
51927+ item_key_by_coord(source, &key);
51928+ set_key_offset(&key, get_key_offset(&key) + from);
51929+
51930+ node_plugin_by_node(target->node)->update_item_key(target, &key,
51931+ NULL /*info */);
51932+ }
51933+}
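+
+/* Shift example: with a 30-byte source item keyed at offset 0, shifting its
+   last 10 bytes right (from == 20, count == 10) copies bytes 20..29 to the
+   head of @target and updates the target item key to offset 0 + 20 = 20. */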
51934+
51935+/* plugin->u.item.b.create_hook */
51936+
51937+/* item_plugin->b.kill_hook
51938+ this is called when @count units starting from @from-th one are going to be removed
51939+ */
51940+int
51941+kill_hook_tail(const coord_t * coord, pos_in_node_t from,
51942+ pos_in_node_t count, struct carry_kill_data *kdata)
51943+{
51944+ reiser4_key key;
51945+ loff_t start, end;
51946+
51947+ assert("vs-1577", kdata);
51948+ assert("vs-1579", kdata->inode);
51949+
51950+ item_key_by_coord(coord, &key);
51951+ start = get_key_offset(&key) + from;
51952+ end = start + count;
51953+ fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
51954+ return 0;
51955+}
51956+
51957+/* plugin->u.item.b.shift_hook */
51958+
51959+/* helper for kill_units_tail and cut_units_tail */
51960+static int
51961+do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
51962+ reiser4_key * smallest_removed, reiser4_key * new_first)
51963+{
51964+ pos_in_node_t count;
51965+
51966+ /* this method is only called to remove part of item */
51967+ assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
51968+	/* tail items are never cut from the middle of an item */
51969+ assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
51970+ assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
51971+
51972+ count = to - from + 1;
51973+
51974+ if (smallest_removed) {
51975+ /* store smallest key removed */
51976+ item_key_by_coord(coord, smallest_removed);
51977+ set_key_offset(smallest_removed,
51978+ get_key_offset(smallest_removed) + from);
51979+ }
51980+ if (new_first) {
51981+ /* head of item is cut */
51982+ assert("vs-1529", from == 0);
51983+
51984+ item_key_by_coord(coord, new_first);
51985+ set_key_offset(new_first,
51986+ get_key_offset(new_first) + from + count);
51987+ }
51988+
51989+ if (REISER4_DEBUG)
51990+ memset((char *)item_body_by_coord(coord) + from, 0, count);
51991+ return count;
51992+}
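+
+/* Cut example: removing units 0..9 from an item keyed at offset 100 reports
+   smallest_removed at offset 100 + 0, sets new_first to offset
+   100 + 0 + 10 = 110 (the key the shortened item will carry), and returns
+   count == 10. */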
51993+
51994+/* plugin->u.item.b.cut_units */
51995+int
51996+cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
51997+ struct carry_cut_data *cdata UNUSED_ARG,
51998+ reiser4_key * smallest_removed, reiser4_key * new_first)
51999+{
52000+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
52001+}
52002+
52003+/* plugin->u.item.b.kill_units */
52004+int
52005+kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52006+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
52007+ reiser4_key * new_first)
52008+{
52009+ kill_hook_tail(coord, from, to - from + 1, kdata);
52010+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
52011+}
52012+
52013+/* plugin->u.item.b.unit_key */
52014+reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
52015+{
52016+ assert("vs-375", coord_is_existing_unit(coord));
52017+
52018+ item_key_by_coord(coord, key);
52019+ set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
52020+
52021+ return key;
52022+}
52023+
52024+/* plugin->u.item.b.estimate
52025+ plugin->u.item.b.item_data_by_flow */
52026+
52027+/* tail readpage function. It is called from readpage_tail(). */
52028+static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
52029+{
52030+ tap_t tap;
52031+ int result;
52032+ coord_t coord;
52033+ lock_handle lh;
52034+ int count, mapped;
52035+ struct inode *inode;
52036+ char *pagedata;
52037+
52038+ /* save the passed coord so that the tap does not move it */
52039+ init_lh(&lh);
52040+ copy_lh(&lh, uf_coord->lh);
52041+ inode = page->mapping->host;
52042+ coord_dup(&coord, &uf_coord->coord);
52043+
52044+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
52045+
52046+ if ((result = reiser4_tap_load(&tap)))
52047+ goto out_tap_done;
52048+
52049+ /* lookup until page is filled up. */
52050+ for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
52051+ /* number of bytes to be copied to page */
52052+ count = item_length_by_coord(&coord) - coord.unit_pos;
52053+ if (count > PAGE_CACHE_SIZE - mapped)
52054+ count = PAGE_CACHE_SIZE - mapped;
52055+
52056+ /* map @page into kernel address space and get data address */
52057+ pagedata = kmap_atomic(page, KM_USER0);
52058+
52059+ /* copy tail item to page */
52060+ memcpy(pagedata + mapped,
52061+ ((char *)item_body_by_coord(&coord) + coord.unit_pos),
52062+ count);
52063+ mapped += count;
52064+
52065+ flush_dcache_page(page);
52066+
52067+ /* unmap page from kernel address space */
52068+ kunmap_atomic(pagedata, KM_USER0);
52069+
52070+ /* Getting next tail item. */
52071+ if (mapped < PAGE_CACHE_SIZE) {
52072+ /*
52073+ * unlock the page to avoid keeping it locked
52074+ * during tree lookup, which takes long-term locks
52075+ */
52076+ unlock_page(page);
52077+
52078+ /* getting right neighbour. */
52079+ result = go_dir_el(&tap, RIGHT_SIDE, 0);
52080+
52081+ /* lock page back */
52082+ lock_page(page);
52083+ if (PageUptodate(page)) {
52084+ /*
52085+ * another thread read the page, we have
52086+ * nothing to do
52087+ */
52088+ result = 0;
52089+ goto out_unlock_page;
52090+ }
52091+
52092+ if (result) {
52093+ if (result == -E_NO_NEIGHBOR) {
52094+ /*
52095+ * right neighbor is not a formatted
52096+ * node
52097+ */
52098+ result = 0;
52099+ goto done;
52100+ } else {
52101+ goto out_tap_relse;
52102+ }
52103+ } else {
52104+ if (!inode_file_plugin(inode)->
52105+ owns_item(inode, &coord)) {
52106+ /* item of another file is found */
52107+ result = 0;
52108+ goto done;
52109+ }
52110+ }
52111+ }
52112+ }
52113+
52114+ done:
52115+ if (mapped != PAGE_CACHE_SIZE)
52116+ zero_user_page(page, mapped, PAGE_CACHE_SIZE - mapped,
52117+ KM_USER0);
52118+ SetPageUptodate(page);
52119+ out_unlock_page:
52120+ unlock_page(page);
52121+ out_tap_relse:
52122+ reiser4_tap_relse(&tap);
52123+ out_tap_done:
52124+ reiser4_tap_done(&tap);
52125+ return result;
52126+}
52127+
52128+/*
52129+ plugin->s.file.readpage
52130+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
52131+ or
52132+ filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_tail
52133+
52134+ At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to an existing unit inside a tail
52135+ item. */
52136+int readpage_tail(void *vp, struct page *page)
52137+{
52138+ uf_coord_t *uf_coord = vp;
52139+ ON_DEBUG(coord_t * coord = &uf_coord->coord);
52140+ ON_DEBUG(reiser4_key key);
52141+
52142+ assert("umka-2515", PageLocked(page));
52143+ assert("umka-2516", !PageUptodate(page));
52144+ assert("umka-2517", !jprivate(page) && !PagePrivate(page));
52145+ assert("umka-2518", page->mapping && page->mapping->host);
52146+
52147+ assert("umka-2519", znode_is_loaded(coord->node));
52148+ assert("umka-2520", item_is_tail(coord));
52149+ assert("umka-2521", coord_is_existing_unit(coord));
52150+ assert("umka-2522", znode_is_rlocked(coord->node));
52151+ assert("umka-2523",
52152+ page->mapping->host->i_ino ==
52153+ get_key_objectid(item_key_by_coord(coord, &key)));
52154+
52155+ return do_readpage_tail(uf_coord, page);
52156+}
52157+
52158+/**
52159+ * overwrite_tail
52160+ * @flow:
52161+ * @coord:
52162+ *
52163+ * Overwrites tail item or its part by user data. Returns number of bytes
52164+ * written or error code.
52165+ */
52166+static int overwrite_tail(flow_t *flow, coord_t *coord)
52167+{
52168+ unsigned count;
52169+
52170+ assert("vs-570", flow->user == 1);
52171+ assert("vs-946", flow->data);
52172+ assert("vs-947", coord_is_existing_unit(coord));
52173+ assert("vs-948", znode_is_write_locked(coord->node));
52174+ assert("nikita-3036", reiser4_schedulable());
52175+
52176+ count = item_length_by_coord(coord) - coord->unit_pos;
52177+ if (count > flow->length)
52178+ count = flow->length;
52179+
52180+ if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
52181+ (const char __user *)flow->data, count))
52182+ return RETERR(-EFAULT);
52183+
52184+ znode_make_dirty(coord->node);
52185+ return count;
52186+}
52187+
52188+/**
52189+ * insert_first_tail
52190+ * @inode:
52191+ * @flow:
52192+ * @coord:
52193+ * @lh:
52194+ *
52195+ * Returns number of bytes written or error code.
52196+ */
52197+static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
52198+ coord_t *coord, lock_handle *lh)
52199+{
52200+ int result;
52201+ loff_t to_write;
52202+ struct unix_file_info *uf_info;
52203+
52204+ if (get_key_offset(&flow->key) != 0) {
52205+ /*
52206+ * the file is empty and we are writing at a nonzero offset, so
52207+ * create a hole at the beginning of the file. On success,
52208+ * reiser4_insert_flow returns 0 as the number of bytes written,
52209+ * which is what we have to return when padding a file with holes
52210+ */
52211+ flow->data = NULL;
52212+ flow->length = get_key_offset(&flow->key);
52213+ set_key_offset(&flow->key, 0);
52214+ /*
52215+ * holes in files built of tails are stored as if they were
52216+ * real data consisting entirely of zeros. Therefore we have
52217+ * to allocate quota here as well
52218+ */
52219+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52220+ return RETERR(-EDQUOT);
52221+ result = reiser4_insert_flow(coord, lh, flow);
52222+ if (flow->length)
52223+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52224+
52225+ uf_info = unix_file_inode_data(inode);
52226+
52227+ /*
52228+ * first item insertion is only possible when writing to empty
52229+ * file or performing tail conversion
52230+ */
52231+ assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
52232+ (reiser4_inode_get_flag(inode,
52233+ REISER4_PART_MIXED) &&
52234+ reiser4_inode_get_flag(inode,
52235+ REISER4_PART_IN_CONV))));
52236+ /* if file was empty - update its state */
52237+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
52238+ uf_info->container = UF_CONTAINER_TAILS;
52239+ return result;
52240+ }
52241+
52242+ /* check quota before appending data */
52243+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52244+ return RETERR(-EDQUOT);
52245+
52246+ to_write = flow->length;
52247+ result = reiser4_insert_flow(coord, lh, flow);
52248+ if (flow->length)
52249+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52250+ return (to_write - flow->length) ? (to_write - flow->length) : result;
52251+}
52252+
52253+/**
52254+ * append_tail
52255+ * @inode:
52256+ * @flow:
52257+ * @coord:
52258+ * @lh:
52259+ *
52260+ * Returns number of bytes written or error code.
52261+ */
52262+static ssize_t append_tail(struct inode *inode,
52263+ flow_t *flow, coord_t *coord, lock_handle *lh)
52264+{
52265+ int result;
52266+ reiser4_key append_key;
52267+ loff_t to_write;
52268+
52269+ if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
52270+ flow->data = NULL;
52271+ flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
52272+ set_key_offset(&flow->key, get_key_offset(&append_key));
52273+ /*
52274+ * holes in files built of tails are stored as if they were
52275+ * real data consisting entirely of zeros. Therefore we have
52276+ * to allocate quota here as well
52277+ */
52278+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52279+ return RETERR(-EDQUOT);
52280+ result = reiser4_insert_flow(coord, lh, flow);
52281+ if (flow->length)
52282+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52283+ return result;
52284+ }
52285+
52286+ /* check quota before appending data */
52287+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52288+ return RETERR(-EDQUOT);
52289+
52290+ to_write = flow->length;
52291+ result = reiser4_insert_flow(coord, lh, flow);
52292+ if (flow->length)
52293+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52294+ return (to_write - flow->length) ? (to_write - flow->length) : result;
52295+}
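+/*
+ * A minimal stand-alone sketch (made-up names; not the reiser4 or DQUOT
+ * API) of the quota pattern used by insert_first_tail() and append_tail()
+ * above: reserve space for the whole flow up front, attempt the insertion,
+ * then refund the part of the flow that was not consumed.
+ */
+#include <stdio.h>
+
+static long quota_used, quota_limit = 1000;
+
+static int quota_alloc(long n)
+{
+	if (quota_used + n > quota_limit)
+		return -1;		/* plays the role of -EDQUOT above */
+	quota_used += n;
+	return 0;
+}
+
+static int write_flow(long *flow_length)
+{
+	long want = *flow_length;
+
+	if (quota_alloc(want))
+		return -1;
+	*flow_length = want / 2;	/* pretend insertion left half over */
+	if (*flow_length)
+		quota_used -= *flow_length;	/* refund the unwritten part */
+	return 0;
+}
+
+int main(void)
+{
+	long len = 600;
+
+	write_flow(&len);
+	printf("used %ld of %ld\n", quota_used, quota_limit);	/* used 300 */
+	return 0;
+}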
52296+
52297+/**
52298+ * write_extent_reserve_space - reserve space for tail write operation
52299+ * @inode:
52300+ *
52301+ * Estimates and reserves space which may be required for writing one flow to a
52302+ * file
52303+ */
52304+static int write_extent_reserve_space(struct inode *inode)
52305+{
52306+ __u64 count;
52307+ reiser4_tree *tree;
52308+
52309+ /*
52310+ * to write one flow to a file built of tails we have to reserve disk
52311+ * space for:
52312+ * 1. find_file_item may have to insert an empty node into the tree (an
52313+ * empty leaf node between two extent items). This requires 1 block plus
52314+ * the number of blocks necessary to insert an internal item into the
52315+ * twig level.
52316+ *
52317+ * 2. flow insertion
52318+ *
52319+ * 3. stat data update
52320+ */
52321+ tree = reiser4_tree_by_inode(inode);
52322+ count = estimate_one_insert_item(tree) +
52323+ estimate_insert_flow(tree->height) +
52324+ estimate_one_insert_item(tree);
52325+ grab_space_enable();
52326+ return reiser4_grab_space(count, 0 /* flags */);
52327+}
52328+
52329+#define PAGE_PER_FLOW 4
52330+
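+/*
+ * fault in up to PAGE_PER_FLOW pages of the user buffer, one page at a
+ * time, so that copying from it later, while long-term znode locks are
+ * held, does not trigger a major page fault; returns the number of bytes
+ * faulted in, which becomes the length of the flow below.
+ */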
52331+static loff_t faultin_user_pages(const char __user *buf, size_t count)
52332+{
52333+ loff_t faulted;
52334+ int to_fault;
52335+
52336+ if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
52337+ count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
52338+ faulted = 0;
52339+ while (count > 0) {
52340+ to_fault = PAGE_CACHE_SIZE;
52341+ if (count < to_fault)
52342+ to_fault = count;
52343+ fault_in_pages_readable(buf + faulted, to_fault);
52344+ count -= to_fault;
52345+ faulted += to_fault;
52346+ }
52347+ return faulted;
52348+}
52349+
52350+/**
52351+ * reiser4_write_tail - write method of tail item plugin
52352+ * @file: file to write to
52353+ * @buf: address of user-space buffer
52354+ * @count: number of bytes to write
52355+ * @pos: position in file to write to
52356+ *
52357+ * Returns number of written bytes or error code.
52358+ */
52359+ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
52360+ size_t count, loff_t *pos)
52361+{
52362+ struct inode *inode;
52363+ struct hint hint;
52364+ int result;
52365+ flow_t flow;
52366+ coord_t *coord;
52367+ lock_handle *lh;
52368+ znode *loaded;
52369+
52370+ inode = file->f_dentry->d_inode;
52371+
52372+ if (write_extent_reserve_space(inode))
52373+ return RETERR(-ENOSPC);
52374+
52375+ result = load_file_hint(file, &hint);
52376+ BUG_ON(result != 0);
52377+
52378+ flow.length = faultin_user_pages(buf, count);
52379+ flow.user = 1;
52380+ memcpy(&flow.data, &buf, sizeof(buf));
52381+ flow.op = WRITE_OP;
52382+ key_by_inode_and_offset_common(inode, *pos, &flow.key);
52383+
52384+ result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
52385+ if (IS_CBKERR(result))
52386+ return result;
52387+
52388+ coord = &hint.ext_coord.coord;
52389+ lh = hint.ext_coord.lh;
52390+
52391+ result = zload(coord->node);
52392+ BUG_ON(result != 0);
52393+ loaded = coord->node;
52394+
52395+ if (coord->between == AFTER_UNIT) {
52396+ /* append with data or hole */
52397+ result = append_tail(inode, &flow, coord, lh);
52398+ } else if (coord->between == AT_UNIT) {
52399+ /* overwrite */
52400+ result = overwrite_tail(&flow, coord);
52401+ } else {
52402+ /* no items of this file yet. insert data or hole */
52403+ result = insert_first_tail(inode, &flow, coord, lh);
52404+ }
52405+ zrelse(loaded);
52406+ if (result < 0) {
52407+ done_lh(lh);
52408+ return result;
52409+ }
52410+
52411+ /* ext_coord.valid is cleared just below, so the hint is always unset (never sealed) and the znode unlocked */
52412+ hint.ext_coord.valid = 0;
52413+ if (hint.ext_coord.valid)
52414+ reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK);
52415+ else
52416+ reiser4_unset_hint(&hint);
52417+
52418+ save_file_hint(file, &hint);
52419+ return result;
52420+}
52421+
52422+#if REISER4_DEBUG
52423+
52424+static int
52425+coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
52426+{
52427+ reiser4_key item_key;
52428+
52429+ assert("vs-1356", coord_is_existing_unit(coord));
52430+ assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
52431+ assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
52432+ return get_key_offset(key) ==
52433+ get_key_offset(&item_key) + coord->unit_pos;
52434+
52435+}
52436+
52437+#endif
52438+
52439+/* plugin->u.item.s.file.read */
52440+int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
52441+{
52442+ unsigned count;
52443+ int item_length;
52444+ coord_t *coord;
52445+ uf_coord_t *uf_coord;
52446+
52447+ uf_coord = &hint->ext_coord;
52448+ coord = &uf_coord->coord;
52449+
52450+ assert("vs-571", f->user == 1);
52451+ assert("vs-571", f->data);
52452+ assert("vs-967", coord && coord->node);
52453+ assert("vs-1117", znode_is_rlocked(coord->node));
52454+ assert("vs-1118", znode_is_loaded(coord->node));
52455+
52456+ assert("nikita-3037", reiser4_schedulable());
52457+ assert("vs-1357", coord_matches_key_tail(coord, &f->key));
52458+
52459+ /* calculate number of bytes to read off the item */
52460+ item_length = item_length_by_coord(coord);
52461+ count = item_length - coord->unit_pos;
52462+ if (count > f->length)
52463+ count = f->length;
52464+
52465+ /* user page has to be brought in so that major page fault does not
52466+ * occur here while a long-term lock is held */
52467+ if (__copy_to_user((char __user *)f->data,
52468+ ((char *)item_body_by_coord(coord) + coord->unit_pos),
52469+ count))
52470+ return RETERR(-EFAULT);
52471+
52472+ /* probably mark_page_accessed() should only be called if
52473+ * coord->unit_pos is zero. */
52474+ mark_page_accessed(znode_page(coord->node));
52475+ move_flow_forward(f, count);
52476+
52477+ coord->unit_pos += count;
52478+ if (item_length == coord->unit_pos) {
52479+ coord->unit_pos--;
52480+ coord->between = AFTER_UNIT;
52481+ }
52482+ reiser4_set_hint(hint, &f->key, ZNODE_READ_LOCK);
52483+ return 0;
52484+}
52485+
52486+/*
52487+ plugin->u.item.s.file.append_key
52488+ key of the first byte right after the last byte addressed by this item
52489+*/
52490+reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
52491+{
52492+ item_key_by_coord(coord, key);
52493+ set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
52494+ return key;
52495+}
52496+
52497+/* plugin->u.item.s.file.init_coord_extension */
52498+void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
52499+{
52500+ uf_coord->valid = 1;
52501+}
52502+
52503+/*
52504+ plugin->u.item.s.file.get_block
52505+*/
52506+int
52507+get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
52508+{
52509+ assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
52510+
52511+ if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
52512+ /* if the node hasn't obtained its block number yet, return 0.
52513+ * Let's avoid upsetting users with cosmic numbers beyond
52514+ * the device capacity. */
52515+ *block = 0;
52516+ else
52517+ *block = *znode_get_block(coord->node);
52518+ return 0;
52519+}
52520+
52521+/*
52522+ * Local variables:
52523+ * c-indentation-style: "K&R"
52524+ * mode-name: "LC"
52525+ * c-basic-offset: 8
52526+ * tab-width: 8
52527+ * fill-column: 79
52528+ * scroll-step: 1
52529+ * End:
52530+ */
52531diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/tail.h linux-2.6.23/fs/reiser4/plugin/item/tail.h
52532--- linux-2.6.23.orig/fs/reiser4/plugin/item/tail.h 1970-01-01 03:00:00.000000000 +0300
52533+++ linux-2.6.23/fs/reiser4/plugin/item/tail.h 2007-12-04 16:49:30.000000000 +0300
52534@@ -0,0 +1,58 @@
52535+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52536+
52537+#if !defined( __REISER4_TAIL_H__ )
52538+#define __REISER4_TAIL_H__
52539+
52540+struct tail_coord_extension {
52541+ int not_used;
52542+};
52543+
52544+struct cut_list;
52545+
52546+/* plugin->u.item.b.* */
52547+reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
52548+int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
52549+ const reiser4_item_data *);
52550+int mergeable_tail(const coord_t * p1, const coord_t * p2);
52551+pos_in_node_t nr_units_tail(const coord_t *);
52552+lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
52553+int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
52554+int can_shift_tail(unsigned free_space, coord_t * source,
52555+ znode * target, shift_direction, unsigned *size,
52556+ unsigned want);
52557+void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
52558+ unsigned count, shift_direction, unsigned free_space);
52559+int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
52560+ struct carry_kill_data *);
52561+int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
52562+ struct carry_cut_data *, reiser4_key * smallest_removed,
52563+ reiser4_key * new_first);
52564+int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
52565+ struct carry_kill_data *, reiser4_key * smallest_removed,
52566+ reiser4_key * new_first);
52567+reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
52568+
52569+/* plugin->u.item.s.* */
52570+ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
52571+ size_t count, loff_t *pos);
52572+int reiser4_read_tail(struct file *, flow_t *, hint_t *);
52573+int readpage_tail(void *vp, struct page *page);
52574+reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
52575+void init_coord_extension_tail(uf_coord_t *, loff_t offset);
52576+int get_block_address_tail(const coord_t *, sector_t, sector_t *);
52577+int item_balance_dirty_pages(struct address_space *, const flow_t *,
52578+ hint_t *, int back_to_dirty, int set_hint);
52579+
52580+/* __REISER4_TAIL_H__ */
52581+#endif
52582+
52583+/* Make Linus happy.
52584+ Local variables:
52585+ c-indentation-style: "K&R"
52586+ mode-name: "LC"
52587+ c-basic-offset: 8
52588+ tab-width: 8
52589+ fill-column: 120
52590+ scroll-step: 1
52591+ End:
52592+*/
52593diff -urN linux-2.6.23.orig/fs/reiser4/plugin/Makefile linux-2.6.23/fs/reiser4/plugin/Makefile
52594--- linux-2.6.23.orig/fs/reiser4/plugin/Makefile 1970-01-01 03:00:00.000000000 +0300
52595+++ linux-2.6.23/fs/reiser4/plugin/Makefile 2007-12-04 16:49:30.000000000 +0300
52596@@ -0,0 +1,26 @@
52597+obj-$(CONFIG_REISER4_FS) += plugins.o
52598+
52599+plugins-objs := \
52600+ plugin.o \
52601+ plugin_set.o \
52602+ object.o \
52603+ inode_ops.o \
52604+ inode_ops_rename.o \
52605+ file_ops.o \
52606+ file_ops_readdir.o \
52607+ file_plugin_common.o \
52608+ dir_plugin_common.o \
52609+ digest.o \
52610+ hash.o \
52611+ fibration.o \
52612+ tail_policy.o \
52613+ regular.o
52614+
52615+obj-$(CONFIG_REISER4_FS) += item/
52616+obj-$(CONFIG_REISER4_FS) += file/
52617+obj-$(CONFIG_REISER4_FS) += dir/
52618+obj-$(CONFIG_REISER4_FS) += node/
52619+obj-$(CONFIG_REISER4_FS) += compress/
52620+obj-$(CONFIG_REISER4_FS) += space/
52621+obj-$(CONFIG_REISER4_FS) += disk_format/
52622+obj-$(CONFIG_REISER4_FS) += security/
52623diff -urN linux-2.6.23.orig/fs/reiser4/plugin/node/Makefile linux-2.6.23/fs/reiser4/plugin/node/Makefile
52624--- linux-2.6.23.orig/fs/reiser4/plugin/node/Makefile 1970-01-01 03:00:00.000000000 +0300
52625+++ linux-2.6.23/fs/reiser4/plugin/node/Makefile 2007-12-04 16:49:30.000000000 +0300
52626@@ -0,0 +1,5 @@
52627+obj-$(CONFIG_REISER4_FS) += node_plugins.o
52628+
52629+node_plugins-objs := \
52630+ node.o \
52631+ node40.o
52632diff -urN linux-2.6.23.orig/fs/reiser4/plugin/node/node40.c linux-2.6.23/fs/reiser4/plugin/node/node40.c
52633--- linux-2.6.23.orig/fs/reiser4/plugin/node/node40.c 1970-01-01 03:00:00.000000000 +0300
52634+++ linux-2.6.23/fs/reiser4/plugin/node/node40.c 2007-12-04 16:49:30.000000000 +0300
52635@@ -0,0 +1,2924 @@
52636+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52637+
52638+#include "../../debug.h"
52639+#include "../../key.h"
52640+#include "../../coord.h"
52641+#include "../plugin_header.h"
52642+#include "../item/item.h"
52643+#include "node.h"
52644+#include "node40.h"
52645+#include "../plugin.h"
52646+#include "../../jnode.h"
52647+#include "../../znode.h"
52648+#include "../../pool.h"
52649+#include "../../carry.h"
52650+#include "../../tap.h"
52651+#include "../../tree.h"
52652+#include "../../super.h"
52653+#include "../../reiser4.h"
52654+
52655+#include <asm/uaccess.h>
52656+#include <linux/types.h>
52657+#include <linux/prefetch.h>
52658+
52659+/* leaf 40 format:
52660+  (node header fields on the left, per-item item_head fields on the right)
52661+ [ node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .., item_head 1, item_head 0 ]
52662+ plugin_id (16)         key
52663+ free_space (16)        plugin_id (16)
52664+ free_space_start (16)  offset (16)
52665+ level (8)
52666+ num_items (16)
52667+ magic (32)
52668+ flush_time (32)
52669+*/
52670+/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". */
52671+/* magic number that is stored in ->magic field of node header */
52672+static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */
52673+
52674+static int prepare_for_update(znode * left, znode * right,
52675+ carry_plugin_info * info);
52676+
52677+/* header of node of reiser40 format is at the beginning of node */
52678+static inline node40_header *node40_node_header(const znode * node /* node to
52679+ * query */ )
52680+{
52681+ assert("nikita-567", node != NULL);
52682+ assert("nikita-568", znode_page(node) != NULL);
52683+ assert("nikita-569", zdata(node) != NULL);
52684+ return (node40_header *) zdata(node);
52685+}
52686+
52687+/* functions to get/set fields of node40_header */
52688+#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
52689+#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
52690+#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
52691+#define nh40_get_level(nh) get_unaligned(&(nh)->level)
52692+#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
52693+#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
52694+
52695+#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
52696+#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
52697+#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
52698+#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
52699+#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
52700+#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
52701+
52702+/* plugin field of node header should be read/set by
52703+ plugin_by_disk_id/save_disk_plugin */
52704+
52705+/* array of item headers is at the end of node */
52706+static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
52707+{
52708+ return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
52709+}
52710+
52711+/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1
52712+ */
52713+static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
52714+{
52715+ return (item_header40 *) (zdata(coord->node) +
52716+ znode_size(coord->node)) - (coord->item_pos) -
52717+ 1;
52718+}
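+/*
+ * A stand-alone sketch (illustration only, plain C) of the reverse-indexed
+ * header array used above: item headers live at the very end of the node
+ * and grow from right to left, so header @pos starts (pos + 1) headers
+ * before the end of the node.
+ */
+#include <stdio.h>
+
+struct hdr { unsigned short offset; };
+
+static struct hdr *ih_at(char *node, unsigned size, unsigned pos)
+{
+	return (struct hdr *)(node + size) - pos - 1;
+}
+
+int main(void)
+{
+	char node[4096];
+
+	printf("%td %td\n",
+	       (char *)ih_at(node, sizeof node, 0) - node,
+	       (char *)ih_at(node, sizeof node, 1) - node);
+	return 0;	/* prints "4094 4092" for the 2-byte hdr */
+}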
52719+
52720+/* functions to get/set fields of item_header40 */
52721+#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
52722+
52723+#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
52724+
52725+/* plugin field of item header should be read/set by
52726+ plugin_by_disk_id/save_disk_plugin */
52727+
52728+/* plugin methods */
52729+
52730+/* plugin->u.node.item_overhead
52731+ look for description of this method in plugin/node/node.h */
52732+size_t
52733+item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
52734+{
52735+ return sizeof(item_header40);
52736+}
52737+
52738+/* plugin->u.node.free_space
52739+ look for description of this method in plugin/node/node.h */
52740+size_t free_space_node40(znode * node)
52741+{
52742+ assert("nikita-577", node != NULL);
52743+ assert("nikita-578", znode_is_loaded(node));
52744+ assert("nikita-579", zdata(node) != NULL);
52745+
52746+ return nh40_get_free_space(node40_node_header(node));
52747+}
52748+
52749+/* private inline version of node40_num_of_items() for use in this file. This
52750+ is necessary because the address of node40_num_of_items() is taken, so it
52751+ never gets inlined. */
52752+static inline short node40_num_of_items_internal(const znode * node)
52753+{
52754+ return nh40_get_num_items(node40_node_header(node));
52755+}
52756+
52757+#if REISER4_DEBUG
52758+static inline void check_num_items(const znode * node)
52759+{
52760+ assert("nikita-2749",
52761+ node40_num_of_items_internal(node) == node->nr_items);
52762+ assert("nikita-2746", znode_is_write_locked(node));
52763+}
52764+#else
52765+#define check_num_items(node) noop
52766+#endif
52767+
52768+/* plugin->u.node.num_of_items
52769+ look for description of this method in plugin/node/node.h */
52770+int num_of_items_node40(const znode * node)
52771+{
52772+ return node40_num_of_items_internal(node);
52773+}
52774+
52775+static void
52776+node40_set_num_items(znode * node, node40_header * nh, unsigned value)
52777+{
52778+ assert("nikita-2751", node != NULL);
52779+ assert("nikita-2750", nh == node40_node_header(node));
52780+
52781+ check_num_items(node);
52782+ nh40_set_num_items(nh, value);
52783+ node->nr_items = value;
52784+ check_num_items(node);
52785+}
52786+
52787+/* plugin->u.node.item_by_coord
52788+ look for description of this method in plugin/node/node.h */
52789+char *item_by_coord_node40(const coord_t * coord)
52790+{
52791+ item_header40 *ih;
52792+ char *p;
52793+
52794+ /* @coord is set to existing item */
52795+ assert("nikita-596", coord != NULL);
52796+ assert("vs-255", coord_is_existing_item(coord));
52797+
52798+ ih = node40_ih_at_coord(coord);
52799+ p = zdata(coord->node) + ih40_get_offset(ih);
52800+ return p;
52801+}
52802+
52803+/* plugin->u.node.length_by_coord
52804+ look for description of this method in plugin/node/node.h */
52805+int length_by_coord_node40(const coord_t * coord)
52806+{
52807+ item_header40 *ih;
52808+ int result;
52809+
52810+ /* @coord is set to existing item */
52811+ assert("vs-256", coord != NULL);
52812+ assert("vs-257", coord_is_existing_item(coord));
52813+
52814+ ih = node40_ih_at_coord(coord);
52815+ if ((int)coord->item_pos ==
52816+ node40_num_of_items_internal(coord->node) - 1)
52817+ result =
52818+ nh40_get_free_space_start(node40_node_header(coord->node)) -
52819+ ih40_get_offset(ih);
52820+ else
52821+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
52822+
52823+ return result;
52824+}
52825+
52826+static pos_in_node_t
52827+node40_item_length(const znode * node, pos_in_node_t item_pos)
52828+{
52829+ item_header40 *ih;
52830+ pos_in_node_t result;
52831+
52832+ /* @coord is set to existing item */
52833+ assert("vs-256", node != NULL);
52834+ assert("vs-257", node40_num_of_items_internal(node) > item_pos);
52835+
52836+ ih = node40_ih_at(node, item_pos);
52837+ if (item_pos == node40_num_of_items_internal(node) - 1)
52838+ result =
52839+ nh40_get_free_space_start(node40_node_header(node)) -
52840+ ih40_get_offset(ih);
52841+ else
52842+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
52843+
52844+ return result;
52845+}
52846+
52847+/* plugin->u.node.plugin_by_coord
52848+ look for description of this method in plugin/node/node.h */
52849+item_plugin *plugin_by_coord_node40(const coord_t * coord)
52850+{
52851+ item_header40 *ih;
52852+ item_plugin *result;
52853+
52854+ /* @coord is set to existing item */
52855+ assert("vs-258", coord != NULL);
52856+ assert("vs-259", coord_is_existing_item(coord));
52857+
52858+ ih = node40_ih_at_coord(coord);
52859+ /* pass NULL instead of the current tree. This is a time-critical call. */
52860+ result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
52861+ return result;
52862+}
52863+
52864+/* plugin->u.node.key_at
52865+ look for description of this method in plugin/node/node.h */
52866+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
52867+{
52868+ item_header40 *ih;
52869+
52870+ assert("nikita-1765", coord_is_existing_item(coord));
52871+
52872+ /* @coord is set to existing item */
52873+ ih = node40_ih_at_coord(coord);
52874+ memcpy(key, &ih->key, sizeof(reiser4_key));
52875+ return key;
52876+}
52877+
52878+/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
52879+
52880+#define NODE_INCSTAT(n, counter) \
52881+ reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
52882+
52883+#define NODE_ADDSTAT(n, counter, val) \
52884+ reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
52885+
52886+/* plugin->u.node.lookup
52887+ look for description of this method in plugin/node/node.h */
52888+node_search_result lookup_node40(znode * node /* node to query */ ,
52889+ const reiser4_key * key /* key to look for */ ,
52890+ lookup_bias bias /* search bias */ ,
52891+ coord_t * coord /* resulting coord */ )
52892+{
52893+ int left;
52894+ int right;
52895+ int found;
52896+ int items;
52897+
52898+ item_header40 *lefth;
52899+ item_header40 *righth;
52900+
52901+ item_plugin *iplug;
52902+ item_header40 *bstop;
52903+ item_header40 *ih;
52904+ cmp_t order;
52905+
52906+ assert("nikita-583", node != NULL);
52907+ assert("nikita-584", key != NULL);
52908+ assert("nikita-585", coord != NULL);
52909+ assert("nikita-2693", znode_is_any_locked(node));
52910+ cassert(REISER4_SEQ_SEARCH_BREAK > 2);
52911+
52912+ items = node_num_items(node);
52913+
52914+ if (unlikely(items == 0)) {
52915+ coord_init_first_unit(coord, node);
52916+ return NS_NOT_FOUND;
52917+ }
52918+
52919+ /* binary search for item that can contain given key */
52920+ left = 0;
52921+ right = items - 1;
52922+ coord->node = node;
52923+ coord_clear_iplug(coord);
52924+ found = 0;
52925+
52926+ lefth = node40_ih_at(node, left);
52927+ righth = node40_ih_at(node, right);
52928+
52929+ /* It is known that for small arrays sequential search is on average
52930+ more efficient than binary. This is because sequential search is
52931+ coded as tight loop that can be better optimized by compilers and
52932+ for small array size gain from this optimization makes sequential
52933+ search the winner. Another, maybe more important, reason
52934+ is that a sequential scan is more CPU-cache friendly, whereas binary
52935+ search effectively destroys CPU caching.
52936+
52937+ Critical here is the notion of "smallness". Reasonable value of
52938+ REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
52939+ fs/reiser4/ulevel/ulevel.c:test_search().
52940+
52941+ Don't try to further optimize the sequential search by scanning from
52942+ right to left in an attempt to use a more efficient loop termination
52943+ condition (comparison with 0). This doesn't work. A stand-alone
52944+ sketch of this hybrid search follows lookup_node40() below.
52945+ */
52946+
52947+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
52948+ int median;
52949+ item_header40 *medianh;
52950+
52951+ median = (left + right) / 2;
52952+ medianh = node40_ih_at(node, median);
52953+
52954+ assert("nikita-1084", median >= 0);
52955+ assert("nikita-1085", median < items);
52956+ switch (keycmp(key, &medianh->key)) {
52957+ case LESS_THAN:
52958+ right = median;
52959+ righth = medianh;
52960+ break;
52961+ default:
52962+ wrong_return_value("nikita-586", "keycmp");
52963+ case GREATER_THAN:
52964+ left = median;
52965+ lefth = medianh;
52966+ break;
52967+ case EQUAL_TO:
52968+ do {
52969+ --median;
52970+ /* headers are ordered from right to left */
52971+ ++medianh;
52972+ } while (median >= 0 && keyeq(key, &medianh->key));
52973+ right = left = median + 1;
52974+ ih = lefth = righth = medianh - 1;
52975+ found = 1;
52976+ break;
52977+ }
52978+ }
52979+ /* sequential scan. Item headers, and, therefore, keys are stored at
52980+ the rightmost part of a node from right to left. We are trying to
52981+ access memory from left to right, and hence, scan in _descending_
52982+ order of item numbers.
52983+ */
52984+ if (!found) {
52985+ for (left = right, ih = righth; left >= 0; ++ih, --left) {
52986+ cmp_t comparison;
52987+
52988+ prefetchkey(&(ih + 1)->key);
52989+ comparison = keycmp(&ih->key, key);
52990+ if (comparison == GREATER_THAN)
52991+ continue;
52992+ if (comparison == EQUAL_TO) {
52993+ found = 1;
52994+ do {
52995+ --left;
52996+ ++ih;
52997+ } while (left >= 0 && keyeq(&ih->key, key));
52998+ ++left;
52999+ --ih;
53000+ } else {
53001+ assert("nikita-1256", comparison == LESS_THAN);
53002+ }
53003+ break;
53004+ }
53005+ if (unlikely(left < 0))
53006+ left = 0;
53007+ }
53008+
53009+ assert("nikita-3212", right >= left);
53010+ assert("nikita-3214",
53011+ equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
53012+
53013+ coord_set_item_pos(coord, left);
53014+ coord->unit_pos = 0;
53015+ coord->between = AT_UNIT;
53016+
53017+ /* key < leftmost key in the node, or the node is corrupted and keys
53018+ are not sorted */
53019+ bstop = node40_ih_at(node, (unsigned)left);
53020+ order = keycmp(&bstop->key, key);
53021+ if (unlikely(order == GREATER_THAN)) {
53022+ if (unlikely(left != 0)) {
53023+ /* screw up */
53024+ warning("nikita-587", "Key less than %i key in a node",
53025+ left);
53026+ reiser4_print_key("key", key);
53027+ reiser4_print_key("min", &bstop->key);
53028+ print_coord_content("coord", coord);
53029+ return RETERR(-EIO);
53030+ } else {
53031+ coord->between = BEFORE_UNIT;
53032+ return NS_NOT_FOUND;
53033+ }
53034+ }
53035+ /* left <= key, ok */
53036+ iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
53037+
53038+ if (unlikely(iplug == NULL)) {
53039+ warning("nikita-588", "Unknown plugin %i",
53040+ le16_to_cpu(get_unaligned(&bstop->plugin_id)));
53041+ reiser4_print_key("key", key);
53042+ print_coord_content("coord", coord);
53043+ return RETERR(-EIO);
53044+ }
53045+
53046+ coord_set_iplug(coord, iplug);
53047+
53048+ /* if exact key from item header was found by binary search, no
53049+ further checks are necessary. */
53050+ if (found) {
53051+ assert("nikita-1259", order == EQUAL_TO);
53052+ return NS_FOUND;
53053+ }
53054+ if (iplug->b.max_key_inside != NULL) {
53055+ reiser4_key max_item_key;
53056+
53057+ /* key > max_item_key --- outside of an item */
53058+ if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
53059+ coord->unit_pos = 0;
53060+ coord->between = AFTER_ITEM;
53061+ /* FIXME-VS: key we are looking for does not fit into
53062+ found item. Return NS_NOT_FOUND then. Without that
53063+ the following case does not work: there is extent of
53064+ file 10000, 10001. File 10000, 10002 has just been
53065+ created. When writing to position 0 in that file,
53066+ traverse_tree will stop here on the twig level, whereas
53067+ we want it to go down to the leaf level.
53068+ */
53069+ return NS_NOT_FOUND;
53070+ }
53071+ }
53072+
53073+ if (iplug->b.lookup != NULL) {
53074+ return iplug->b.lookup(key, bias, coord);
53075+ } else {
53076+ assert("nikita-1260", order == LESS_THAN);
53077+ coord->between = AFTER_UNIT;
53078+ return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
53079+ }
53080+}
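+/*
+ * A stand-alone sketch (plain sorted ints instead of item_header40 keys;
+ * SEQ_BREAK is an arbitrary stand-in for REISER4_SEQ_SEARCH_BREAK) of the
+ * hybrid lookup above: binary search narrows the range until it is small,
+ * then a cache-friendly sequential scan finds a matching index, if any.
+ */
+#include <stdio.h>
+
+#define SEQ_BREAK 3
+
+static int hybrid_find(const int *a, int n, int key)
+{
+	int left = 0, right = n - 1;
+
+	while (right - left >= SEQ_BREAK) {
+		int median = (left + right) / 2;
+
+		if (key < a[median])
+			right = median;
+		else
+			left = median;
+	}
+	for (; left <= right; left++)	/* short range: sequential scan */
+		if (a[left] == key)
+			return left;
+	return -1;
+}
+
+int main(void)
+{
+	int a[] = { 2, 3, 5, 8, 13, 21, 34, 55 };
+
+	printf("%d\n", hybrid_find(a, 8, 13));	/* prints 4 */
+	return 0;
+}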
53081+
53082+#undef NODE_ADDSTAT
53083+#undef NODE_INCSTAT
53084+
53085+/* plugin->u.node.estimate
53086+ look for description of this method in plugin/node/node.h */
53087+size_t estimate_node40(znode * node)
53088+{
53089+ size_t result;
53090+
53091+ assert("nikita-597", node != NULL);
53092+
53093+ /* avoid unsigned underflow when free space is scarce */
53094+ result = free_space_node40(node);
53095+ return (result > sizeof(item_header40)) ? result - sizeof(item_header40) : 0;
53096+}
53097+
53098+/* plugin->u.node.check
53099+ look for description of this method in plugin/node/node.h */
53100+int check_node40(const znode * node /* node to check */ ,
53101+ __u32 flags /* check flags */ ,
53102+ const char **error /* where to store error message */ )
53103+{
53104+ int nr_items;
53105+ int i;
53106+ reiser4_key prev;
53107+ unsigned old_offset;
53108+ tree_level level;
53109+ coord_t coord;
53110+ int result;
53111+
53112+ assert("nikita-580", node != NULL);
53113+ assert("nikita-581", error != NULL);
53114+ assert("nikita-2948", znode_is_loaded(node));
53115+
53116+ if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
53117+ return 0;
53118+
53119+ assert("nikita-582", zdata(node) != NULL);
53120+
53121+ nr_items = node40_num_of_items_internal(node);
53122+ if (nr_items < 0) {
53123+ *error = "Negative number of items";
53124+ return -1;
53125+ }
53126+
53127+ if (flags & REISER4_NODE_DKEYS)
53128+ prev = *znode_get_ld_key((znode *) node);
53129+ else
53130+ prev = *reiser4_min_key();
53131+
53132+ old_offset = 0;
53133+ coord_init_zero(&coord);
53134+ coord.node = (znode *) node;
53135+ coord.unit_pos = 0;
53136+ coord.between = AT_UNIT;
53137+ level = znode_get_level(node);
53138+ for (i = 0; i < nr_items; i++) {
53139+ item_header40 *ih;
53140+ reiser4_key unit_key;
53141+ unsigned j;
53142+
53143+ ih = node40_ih_at(node, (unsigned)i);
53144+ coord_set_item_pos(&coord, i);
53145+ if ((ih40_get_offset(ih) >=
53146+ znode_size(node) - nr_items * sizeof(item_header40)) ||
53147+ (ih40_get_offset(ih) < sizeof(node40_header))) {
53148+ *error = "Offset is out of bounds";
53149+ return -1;
53150+ }
53151+ if (ih40_get_offset(ih) <= old_offset) {
53152+ *error = "Offsets are in wrong order";
53153+ return -1;
53154+ }
53155+ if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
53156+ *error = "Wrong offset of first item";
53157+ return -1;
53158+ }
53159+ old_offset = ih40_get_offset(ih);
53160+
53161+ if (keygt(&prev, &ih->key)) {
53162+ *error = "Keys are in wrong order";
53163+ return -1;
53164+ }
53165+ if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
53166+ *error = "Wrong key of first unit";
53167+ return -1;
53168+ }
53169+ prev = ih->key;
53170+ for (j = 0; j < coord_num_units(&coord); ++j) {
53171+ coord.unit_pos = j;
53172+ unit_key_by_coord(&coord, &unit_key);
53173+ if (keygt(&prev, &unit_key)) {
53174+ *error = "Unit keys are in wrong order";
53175+ return -1;
53176+ }
53177+ prev = unit_key;
53178+ }
53179+ coord.unit_pos = 0;
53180+ if (level != TWIG_LEVEL && item_is_extent(&coord)) {
53181+ *error = "extent on the wrong level";
53182+ return -1;
53183+ }
53184+ if (level == LEAF_LEVEL && item_is_internal(&coord)) {
53185+ *error = "internal item on the wrong level";
53186+ return -1;
53187+ }
53188+ if (level != LEAF_LEVEL &&
53189+ !item_is_internal(&coord) && !item_is_extent(&coord)) {
53190+ *error = "wrong item on the internal level";
53191+ return -1;
53192+ }
53193+ if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
53194+ *error = "non-internal item on the internal level";
53195+ return -1;
53196+ }
53197+#if REISER4_DEBUG
53198+ if (item_plugin_by_coord(&coord)->b.check
53199+ && item_plugin_by_coord(&coord)->b.check(&coord, error))
53200+ return -1;
53201+#endif
53202+ if (i) {
53203+ coord_t prev_coord;
53204+ /* two neighboring items can not be mergeable */
53205+ coord_dup(&prev_coord, &coord);
53206+ coord_prev_item(&prev_coord);
53207+ if (are_items_mergeable(&prev_coord, &coord)) {
53208+ *error = "mergeable items in one node";
53209+ return -1;
53210+ }
53211+
53212+ }
53213+ }
53214+
53215+ if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
53216+ coord_t coord;
53217+ item_plugin *iplug;
53218+
53219+ coord_init_last_unit(&coord, node);
53220+ iplug = item_plugin_by_coord(&coord);
53221+ if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
53222+ iplug->s.file.append_key != NULL) {
53223+ reiser4_key mkey;
53224+
53225+ iplug->s.file.append_key(&coord, &mkey);
53226+ set_key_offset(&mkey, get_key_offset(&mkey) - 1);
53227+ read_lock_dk(current_tree);
53228+ result = keygt(&mkey, znode_get_rd_key((znode *) node));
53229+ read_unlock_dk(current_tree);
53230+ if (result) {
53231+ *error = "key of rightmost item is too large";
53232+ return -1;
53233+ }
53234+ }
53235+ }
53236+ if (flags & REISER4_NODE_DKEYS) {
53237+ read_lock_tree(current_tree);
53238+ read_lock_dk(current_tree);
53239+
53240+ flags |= REISER4_NODE_TREE_STABLE;
53241+
53242+ if (keygt(&prev, znode_get_rd_key((znode *) node))) {
53243+ if (flags & REISER4_NODE_TREE_STABLE) {
53244+ *error = "Last key is greater than rdkey";
53245+ read_unlock_dk(current_tree);
53246+ read_unlock_tree(current_tree);
53247+ return -1;
53248+ }
53249+ }
53250+ if (keygt
53251+ (znode_get_ld_key((znode *) node),
53252+ znode_get_rd_key((znode *) node))) {
53253+ *error = "ldkey is greater than rdkey";
53254+ read_unlock_dk(current_tree);
53255+ read_unlock_tree(current_tree);
53256+ return -1;
53257+ }
53258+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
53259+ (node->left != NULL) &&
53260+ !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
53261+ ergo(flags & REISER4_NODE_TREE_STABLE,
53262+ !keyeq(znode_get_rd_key(node->left),
53263+ znode_get_ld_key((znode *) node)))
53264+ && ergo(!(flags & REISER4_NODE_TREE_STABLE),
53265+ keygt(znode_get_rd_key(node->left),
53266+ znode_get_ld_key((znode *) node)))) {
53267+ *error = "left rdkey or ldkey is wrong";
53268+ read_unlock_dk(current_tree);
53269+ read_unlock_tree(current_tree);
53270+ return -1;
53271+ }
53272+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
53273+ (node->right != NULL) &&
53274+ !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
53275+ ergo(flags & REISER4_NODE_TREE_STABLE,
53276+ !keyeq(znode_get_rd_key((znode *) node),
53277+ znode_get_ld_key(node->right)))
53278+ && ergo(!(flags & REISER4_NODE_TREE_STABLE),
53279+ keygt(znode_get_rd_key((znode *) node),
53280+ znode_get_ld_key(node->right)))) {
53281+ *error = "rdkey or right ldkey is wrong";
53282+ read_unlock_dk(current_tree);
53283+ read_unlock_tree(current_tree);
53284+ return -1;
53285+ }
53286+
53287+ read_unlock_dk(current_tree);
53288+ read_unlock_tree(current_tree);
53289+ }
53290+
53291+ return 0;
53292+}
53293+
53294+/* plugin->u.node.parse
53295+ look for description of this method in plugin/node/node.h */
53296+int parse_node40(znode * node /* node to parse */ )
53297+{
53298+ node40_header *header;
53299+ int result;
53300+ d8 level;
53301+
53302+ header = node40_node_header((znode *) node);
53303+ result = -EIO;
53304+ level = nh40_get_level(header);
53305+ if (unlikely(((__u8) znode_get_level(node)) != level))
53306+ warning("nikita-494", "Wrong level found in node: %i != %i",
53307+ znode_get_level(node), level);
53308+ else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
53309+ warning("nikita-495",
53310+ "Wrong magic in tree node: want %x, got %x",
53311+ REISER4_NODE_MAGIC, nh40_get_magic(header));
53312+ else {
53313+ node->nr_items = node40_num_of_items_internal(node);
53314+ result = 0;
53315+ }
53316+ return RETERR(result);
53317+}
53318+
53319+/* plugin->u.node.init
53320+ look for description of this method in plugin/node/node.h */
53321+int init_node40(znode * node /* node to initialise */ )
53322+{
53323+ node40_header *header;
53324+
53325+ assert("nikita-570", node != NULL);
53326+ assert("nikita-572", zdata(node) != NULL);
53327+
53328+ header = node40_node_header(node);
53329+ memset(header, 0, sizeof(node40_header));
53330+ nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));
53331+ nh40_set_free_space_start(header, sizeof(node40_header));
53332+ /* sane hypothesis: 0 in CPU format is 0 in disk format */
53333+ /* items: 0 */
53334+ save_plugin_id(node_plugin_to_plugin(node->nplug),
53335+ &header->common_header.plugin_id);
53336+ nh40_set_level(header, znode_get_level(node));
53337+ nh40_set_magic(header, REISER4_NODE_MAGIC);
53338+ node->nr_items = 0;
53339+ nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
53340+
53341+ /* flags: 0 */
53342+ return 0;
53343+}
53344+
53345+#ifdef GUESS_EXISTS
53346+int guess_node40(const znode * node /* node to guess plugin of */ )
53347+{
53348+ node40_header *nethack;
53349+
53350+ assert("nikita-1058", node != NULL);
53351+ nethack = node40_node_header(node);
53352+ return
53353+ (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
53354+ (plugin_by_disk_id(znode_get_tree(node),
53355+ REISER4_NODE_PLUGIN_TYPE,
53356+ &nethack->common_header.plugin_id)->h.id ==
53357+ NODE40_ID);
53358+}
53359+#endif
53360+
53361+/* plugin->u.node.change_item_size
53362+ look for description of this method in plugin/node/node.h */
53363+void change_item_size_node40(coord_t * coord, int by)
53364+{
53365+ node40_header *nh;
53366+ item_header40 *ih;
53367+ char *item_data;
53368+ int item_length;
53369+ unsigned i;
53370+
53371+ /* make sure that @item is coord of existing item */
53372+ assert("vs-210", coord_is_existing_item(coord));
53373+
53374+ nh = node40_node_header(coord->node);
53375+
53376+ item_data = item_by_coord_node40(coord);
53377+ item_length = length_by_coord_node40(coord);
53378+
53379+ /* move item bodies */
53380+ ih = node40_ih_at_coord(coord);
53381+ memmove(item_data + item_length + by, item_data + item_length,
53382+ nh40_get_free_space_start(node40_node_header(coord->node)) -
53383+ (ih40_get_offset(ih) + item_length));
53384+
53385+ /* update offsets of moved items */
53386+ for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
53387+ ih = node40_ih_at(coord->node, i);
53388+ ih40_set_offset(ih, ih40_get_offset(ih) + by);
53389+ }
53390+
53391+ /* update node header */
53392+ nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
53393+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
53394+}
53395+
53396+static int should_notify_parent(const znode * node)
53397+{
53398+ /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
53399+ return !disk_addr_eq(znode_get_block(node),
53400+ &znode_get_tree(node)->root_block);
53401+}
53402+
53403+/* plugin->u.node.create_item
53404+ look for description of this method in plugin/node/node.h */
53405+int
53406+create_item_node40(coord_t *target, const reiser4_key *key,
53407+ reiser4_item_data *data, carry_plugin_info *info)
53408+{
53409+ node40_header *nh;
53410+ item_header40 *ih;
53411+ unsigned offset;
53412+ unsigned i;
53413+
53414+ nh = node40_node_header(target->node);
53415+
53416+ assert("vs-212", coord_is_between_items(target));
53417+ /* node must have enough free space */
53418+ assert("vs-254",
53419+ free_space_node40(target->node) >=
53420+ data->length + sizeof(item_header40));
53421+ assert("vs-1410", data->length >= 0);
53422+
53423+ if (coord_set_to_right(target))
53424+ /* there are no items to the right of @target, so the new
53425+ item will be inserted after the last one */
53426+ coord_set_item_pos(target, nh40_get_num_items(nh));
53427+
53428+ if (target->item_pos < nh40_get_num_items(nh)) {
53429+ /* there are items to be moved to prepare space for new
53430+ item */
53431+ ih = node40_ih_at_coord(target);
53432+ /* new item will start at this offset */
53433+ offset = ih40_get_offset(ih);
53434+
53435+ memmove(zdata(target->node) + offset + data->length,
53436+ zdata(target->node) + offset,
53437+ nh40_get_free_space_start(nh) - offset);
53438+ /* update headers of moved items */
53439+ for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
53440+ ih = node40_ih_at(target->node, i);
53441+ ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
53442+ }
53443+
53444+ /* @ih is set to item header of the last item, move item headers */
53445+ memmove(ih - 1, ih,
53446+ sizeof(item_header40) * (nh40_get_num_items(nh) -
53447+ target->item_pos));
53448+ } else {
53449+ /* new item will start at this offset */
53450+ offset = nh40_get_free_space_start(nh);
53451+ }
53452+
53453+ /* make item header for the new item */
53454+ ih = node40_ih_at_coord(target);
53455+ memcpy(&ih->key, key, sizeof(reiser4_key));
53456+ ih40_set_offset(ih, offset);
53457+ save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
53458+
53459+ /* update node header */
53460+ nh40_set_free_space(nh,
53461+ nh40_get_free_space(nh) - data->length -
53462+ sizeof(item_header40));
53463+ nh40_set_free_space_start(nh,
53464+ nh40_get_free_space_start(nh) + data->length);
53465+ node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
53466+
53467+ /* FIXME: check how create_item works when between is set to BEFORE_UNIT */
53468+ target->unit_pos = 0;
53469+ target->between = AT_UNIT;
53470+ coord_clear_iplug(target);
53471+
53472+ /* initialize item */
53473+ if (data->iplug->b.init != NULL) {
53474+ data->iplug->b.init(target, NULL, data);
53475+ }
53476+ /* copy item body */
53477+ if (data->iplug->b.paste != NULL) {
53478+ data->iplug->b.paste(target, data, info);
53479+ } else if (data->data != NULL) {
53480+ if (data->user) {
53481+ /* AUDIT: should we really not check that the pointer
53482+ from userspace is valid and the data bytes are
53483+ available? How will we return -EFAULT of some kind
53484+ without such a check? */
53485+ assert("nikita-3038", reiser4_schedulable());
53486+ /* copy data from user space */
53487+ __copy_from_user(zdata(target->node) + offset,
53488+ (const char __user *)data->data,
53489+ (unsigned)data->length);
53490+ } else
53491+ /* copy from kernel space */
53492+ memcpy(zdata(target->node) + offset, data->data,
53493+ (unsigned)data->length);
53494+ }
53495+
53496+ if (target->item_pos == 0) {
53497+ /* left delimiting key has to be updated */
53498+ prepare_for_update(NULL, target->node, info);
53499+ }
53500+
53501+ if (item_plugin_by_coord(target)->b.create_hook != NULL) {
53502+ item_plugin_by_coord(target)->b.create_hook(target, data->arg);
53503+ }
53504+
53505+ return 0;
53506+}
53507+
53508+/* plugin->u.node.update_item_key
53509+ look for description of this method in plugin/node/node.h */
53510+void
53511+update_item_key_node40(coord_t * target, const reiser4_key * key,
53512+ carry_plugin_info * info)
53513+{
53514+ item_header40 *ih;
53515+
53516+ ih = node40_ih_at_coord(target);
53517+ memcpy(&ih->key, key, sizeof(reiser4_key));
53518+
53519+ if (target->item_pos == 0) {
53520+ prepare_for_update(NULL, target->node, info);
53521+ }
53522+}
53523+
53524+/* these bits encode the cut mode */
53525+#define CMODE_TAIL 1
53526+#define CMODE_WHOLE 2
53527+#define CMODE_HEAD 4
53528+
53529+struct cut40_info {
53530+ int mode;
53531+ pos_in_node_t tail_removed; /* position of item which gets tail removed */
53532+ pos_in_node_t first_removed; /* position of the leftmost item among items removed completely */
53533+ pos_in_node_t removed_count; /* number of items removed completely */
53534+ pos_in_node_t head_removed; /* position of item which gets head removed */
53535+
53536+ pos_in_node_t freed_space_start;
53537+ pos_in_node_t freed_space_end;
53538+ pos_in_node_t first_moved;
53539+ pos_in_node_t head_removed_location;
53540+};
53541+
53542+static void init_cinfo(struct cut40_info *cinfo)
53543+{
53544+ cinfo->mode = 0;
53545+ cinfo->tail_removed = MAX_POS_IN_NODE;
53546+ cinfo->first_removed = MAX_POS_IN_NODE;
53547+ cinfo->removed_count = MAX_POS_IN_NODE;
53548+ cinfo->head_removed = MAX_POS_IN_NODE;
53549+ cinfo->freed_space_start = MAX_POS_IN_NODE;
53550+ cinfo->freed_space_end = MAX_POS_IN_NODE;
53551+ cinfo->first_moved = MAX_POS_IN_NODE;
53552+ cinfo->head_removed_location = MAX_POS_IN_NODE;
53553+}
53554+
53555+/* complete cut_node40/kill_node40 by closing the gap created by the removal */
53556+static void compact(znode * node, struct cut40_info *cinfo)
53557+{
53558+ node40_header *nh;
53559+ item_header40 *ih;
53560+ pos_in_node_t freed;
53561+ pos_in_node_t pos, nr_items;
53562+
53563+ assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
53564+ cinfo->freed_space_end != MAX_POS_IN_NODE &&
53565+ cinfo->first_moved != MAX_POS_IN_NODE));
53566+ assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
53567+
53568+ nh = node40_node_header(node);
53569+ nr_items = nh40_get_num_items(nh);
53570+
53571+ /* close the gap left by the removal */
53572+ memmove(zdata(node) + cinfo->freed_space_start,
53573+ zdata(node) + cinfo->freed_space_end,
53574+ nh40_get_free_space_start(nh) - cinfo->freed_space_end);
53575+
53576+ /* update item headers of moved items - change their locations */
53577+ pos = cinfo->first_moved;
53578+ ih = node40_ih_at(node, pos);
53579+ if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
53580+ assert("vs-1580", pos == cinfo->head_removed);
53581+ ih40_set_offset(ih, cinfo->head_removed_location);
53582+ pos++;
53583+ ih--;
53584+ }
53585+
53586+ freed = cinfo->freed_space_end - cinfo->freed_space_start;
53587+ for (; pos < nr_items; pos++, ih--) {
53588+ assert("vs-1581", ih == node40_ih_at(node, pos));
53589+ ih40_set_offset(ih, ih40_get_offset(ih) - freed);
53590+ }
53591+
53592+ /* free space start moved to the left */
53593+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
53594+
53595+ if (cinfo->removed_count != MAX_POS_IN_NODE) {
53596+ /* number of items changed. Remove item headers of those items */
53597+ ih = node40_ih_at(node, nr_items - 1);
53598+ memmove(ih + cinfo->removed_count, ih,
53599+ sizeof(item_header40) * (nr_items -
53600+ cinfo->removed_count -
53601+ cinfo->first_removed));
53602+ freed += sizeof(item_header40) * cinfo->removed_count;
53603+ node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
53604+ }
53605+
53606+ /* total amount of free space increased */
53607+ nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
53608+}
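+/*
+ * A stand-alone sketch (illustration only) of the compaction above: close
+ * the freed byte range with a single memmove and subtract its size from
+ * the stored offset of every record that lived to the right of the hole.
+ */
+#include <stdio.h>
+#include <string.h>
+
+static void close_gap(char *data, int *offsets, int nr,
+		      int gap_start, int gap_end, int *data_end)
+{
+	int freed = gap_end - gap_start;
+	int i;
+
+	memmove(data + gap_start, data + gap_end, *data_end - gap_end);
+	*data_end -= freed;
+	for (i = 0; i < nr; i++)
+		if (offsets[i] >= gap_end)
+			offsets[i] -= freed;
+}
+
+int main(void)
+{
+	char data[16] = "aaabbbccc";
+	int offsets[3] = { 0, 3, 6 }, end = 9;
+
+	close_gap(data, offsets, 3, 3, 6, &end);	/* drop "bbb" */
+	printf("%.*s %d %d\n", end, data, offsets[0], offsets[2]);
+	/* prints "aaaccc 0 3"; compact() drops dead headers separately */
+	return 0;
+}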
53609+
53610+int shrink_item_node40(coord_t * coord, int delta)
53611+{
53612+ node40_header *nh;
53613+ item_header40 *ih;
53614+ pos_in_node_t pos;
53615+ pos_in_node_t nr_items;
53616+ char *end;
53617+ znode *node;
53618+ int off;
53619+
53620+ assert("nikita-3487", coord != NULL);
53621+ assert("nikita-3488", delta >= 0);
53622+
53623+ node = coord->node;
53624+ nh = node40_node_header(node);
53625+ nr_items = nh40_get_num_items(nh);
53626+
53627+ ih = node40_ih_at_coord(coord);
53628+ assert("nikita-3489", delta <= length_by_coord_node40(coord));
53629+ off = ih40_get_offset(ih) + length_by_coord_node40(coord);
53630+ end = zdata(node) + off;
53631+
53632+ /* close the gap left by the removal */
53633+ memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
53634+
53635+ /* update item headers of moved items - change their locations */
53636+ pos = coord->item_pos + 1;
53637+ ih = node40_ih_at(node, pos);
53638+ for (; pos < nr_items; pos++, ih--) {
53639+ assert("nikita-3490", ih == node40_ih_at(node, pos));
53640+ ih40_set_offset(ih, ih40_get_offset(ih) - delta);
53641+ }
53642+
53643+ /* free space start moved to left */
53644+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
53645+ /* total amount of free space increased */
53646+ nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
53647+ /*
53648+ * This method does _not_ change the number of items. Hence, it cannot
53649+ * make the node empty. It also doesn't remove items at all, which
53650+ * means that no keys have to be updated either.
53651+ */
53652+ return 0;
53653+}
53654+
53655+/* this is used by cut_node40 and kill_node40. It analyzes the input parameters and calculates the cut mode. There are two
53656+   types of cut. The first is when a unit is removed from the middle of an item; in this case this function returns 1. All the
53657+   rest fits into the second case: 0 or 1 items getting their tail cut, 0 or more items removed completely, and 0 or 1 items
53658+   getting their head cut. The function returns 0 in this case */
53659+static int
53660+parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
53661+{
53662+ reiser4_key left_key, right_key;
53663+ reiser4_key min_from_key, max_to_key;
53664+ const reiser4_key *from_key, *to_key;
53665+
53666+ init_cinfo(cinfo);
53667+
53668+ /* calculate minimal key stored in first item of items to be cut (params->from) */
53669+ item_key_by_coord(params->from, &min_from_key);
53670+ /* and max key stored in last item of items to be cut (params->to) */
53671+ max_item_key_by_coord(params->to, &max_to_key);
53672+
53673+ /* if the cut key range is not defined by the input parameters, derive it from the cut coord range */
53674+ if (params->from_key == NULL) {
53675+ assert("vs-1513", params->to_key == NULL);
53676+ unit_key_by_coord(params->from, &left_key);
53677+ from_key = &left_key;
53678+ max_unit_key_by_coord(params->to, &right_key);
53679+ to_key = &right_key;
53680+ } else {
53681+ from_key = params->from_key;
53682+ to_key = params->to_key;
53683+ }
53684+
53685+ if (params->from->item_pos == params->to->item_pos) {
53686+ if (keylt(&min_from_key, from_key)
53687+ && keylt(to_key, &max_to_key))
53688+ return 1;
53689+
53690+ if (keygt(from_key, &min_from_key)) {
53691+ /* tail of item is to be cut */
53692+ cinfo->tail_removed = params->from->item_pos;
53693+ cinfo->mode |= CMODE_TAIL;
53694+ } else if (keylt(to_key, &max_to_key)) {
53695+ /* head of item is to be cut */
53696+ cinfo->head_removed = params->from->item_pos;
53697+ cinfo->mode |= CMODE_HEAD;
53698+ } else {
53699+ /* item is removed completely */
53700+ cinfo->first_removed = params->from->item_pos;
53701+ cinfo->removed_count = 1;
53702+ cinfo->mode |= CMODE_WHOLE;
53703+ }
53704+ } else {
53705+ cinfo->first_removed = params->from->item_pos + 1;
53706+ cinfo->removed_count =
53707+ params->to->item_pos - params->from->item_pos - 1;
53708+
53709+ if (keygt(from_key, &min_from_key)) {
53710+ /* first item is not cut completely */
53711+ cinfo->tail_removed = params->from->item_pos;
53712+ cinfo->mode |= CMODE_TAIL;
53713+ } else {
53714+ cinfo->first_removed--;
53715+ cinfo->removed_count++;
53716+ }
53717+ if (keylt(to_key, &max_to_key)) {
53718+ /* last item is not cut completely */
53719+ cinfo->head_removed = params->to->item_pos;
53720+ cinfo->mode |= CMODE_HEAD;
53721+ } else {
53722+ cinfo->removed_count++;
53723+ }
53724+ if (cinfo->removed_count)
53725+ cinfo->mode |= CMODE_WHOLE;
53726+ }
53727+
53728+ return 0;
53729+}
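+
+/* The mode bits computed by parse_cut() can be summarized by a small
+   stand-alone classifier; a sketch (hypothetical names), assuming the caller
+   already knows whether the key range cuts strictly into the first and last
+   items, which corresponds to the keygt()/keylt() tests above: */
+#if 0
+enum { TOY_TAIL = 1, TOY_WHOLE = 2, TOY_HEAD = 4 };
+
+static int toy_cut_mode(int from_item, int to_item,
+                        int cuts_into_first, int cuts_into_last)
+{
+        int mode = 0;
+
+        if (from_item == to_item && cuts_into_first && cuts_into_last)
+                return -1;      /* middle of one item: the "return 1" case */
+        if (cuts_into_first)
+                mode |= TOY_TAIL;       /* first item loses its tail */
+        if (cuts_into_last)
+                mode |= TOY_HEAD;       /* last item loses its head */
+        /* items not covered by a partial cut are removed completely */
+        if (to_item - from_item + 1 - !!cuts_into_first - !!cuts_into_last > 0)
+                mode |= TOY_WHOLE;
+        return mode;
+}
+#endif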
53730+
53731+static void
53732+call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
53733+ carry_kill_data * kdata)
53734+{
53735+ coord_t coord;
53736+ item_plugin *iplug;
53737+ pos_in_node_t pos;
53738+
53739+ coord.node = node;
53740+ coord.unit_pos = 0;
53741+ coord.between = AT_UNIT;
53742+ for (pos = 0; pos < count; pos++) {
53743+ coord_set_item_pos(&coord, from + pos);
53744+ coord.unit_pos = 0;
53745+ coord.between = AT_UNIT;
53746+ iplug = item_plugin_by_coord(&coord);
53747+ if (iplug->b.kill_hook) {
53748+ iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
53749+ kdata);
53750+ }
53751+ }
53752+}
53753+
53754+/* this is used to kill item partially */
53755+static pos_in_node_t
53756+kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
53757+ reiser4_key * smallest_removed, reiser4_key * new_first_key)
53758+{
53759+ struct carry_kill_data *kdata;
53760+ item_plugin *iplug;
53761+
53762+ kdata = data;
53763+ iplug = item_plugin_by_coord(coord);
53764+
53765+ assert("vs-1524", iplug->b.kill_units);
53766+ return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
53767+ new_first_key);
53768+}
53769+
53770+/* call item plugin to kill the tail of an item */
53771+static pos_in_node_t
53772+kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
53773+{
53774+ struct carry_kill_data *kdata;
53775+ pos_in_node_t to;
53776+
53777+ kdata = data;
53778+ to = coord_last_unit_pos(coord);
53779+ return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
53780+ NULL);
53781+}
53782+
53783+/* call item plugin to cut head of item */
53784+static pos_in_node_t
53785+kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
53786+ reiser4_key * new_first_key)
53787+{
53788+ return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
53789+ new_first_key);
53790+}
53791+
53792+/* this is used to cut item partially */
53793+static pos_in_node_t
53794+cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
53795+ reiser4_key * smallest_removed, reiser4_key * new_first_key)
53796+{
53797+ carry_cut_data *cdata;
53798+ item_plugin *iplug;
53799+
53800+ cdata = data;
53801+ iplug = item_plugin_by_coord(coord);
53802+ assert("vs-302", iplug->b.cut_units);
53803+ return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
53804+ new_first_key);
53805+}
53806+
53807+/* call item plugin to cut the tail of an item */
53808+static pos_in_node_t
53809+cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
53810+{
53811+ carry_cut_data *cdata;
53812+ pos_in_node_t to;
53813+
53814+ cdata = data;
53815+ to = coord_last_unit_pos(cdata->params.from);
53816+ return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
53817+}
53818+
53819+/* call item plugin to cut head of item */
53820+static pos_in_node_t
53821+cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
53822+ reiser4_key * new_first_key)
53823+{
53824+ return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
53825+ new_first_key);
53826+}
53827+
53828+/* this returns 1 if the key of the first item changed, 0 if it did not */
53829+static int
53830+prepare_for_compact(struct cut40_info *cinfo,
53831+ const struct cut_kill_params *params, int is_cut,
53832+ void *data, carry_plugin_info * info)
53833+{
53834+ znode *node;
53835+ item_header40 *ih;
53836+ pos_in_node_t freed;
53837+ pos_in_node_t item_pos;
53838+ coord_t coord;
53839+ reiser4_key new_first_key;
53840+ pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
53841+ void *, reiser4_key *, reiser4_key *);
53842+ pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
53843+ pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
53844+ reiser4_key *);
53845+ int retval;
53846+
53847+ retval = 0;
53848+
53849+ node = params->from->node;
53850+
53851+ assert("vs-184", node == params->to->node);
53852+ assert("vs-312", !node_is_empty(node));
53853+ assert("vs-297",
53854+ coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
53855+
53856+ if (is_cut) {
53857+ kill_units_f = cut_units;
53858+ kill_tail_f = cut_tail;
53859+ kill_head_f = cut_head;
53860+ } else {
53861+ kill_units_f = kill_units;
53862+ kill_tail_f = kill_tail;
53863+ kill_head_f = kill_head;
53864+ }
53865+
53866+ if (parse_cut(cinfo, params) == 1) {
53867+ /* cut from the middle of item */
53868+ freed =
53869+ kill_units_f(params->from, params->from->unit_pos,
53870+ params->to->unit_pos, data,
53871+ params->smallest_removed, NULL);
53872+
53873+ item_pos = params->from->item_pos;
53874+ ih = node40_ih_at(node, item_pos);
53875+ cinfo->freed_space_start =
53876+ ih40_get_offset(ih) + node40_item_length(node,
53877+ item_pos) - freed;
53878+ cinfo->freed_space_end = cinfo->freed_space_start + freed;
53879+ cinfo->first_moved = item_pos + 1;
53880+ } else {
53881+ assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
53882+ cinfo->first_removed != MAX_POS_IN_NODE ||
53883+ cinfo->head_removed != MAX_POS_IN_NODE));
53884+
53885+ switch (cinfo->mode) {
53886+ case CMODE_TAIL:
53887+ /* one item gets cut partially from its end */
53888+ assert("vs-1562",
53889+ cinfo->tail_removed == params->from->item_pos);
53890+
53891+ freed =
53892+ kill_tail_f(params->from, data,
53893+ params->smallest_removed);
53894+
53895+ item_pos = cinfo->tail_removed;
53896+ ih = node40_ih_at(node, item_pos);
53897+ cinfo->freed_space_start =
53898+ ih40_get_offset(ih) + node40_item_length(node,
53899+ item_pos) -
53900+ freed;
53901+ cinfo->freed_space_end =
53902+ cinfo->freed_space_start + freed;
53903+ cinfo->first_moved = cinfo->tail_removed + 1;
53904+ break;
53905+
53906+ case CMODE_WHOLE:
53907+ /* one or more items get removed completely */
53908+ assert("vs-1563",
53909+ cinfo->first_removed == params->from->item_pos);
53910+ assert("vs-1564", cinfo->removed_count > 0
53911+ && cinfo->removed_count != MAX_POS_IN_NODE);
53912+
53913+ /* call kill hook for all items removed completely */
53914+ if (is_cut == 0)
53915+ call_kill_hooks(node, cinfo->first_removed,
53916+ cinfo->removed_count, data);
53917+
53918+ item_pos = cinfo->first_removed;
53919+ ih = node40_ih_at(node, item_pos);
53920+
53921+ if (params->smallest_removed)
53922+ memcpy(params->smallest_removed, &ih->key,
53923+ sizeof(reiser4_key));
53924+
53925+ cinfo->freed_space_start = ih40_get_offset(ih);
53926+
53927+ item_pos += (cinfo->removed_count - 1);
53928+ ih -= (cinfo->removed_count - 1);
53929+ cinfo->freed_space_end =
53930+ ih40_get_offset(ih) + node40_item_length(node,
53931+ item_pos);
53932+ cinfo->first_moved = item_pos + 1;
53933+ if (cinfo->first_removed == 0)
53934+ /* key of first item of the node changes */
53935+ retval = 1;
53936+ break;
53937+
53938+ case CMODE_HEAD:
53939+ /* one item gets cut partially from its head */
53940+ assert("vs-1565",
53941+ cinfo->head_removed == params->from->item_pos);
53942+
53943+ freed =
53944+ kill_head_f(params->to, data,
53945+ params->smallest_removed,
53946+ &new_first_key);
53947+
53948+ item_pos = cinfo->head_removed;
53949+ ih = node40_ih_at(node, item_pos);
53950+ cinfo->freed_space_start = ih40_get_offset(ih);
53951+ cinfo->freed_space_end = ih40_get_offset(ih) + freed;
53952+ cinfo->first_moved = cinfo->head_removed + 1;
53953+
53954+ /* item head is removed, therefore, item key changed */
53955+ coord.node = node;
53956+ coord_set_item_pos(&coord, item_pos);
53957+ coord.unit_pos = 0;
53958+ coord.between = AT_UNIT;
53959+ update_item_key_node40(&coord, &new_first_key, NULL);
53960+ if (item_pos == 0)
53961+ /* key of first item of the node changes */
53962+ retval = 1;
53963+ break;
53964+
53965+ case CMODE_TAIL | CMODE_WHOLE:
53966+ /* one item gets cut from its end and one or more items get removed completely */
53967+ assert("vs-1566",
53968+ cinfo->tail_removed == params->from->item_pos);
53969+ assert("vs-1567",
53970+ cinfo->first_removed == cinfo->tail_removed + 1);
53971+ assert("vs-1564", cinfo->removed_count > 0
53972+ && cinfo->removed_count != MAX_POS_IN_NODE);
53973+
53974+ freed =
53975+ kill_tail_f(params->from, data,
53976+ params->smallest_removed);
53977+
53978+ item_pos = cinfo->tail_removed;
53979+ ih = node40_ih_at(node, item_pos);
53980+ cinfo->freed_space_start =
53981+ ih40_get_offset(ih) + node40_item_length(node,
53982+ item_pos) -
53983+ freed;
53984+
53985+ /* call kill hook for all items removed completely */
53986+ if (is_cut == 0)
53987+ call_kill_hooks(node, cinfo->first_removed,
53988+ cinfo->removed_count, data);
53989+
53990+ item_pos += cinfo->removed_count;
53991+ ih -= cinfo->removed_count;
53992+ cinfo->freed_space_end =
53993+ ih40_get_offset(ih) + node40_item_length(node,
53994+ item_pos);
53995+ cinfo->first_moved = item_pos + 1;
53996+ break;
53997+
53998+ case CMODE_WHOLE | CMODE_HEAD:
53999+ /* one or more items get removed completely and one item gets cut partially from its head */
54000+ assert("vs-1568",
54001+ cinfo->first_removed == params->from->item_pos);
54002+ assert("vs-1564", cinfo->removed_count > 0
54003+ && cinfo->removed_count != MAX_POS_IN_NODE);
54004+ assert("vs-1569",
54005+ cinfo->head_removed ==
54006+ cinfo->first_removed + cinfo->removed_count);
54007+
54008+ /* call kill hook for all items removed completely */
54009+ if (is_cut == 0)
54010+ call_kill_hooks(node, cinfo->first_removed,
54011+ cinfo->removed_count, data);
54012+
54013+ item_pos = cinfo->first_removed;
54014+ ih = node40_ih_at(node, item_pos);
54015+
54016+ if (params->smallest_removed)
54017+ memcpy(params->smallest_removed, &ih->key,
54018+ sizeof(reiser4_key));
54019+
54020+ freed =
54021+ kill_head_f(params->to, data, NULL, &new_first_key);
54022+
54023+ cinfo->freed_space_start = ih40_get_offset(ih);
54024+
54025+ ih = node40_ih_at(node, cinfo->head_removed);
54026+ /* this is the most complex case. The item whose head was removed and the items which
54027+ are to be moved intact change their locations differently. */
54028+ cinfo->freed_space_end = ih40_get_offset(ih) + freed;
54029+ cinfo->first_moved = cinfo->head_removed;
54030+ cinfo->head_removed_location = cinfo->freed_space_start;
54031+
54032+ /* item head is removed, therefore, item key changed */
54033+ coord.node = node;
54034+ coord_set_item_pos(&coord, cinfo->head_removed);
54035+ coord.unit_pos = 0;
54036+ coord.between = AT_UNIT;
54037+ update_item_key_node40(&coord, &new_first_key, NULL);
54038+
54039+ assert("vs-1579", cinfo->first_removed == 0);
54040+ /* key of first item of the node changes */
54041+ retval = 1;
54042+ break;
54043+
54044+ case CMODE_TAIL | CMODE_HEAD:
54045+ /* one item gets cut from its end and its neighbor gets cut from its head */
54046+ impossible("vs-1576", "this can not happen currently");
54047+ break;
54048+
54049+ case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
54050+ impossible("vs-1577", "this can not happen currently");
54051+ break;
54052+ default:
54053+ impossible("vs-1578", "unexpected cut mode");
54054+ break;
54055+ }
54056+ }
54057+ return retval;
54058+}
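+
+/* A worked example of the CMODE_TAIL arithmetic above (illustrative numbers
+   only): if the item starts at offset 100, is 60 bytes long, and kill_tail_f()
+   reports freed == 24, then freed_space_start = 100 + 60 - 24 = 136 and
+   freed_space_end = 136 + 24 = 160; compact() will then pull everything from
+   offset 160 back to 136, and items from first_moved on keep their lengths
+   but lose 24 from their offsets. */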
54059+
54060+/* plugin->u.node.kill
54061+ return value is number of items removed completely */
54062+int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
54063+{
54064+ znode *node;
54065+ struct cut40_info cinfo;
54066+ int first_key_changed;
54067+
54068+ node = kdata->params.from->node;
54069+
54070+ first_key_changed =
54071+ prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
54072+ info);
54073+ compact(node, &cinfo);
54074+
54075+ if (info) {
54076+ /* it is not called by node40_shift, so we have to take care
54077+ of changes on upper levels */
54078+ if (node_is_empty(node)
54079+ && !(kdata->flags & DELETE_RETAIN_EMPTY))
54080+ /* the entire contents of the node were deleted */
54081+ prepare_removal_node40(node, info);
54082+ else if (first_key_changed) {
54083+ prepare_for_update(NULL, node, info);
54084+ }
54085+ }
54086+
54087+ coord_clear_iplug(kdata->params.from);
54088+ coord_clear_iplug(kdata->params.to);
54089+
54090+ znode_make_dirty(node);
54091+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
54092+}
54093+
54094+/* plugin->u.node.cut
54095+ return value is number of items removed completely */
54096+int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
54097+{
54098+ znode *node;
54099+ struct cut40_info cinfo;
54100+ int first_key_changed;
54101+
54102+ node = cdata->params.from->node;
54103+
54104+ first_key_changed =
54105+ prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
54106+ info);
54107+ compact(node, &cinfo);
54108+
54109+ if (info) {
54110+ /* it is not called by node40_shift, so we have to take care
54111+ of changes on upper levels */
54112+ if (node_is_empty(node))
54113+ /* the entire contents of the node were deleted */
54114+ prepare_removal_node40(node, info);
54115+ else if (first_key_changed) {
54116+ prepare_for_update(NULL, node, info);
54117+ }
54118+ }
54119+
54120+ coord_clear_iplug(cdata->params.from);
54121+ coord_clear_iplug(cdata->params.to);
54122+
54123+ znode_make_dirty(node);
54124+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
54125+}
54126+
54127+/* this structure is used by shift method of node40 plugin */
54128+struct shift_params {
54129+ shift_direction pend; /* when @pend == append - we are shifting to
54130+ left, when @pend == prepend - to right */
54131+ coord_t wish_stop; /* when shifting to left this is last unit we
54132+ want shifted, when shifting to right - this
54133+ is set to unit we want to start shifting
54134+ from */
54135+ znode *target;
54136+ int everything; /* set to 1 if everything we had to shift was
54137+ shifted, 0 otherwise */
54138+
54139+ /* FIXME-VS: get rid of read_stop */
54140+
54141+ /* these are set by estimate_shift */
54142+ coord_t real_stop; /* this will be set to last unit which will be
54143+ really shifted */
54144+
54145+ /* coordinate, in the source node before the operation, of the unit which
54146+ becomes first after a shift to the left or last after a shift to the right */
54147+ union {
54148+ coord_t future_first;
54149+ coord_t future_last;
54150+ } u;
54151+
54152+ unsigned merging_units; /* number of units of first item which have to
54153+ be merged with last item of target node */
54154+ unsigned merging_bytes; /* number of bytes in those units */
54155+
54156+ unsigned entire; /* items shifted in their entirety */
54157+ unsigned entire_bytes; /* number of bytes in those items */
54158+
54159+ unsigned part_units; /* number of units of partially copied item */
54160+ unsigned part_bytes; /* number of bytes in those units */
54161+
54162+ unsigned shift_bytes; /* total number of bytes in items shifted (item
54163+ headers not included) */
54164+
54165+};
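+
+/* estimate_shift() below fills the byte counters so that the invariant
+   checked by assertion "vs-185" in copy() holds; a minimal consistency
+   checker sketch (hypothetical helper): */
+#if 0
+static int shift_params_consistent(const struct shift_params *shift)
+{
+        /* the bytes to shift split exactly into the merged units, the items
+           moved whole, and the partially copied item */
+        return shift->shift_bytes ==
+            shift->merging_bytes + shift->entire_bytes + shift->part_bytes;
+}
+#endif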
54166+
54167+static int item_creation_overhead(coord_t *item)
54168+{
54169+ return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
54170+}
54171+
54172+/* how many units are there in @source starting from source->unit_pos
54173+ but not further than @stop_coord */
54174+static int
54175+wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
54176+{
54177+ if (pend == SHIFT_LEFT) {
54178+ assert("vs-181", source->unit_pos == 0);
54179+ } else {
54180+ assert("vs-182",
54181+ source->unit_pos == coord_last_unit_pos(source));
54182+ }
54183+
54184+ if (source->item_pos != stop_coord->item_pos) {
54185+ /* @source and @stop_coord are different items */
54186+ return coord_last_unit_pos(source) + 1;
54187+ }
54188+
54189+ if (pend == SHIFT_LEFT) {
54190+ return stop_coord->unit_pos + 1;
54191+ } else {
54192+ return source->unit_pos - stop_coord->unit_pos + 1;
54193+ }
54194+}
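+
+/* For example (illustrative numbers): shifting left with @source at unit 0 of
+   a 10-unit item, wanted_units() returns 10 when @stop_coord lies in a later
+   item, and stop_coord->unit_pos + 1 == 4 when @stop_coord is unit 3 of the
+   same item. */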
54195+
54196+/* this calculates what can be copied from @shift->wish_stop.node to
54197+ @shift->target */
54198+static void
54199+estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
54200+{
54201+ unsigned target_free_space, size;
54202+ pos_in_node_t stop_item; /* item which estimating should not consider */
54203+ unsigned want; /* number of units of item we want shifted */
54204+ coord_t source; /* item being estimated */
54205+ item_plugin *iplug;
54206+
54207+ /* shifting to left/right starts from first/last units of
54208+ @shift->wish_stop.node */
54209+ if (shift->pend == SHIFT_LEFT) {
54210+ coord_init_first_unit(&source, shift->wish_stop.node);
54211+ } else {
54212+ coord_init_last_unit(&source, shift->wish_stop.node);
54213+ }
54214+ shift->real_stop = source;
54215+
54216+ /* free space in the target node */
54217+ target_free_space = znode_free_space(shift->target);
54218+
54219+ shift->everything = 0;
54220+ if (!node_is_empty(shift->target)) {
54221+ /* target node is not empty, check whether the boundary
54222+ items are mergeable */
54223+ coord_t to;
54224+
54225+ /* item we try to merge @source with */
54226+ if (shift->pend == SHIFT_LEFT) {
54227+ coord_init_last_unit(&to, shift->target);
54228+ } else {
54229+ coord_init_first_unit(&to, shift->target);
54230+ }
54231+
54232+ if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
54233+ &source) :
54234+ are_items_mergeable(&source, &to)) {
54235+ /* how many units of @source do we want to merge to
54236+ item @to */
54237+ want =
54238+ wanted_units(&source, &shift->wish_stop,
54239+ shift->pend);
54240+
54241+ /* how many units of @source we can merge to item
54242+ @to */
54243+ iplug = item_plugin_by_coord(&source);
54244+ if (iplug->b.can_shift != NULL)
54245+ shift->merging_units =
54246+ iplug->b.can_shift(target_free_space,
54247+ &source, shift->target,
54248+ shift->pend, &size,
54249+ want);
54250+ else {
54251+ shift->merging_units = 0;
54252+ size = 0;
54253+ }
54254+ shift->merging_bytes = size;
54255+ shift->shift_bytes += size;
54256+ /* update stop coord to be set to last unit of @source
54257+ we can merge to @target */
54258+ if (shift->merging_units)
54259+ /* at least one unit can be shifted */
54260+ shift->real_stop.unit_pos =
54261+ (shift->merging_units - source.unit_pos -
54262+ 1) * shift->pend;
54263+ else {
54264+ /* nothing can be shifted */
54265+ if (shift->pend == SHIFT_LEFT)
54266+ coord_init_before_first_item(&shift->
54267+ real_stop,
54268+ source.
54269+ node);
54270+ else
54271+ coord_init_after_last_item(&shift->
54272+ real_stop,
54273+ source.node);
54274+ }
54275+ assert("nikita-2081", shift->real_stop.unit_pos + 1);
54276+
54277+ if (shift->merging_units != want) {
54278+ /* we could not copy as many as we wanted, so
54279+ there is no reason to continue
54280+ estimating */
54281+ return;
54282+ }
54283+
54284+ target_free_space -= size;
54285+ coord_add_item_pos(&source, shift->pend);
54286+ }
54287+ }
54288+
54289+ /* position of the first item no part of which we want shifted */
54290+ stop_item = shift->wish_stop.item_pos + shift->pend;
54291+
54292+ /* calculate how many items can be copied into the given free
54293+ space in their entirety */
54294+ for (; source.item_pos != stop_item;
54295+ coord_add_item_pos(&source, shift->pend)) {
54296+ if (shift->pend == SHIFT_RIGHT)
54297+ source.unit_pos = coord_last_unit_pos(&source);
54298+
54299+ /* how many units of @source do we want to copy */
54300+ want = wanted_units(&source, &shift->wish_stop, shift->pend);
54301+
54302+ if (want == coord_last_unit_pos(&source) + 1) {
54303+ /* we want this item to be copied entirely */
54304+ size =
54305+ item_length_by_coord(&source) +
54306+ item_creation_overhead(&source);
54307+ if (size <= target_free_space) {
54308+ /* item fits into target node as whole */
54309+ target_free_space -= size;
54310+ shift->shift_bytes +=
54311+ size - item_creation_overhead(&source);
54312+ shift->entire_bytes +=
54313+ size - item_creation_overhead(&source);
54314+ shift->entire++;
54315+
54316+ /* update shift->real_stop coord to be set to
54317+ last unit of @source we can merge to
54318+ @target */
54319+ shift->real_stop = source;
54320+ if (shift->pend == SHIFT_LEFT)
54321+ shift->real_stop.unit_pos =
54322+ coord_last_unit_pos(&shift->
54323+ real_stop);
54324+ else
54325+ shift->real_stop.unit_pos = 0;
54326+ continue;
54327+ }
54328+ }
54329+
54330+ /* we reach here only for an item which does not fit into the
54331+ target node in its entirety. This item may be either
54332+ partially shifted, or not shifted at all. We will have to
54333+ create a new item in the target node, so decrease the amount
54334+ of free space by the item creation overhead. We can also
54335+ reach here if the stop coord is within this item */
54336+ if (target_free_space >=
54337+ (unsigned)item_creation_overhead(&source)) {
54338+ target_free_space -= item_creation_overhead(&source);
54339+ iplug = item_plugin_by_coord(&source);
54340+ if (iplug->b.can_shift) {
54341+ shift->part_units = iplug->b.can_shift(target_free_space,
54342+ &source,
54343+ NULL, /* target */
54344+ shift->pend,
54345+ &size,
54346+ want);
54347+ } else {
54348+ target_free_space = 0;
54349+ shift->part_units = 0;
54350+ size = 0;
54351+ }
54352+ } else {
54353+ target_free_space = 0;
54354+ shift->part_units = 0;
54355+ size = 0;
54356+ }
54357+ shift->part_bytes = size;
54358+ shift->shift_bytes += size;
54359+
54360+ /* set @shift->real_stop to last unit of @source we can merge
54361+ to @shift->target */
54362+ if (shift->part_units) {
54363+ shift->real_stop = source;
54364+ shift->real_stop.unit_pos =
54365+ (shift->part_units - source.unit_pos -
54366+ 1) * shift->pend;
54367+ assert("nikita-2082", shift->real_stop.unit_pos + 1);
54368+ }
54369+
54370+ if (want != shift->part_units)
54371+ /* not everything wanted was shifted */
54372+ return;
54373+ break;
54374+ }
54375+
54376+ shift->everything = 1;
54377+}
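+
+/* The middle phase of estimate_shift() is a greedy fit: items are moved whole
+   while body plus creation overhead still fits into the remaining free space;
+   a stand-alone sketch of just that phase (hypothetical names, fixed per-item
+   overhead assumed): */
+#if 0
+static unsigned toy_fit_whole(const unsigned *len, unsigned nr_items,
+                              unsigned free_space, unsigned overhead)
+{
+        unsigned i;
+
+        for (i = 0; i < nr_items; i++) {
+                if (len[i] + overhead > free_space)
+                        break;  /* first item that no longer fits whole */
+                free_space -= len[i] + overhead;
+        }
+        return i;       /* counted like shift->entire */
+}
+#endif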
54378+
54379+static void
54380+copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
54381+ shift_direction dir, unsigned free_space)
54382+{
54383+ item_plugin *iplug;
54384+
54385+ assert("nikita-1463", target != NULL);
54386+ assert("nikita-1464", source != NULL);
54387+ assert("nikita-1465", from + count <= coord_num_units(source));
54388+
54389+ iplug = item_plugin_by_coord(source);
54390+ assert("nikita-1468", iplug == item_plugin_by_coord(target));
54391+ iplug->b.copy_units(target, source, from, count, dir, free_space);
54392+
54393+ if (dir == SHIFT_RIGHT) {
54394+ /* FIXME-VS: this looks unnecessary. update_item_key was
54395+ already called by the copy_units method */
54396+ reiser4_key split_key;
54397+
54398+ assert("nikita-1469", target->unit_pos == 0);
54399+
54400+ unit_key_by_coord(target, &split_key);
54401+ node_plugin_by_coord(target)->update_item_key(target,
54402+ &split_key, NULL);
54403+ }
54404+}
54405+
54406+/* copy part of @shift->real_stop.node starting either from its beginning or
54407+ from its end and ending at @shift->real_stop to either the end or the
54408+ beginning of @shift->target */
54409+static void copy(struct shift_params *shift)
54410+{
54411+ node40_header *nh;
54412+ coord_t from;
54413+ coord_t to;
54414+ item_header40 *from_ih, *to_ih;
54415+ int free_space_start;
54416+ int new_items;
54417+ unsigned old_items;
54418+ int old_offset;
54419+ unsigned i;
54420+
54421+ nh = node40_node_header(shift->target);
54422+ free_space_start = nh40_get_free_space_start(nh);
54423+ old_items = nh40_get_num_items(nh);
54424+ new_items = shift->entire + (shift->part_units ? 1 : 0);
54425+ assert("vs-185",
54426+ shift->shift_bytes ==
54427+ shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
54428+
54429+ from = shift->wish_stop;
54430+
54431+ coord_init_first_unit(&to, shift->target);
54432+
54433+ /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
54434+ hence to.between is set to EMPTY_NODE above. Looks like we want it
54435+ to be AT_UNIT.
54436+
54437+ Oh, wonders of ->betweeness...
54438+
54439+ */
54440+ to.between = AT_UNIT;
54441+
54442+ if (shift->pend == SHIFT_LEFT) {
54443+ /* copying to left */
54444+
54445+ coord_set_item_pos(&from, 0);
54446+ from_ih = node40_ih_at(from.node, 0);
54447+
54448+ coord_set_item_pos(&to,
54449+ node40_num_of_items_internal(to.node) - 1);
54450+ if (shift->merging_units) {
54451+ /* expand last item, so that plugin methods will see
54452+ correct data */
54453+ free_space_start += shift->merging_bytes;
54454+ nh40_set_free_space_start(nh,
54455+ (unsigned)free_space_start);
54456+ nh40_set_free_space(nh,
54457+ nh40_get_free_space(nh) -
54458+ shift->merging_bytes);
54459+
54460+ /* appending last item of @target */
54461+ copy_units(&to, &from, 0, /* starting from 0-th unit */
54462+ shift->merging_units, SHIFT_LEFT,
54463+ shift->merging_bytes);
54464+ coord_inc_item_pos(&from);
54465+ from_ih--;
54466+ coord_inc_item_pos(&to);
54467+ }
54468+
54469+ to_ih = node40_ih_at(shift->target, old_items);
54470+ if (shift->entire) {
54471+ /* copy @entire items entirely */
54472+
54473+ /* copy item headers */
54474+ memcpy(to_ih - shift->entire + 1,
54475+ from_ih - shift->entire + 1,
54476+ shift->entire * sizeof(item_header40));
54477+ /* update item header offset */
54478+ old_offset = ih40_get_offset(from_ih);
54479+ /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation fewer per iteration */
54480+ for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
54481+ ih40_set_offset(to_ih,
54482+ ih40_get_offset(from_ih) -
54483+ old_offset + free_space_start);
54484+
54485+ /* copy item bodies */
54486+ memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */
54487+ shift->entire_bytes);
54488+
54489+ coord_add_item_pos(&from, (int)shift->entire);
54490+ coord_add_item_pos(&to, (int)shift->entire);
54491+ }
54492+
54493+ nh40_set_free_space_start(nh,
54494+ free_space_start +
54495+ shift->shift_bytes -
54496+ shift->merging_bytes);
54497+ nh40_set_free_space(nh,
54498+ nh40_get_free_space(nh) -
54499+ (shift->shift_bytes - shift->merging_bytes +
54500+ sizeof(item_header40) * new_items));
54501+
54502+ /* update node header */
54503+ node40_set_num_items(shift->target, nh, old_items + new_items);
54504+ assert("vs-170",
54505+ nh40_get_free_space(nh) < znode_size(shift->target));
54506+
54507+ if (shift->part_units) {
54508+ /* copy heading part (@part units) of @source item as
54509+ a new item into @target->node */
54510+
54511+ /* copy item header of partially copied item */
54512+ coord_set_item_pos(&to,
54513+ node40_num_of_items_internal(to.node)
54514+ - 1);
54515+ memcpy(to_ih, from_ih, sizeof(item_header40));
54516+ ih40_set_offset(to_ih,
54517+ nh40_get_free_space_start(nh) -
54518+ shift->part_bytes);
54519+ if (item_plugin_by_coord(&to)->b.init)
54520+ item_plugin_by_coord(&to)->b.init(&to, &from,
54521+ NULL);
54522+ copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
54523+ shift->part_bytes);
54524+ }
54525+
54526+ } else {
54527+ /* copying to right */
54528+
54529+ coord_set_item_pos(&from,
54530+ node40_num_of_items_internal(from.node) - 1);
54531+ from_ih = node40_ih_at_coord(&from);
54532+
54533+ coord_set_item_pos(&to, 0);
54534+
54535+ /* prepare space for new items */
54536+ memmove(zdata(to.node) + sizeof(node40_header) +
54537+ shift->shift_bytes,
54538+ zdata(to.node) + sizeof(node40_header),
54539+ free_space_start - sizeof(node40_header));
54540+ /* update item headers of moved items */
54541+ to_ih = node40_ih_at(to.node, 0);
54542+ /* the first item gets @merging_bytes longer; free space appears
54543+ at its beginning */
54544+ if (!node_is_empty(to.node))
54545+ ih40_set_offset(to_ih,
54546+ ih40_get_offset(to_ih) +
54547+ shift->shift_bytes -
54548+ shift->merging_bytes);
54549+
54550+ for (i = 1; i < old_items; i++)
54551+ ih40_set_offset(to_ih - i,
54552+ ih40_get_offset(to_ih - i) +
54553+ shift->shift_bytes);
54554+
54555+ /* move item headers to make space for new items */
54556+ memmove(to_ih - old_items + 1 - new_items,
54557+ to_ih - old_items + 1,
54558+ sizeof(item_header40) * old_items);
54559+ to_ih -= (new_items - 1);
54560+
54561+ nh40_set_free_space_start(nh,
54562+ free_space_start +
54563+ shift->shift_bytes);
54564+ nh40_set_free_space(nh,
54565+ nh40_get_free_space(nh) -
54566+ (shift->shift_bytes +
54567+ sizeof(item_header40) * new_items));
54568+
54569+ /* update node header */
54570+ node40_set_num_items(shift->target, nh, old_items + new_items);
54571+ assert("vs-170",
54572+ nh40_get_free_space(nh) < znode_size(shift->target));
54573+
54574+ if (shift->merging_units) {
54575+ coord_add_item_pos(&to, new_items);
54576+ to.unit_pos = 0;
54577+ to.between = AT_UNIT;
54578+ /* prepend first item of @to */
54579+ copy_units(&to, &from,
54580+ coord_last_unit_pos(&from) -
54581+ shift->merging_units + 1,
54582+ shift->merging_units, SHIFT_RIGHT,
54583+ shift->merging_bytes);
54584+ coord_dec_item_pos(&from);
54585+ from_ih++;
54586+ }
54587+
54588+ if (shift->entire) {
54589+ /* copy @entire items entirely */
54590+
54591+ /* copy item headers */
54592+ memcpy(to_ih, from_ih,
54593+ shift->entire * sizeof(item_header40));
54594+
54595+ /* update item header offset */
54596+ old_offset =
54597+ ih40_get_offset(from_ih + shift->entire - 1);
54598+ /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */
54599+ for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
54600+ ih40_set_offset(to_ih,
54601+ ih40_get_offset(from_ih) -
54602+ old_offset +
54603+ sizeof(node40_header) +
54604+ shift->part_bytes);
54605+ /* copy item bodies */
54606+ coord_add_item_pos(&from, -(int)(shift->entire - 1));
54607+ memcpy(zdata(to.node) + sizeof(node40_header) +
54608+ shift->part_bytes, item_by_coord_node40(&from),
54609+ shift->entire_bytes);
54610+ coord_dec_item_pos(&from);
54611+ }
54612+
54613+ if (shift->part_units) {
54614+ coord_set_item_pos(&to, 0);
54615+ to.unit_pos = 0;
54616+ to.between = AT_UNIT;
54617+ /* copy heading part (@part units) of @source item as
54618+ a new item into @target->node */
54619+
54620+ /* copy item header of partially copied item */
54621+ memcpy(to_ih, from_ih, sizeof(item_header40));
54622+ ih40_set_offset(to_ih, sizeof(node40_header));
54623+ if (item_plugin_by_coord(&to)->b.init)
54624+ item_plugin_by_coord(&to)->b.init(&to, &from,
54625+ NULL);
54626+ copy_units(&to, &from,
54627+ coord_last_unit_pos(&from) -
54628+ shift->part_units + 1, shift->part_units,
54629+ SHIFT_RIGHT, shift->part_bytes);
54630+ }
54631+ }
54632+}
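+
+/* The SHIFT_RIGHT branch above first opens room at the front of the target
+   for the incoming bodies; the same move on a plain buffer (sketch in
+   standard C, hypothetical names): */
+#if 0
+#include <string.h>
+
+static void toy_make_room_at_front(char *body, size_t used, size_t room)
+{
+        /* slide the existing bodies up by @room bytes; new items are then
+           copied into body[0..room) and all old offsets grow by @room */
+        memmove(body + room, body, used);
+}
+#endif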
54633+
54634+/* remove everything either before or after @shift->real_stop. The number of
54635+ items removed completely is returned */
54636+static int delete_copied(struct shift_params *shift)
54637+{
54638+ coord_t from;
54639+ coord_t to;
54640+ struct carry_cut_data cdata;
54641+
54642+ if (shift->pend == SHIFT_LEFT) {
54643+ /* we were shifting to the left, remove everything from the
54644+ beginning of @shift->real_stop.node up to
54645+ @shift->real_stop */
54646+ coord_init_first_unit(&from, shift->real_stop.node);
54647+ to = shift->real_stop;
54648+
54649+ /* store old coordinate of unit which will be first after
54650+ shift to left */
54651+ shift->u.future_first = to;
54652+ coord_next_unit(&shift->u.future_first);
54653+ } else {
54654+ /* we were shifting to the right, remove everything from
54655+ @shift->real_stop up to the end of
54656+ @shift->real_stop.node */
54657+ from = shift->real_stop;
54658+ coord_init_last_unit(&to, from.node);
54659+
54660+ /* store old coordinate of unit which will be last after
54661+ shift to right */
54662+ shift->u.future_last = from;
54663+ coord_prev_unit(&shift->u.future_last);
54664+ }
54665+
54666+ cdata.params.from = &from;
54667+ cdata.params.to = &to;
54668+ cdata.params.from_key = NULL;
54669+ cdata.params.to_key = NULL;
54670+ cdata.params.smallest_removed = NULL;
54671+ return cut_node40(&cdata, NULL);
54672+}
54673+
54674+/* something was moved between @left and @right. Add a carry operation to the
54675+ @info list so that carry updates the delimiting key between them */
54676+static int
54677+prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
54678+{
54679+ carry_op *op;
54680+ carry_node *cn;
54681+
54682+ if (info == NULL)
54683+ /* nowhere to send operation to. */
54684+ return 0;
54685+
54686+ if (!should_notify_parent(right))
54687+ return 0;
54688+
54689+ op = node_post_carry(info, COP_UPDATE, right, 1);
54690+ if (IS_ERR(op) || op == NULL)
54691+ return op ? PTR_ERR(op) : -EIO;
54692+
54693+ if (left != NULL) {
54694+ carry_node *reference;
54695+
54696+ if (info->doing)
54697+ reference = insert_carry_node(info->doing,
54698+ info->todo, left);
54699+ else
54700+ reference = op->node;
54701+ assert("nikita-2992", reference != NULL);
54702+ cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference);
54703+ if (IS_ERR(cn))
54704+ return PTR_ERR(cn);
54705+ cn->parent = 1;
54706+ cn->node = left;
54707+ if (ZF_ISSET(left, JNODE_ORPHAN))
54708+ cn->left_before = 1;
54709+ op->u.update.left = cn;
54710+ } else
54711+ op->u.update.left = NULL;
54712+ return 0;
54713+}
54714+
54715+/* plugin->u.node.prepare_removal
54716+ to delete a pointer to @empty from the tree add corresponding carry
54717+ operation (delete) to @info list */
54718+int prepare_removal_node40(znode * empty, carry_plugin_info * info)
54719+{
54720+ carry_op *op;
54721+ reiser4_tree *tree;
54722+
54723+ if (!should_notify_parent(empty))
54724+ return 0;
54725+ /* already on a road to Styx */
54726+ if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
54727+ return 0;
54728+ op = node_post_carry(info, COP_DELETE, empty, 1);
54729+ if (IS_ERR(op) || op == NULL)
54730+ return RETERR(op ? PTR_ERR(op) : -EIO);
54731+
54732+ op->u.delete.child = NULL;
54733+ op->u.delete.flags = 0;
54734+
54735+ /* fare thee well */
54736+ tree = znode_get_tree(empty);
54737+ read_lock_tree(tree);
54738+ write_lock_dk(tree);
54739+ znode_set_ld_key(empty, znode_get_rd_key(empty));
54740+ if (znode_is_left_connected(empty) && empty->left)
54741+ znode_set_rd_key(empty->left, znode_get_rd_key(empty));
54742+ write_unlock_dk(tree);
54743+ read_unlock_tree(tree);
54744+
54745+ ZF_SET(empty, JNODE_HEARD_BANSHEE);
54746+ return 0;
54747+}
54748+
54749+/* something was shifted from @insert_coord->node to @shift->target; update
54750+ @insert_coord correspondingly */
54751+static void
54752+adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
54753+ int including_insert_coord)
54754+{
54755+ /* item plugin was invalidated by shifting */
54756+ coord_clear_iplug(insert_coord);
54757+
54758+ if (node_is_empty(shift->wish_stop.node)) {
54759+ assert("vs-242", shift->everything);
54760+ if (including_insert_coord) {
54761+ if (shift->pend == SHIFT_RIGHT) {
54762+ /* set @insert_coord before first unit of
54763+ @shift->target node */
54764+ coord_init_before_first_item(insert_coord,
54765+ shift->target);
54766+ } else {
54767+ /* set @insert_coord after last in target node */
54768+ coord_init_after_last_item(insert_coord,
54769+ shift->target);
54770+ }
54771+ } else {
54772+ /* set @insert_coord inside of empty node. There is
54773+ only one possible coord within an empty
54774+ node. init_first_unit will set that coord */
54775+ coord_init_first_unit(insert_coord,
54776+ shift->wish_stop.node);
54777+ }
54778+ return;
54779+ }
54780+
54781+ if (shift->pend == SHIFT_RIGHT) {
54782+ /* there was shifting to right */
54783+ if (shift->everything) {
54784+ /* everything wanted was shifted */
54785+ if (including_insert_coord) {
54786+ /* @insert_coord is set before first unit of
54787+ @to node */
54788+ coord_init_before_first_item(insert_coord,
54789+ shift->target);
54790+ insert_coord->between = BEFORE_UNIT;
54791+ } else {
54792+ /* @insert_coord is set after last unit of
54793+ @insert->node */
54794+ coord_init_last_unit(insert_coord,
54795+ shift->wish_stop.node);
54796+ insert_coord->between = AFTER_UNIT;
54797+ }
54798+ }
54799+ return;
54800+ }
54801+
54802+ /* there was shifting to left */
54803+ if (shift->everything) {
54804+ /* everything wanted was shifted */
54805+ if (including_insert_coord) {
54806+ /* @insert_coord is set after last unit in @to node */
54807+ coord_init_after_last_item(insert_coord, shift->target);
54808+ } else {
54809+ /* @insert_coord is set before first unit in the same
54810+ node */
54811+ coord_init_before_first_item(insert_coord,
54812+ shift->wish_stop.node);
54813+ }
54814+ return;
54815+ }
54816+
54817+ /* FIXME-VS: the code below is complicated because with between ==
54818+ AFTER_ITEM unit_pos is set to 0 */
54819+
54820+ if (!removed) {
54821+ /* no items were shifted entirely */
54822+ assert("vs-195", shift->merging_units == 0
54823+ || shift->part_units == 0);
54824+
54825+ if (shift->real_stop.item_pos == insert_coord->item_pos) {
54826+ if (shift->merging_units) {
54827+ if (insert_coord->between == AFTER_UNIT) {
54828+ assert("nikita-1441",
54829+ insert_coord->unit_pos >=
54830+ shift->merging_units);
54831+ insert_coord->unit_pos -=
54832+ shift->merging_units;
54833+ } else if (insert_coord->between == BEFORE_UNIT) {
54834+ assert("nikita-2090",
54835+ insert_coord->unit_pos >
54836+ shift->merging_units);
54837+ insert_coord->unit_pos -=
54838+ shift->merging_units;
54839+ }
54840+
54841+ assert("nikita-2083",
54842+ insert_coord->unit_pos + 1);
54843+ } else {
54844+ if (insert_coord->between == AFTER_UNIT) {
54845+ assert("nikita-1442",
54846+ insert_coord->unit_pos >=
54847+ shift->part_units);
54848+ insert_coord->unit_pos -=
54849+ shift->part_units;
54850+ } else if (insert_coord->between == BEFORE_UNIT) {
54851+ assert("nikita-2089",
54852+ insert_coord->unit_pos >
54853+ shift->part_units);
54854+ insert_coord->unit_pos -=
54855+ shift->part_units;
54856+ }
54857+
54858+ assert("nikita-2084",
54859+ insert_coord->unit_pos + 1);
54860+ }
54861+ }
54862+ return;
54863+ }
54864+
54865+ /* we shifted to the left and there was not enough space for everything */
54866+ switch (insert_coord->between) {
54867+ case AFTER_UNIT:
54868+ case BEFORE_UNIT:
54869+ if (shift->real_stop.item_pos == insert_coord->item_pos)
54870+ insert_coord->unit_pos -= shift->part_units;
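+ /* fall through: item_pos is adjusted in both cases */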
54871+ case AFTER_ITEM:
54872+ coord_add_item_pos(insert_coord, -removed);
54873+ break;
54874+ default:
54875+ impossible("nikita-2087", "not ready");
54876+ }
54877+ assert("nikita-2085", insert_coord->unit_pos + 1);
54878+}
54879+
54880+static int call_shift_hooks(struct shift_params *shift)
54881+{
54882+ unsigned i, shifted;
54883+ coord_t coord;
54884+ item_plugin *iplug;
54885+
54886+ assert("vs-275", !node_is_empty(shift->target));
54887+
54888+ /* number of items shift touches */
54889+ shifted =
54890+ shift->entire + (shift->merging_units ? 1 : 0) +
54891+ (shift->part_units ? 1 : 0);
54892+
54893+ if (shift->pend == SHIFT_LEFT) {
54894+ /* moved items are at the end */
54895+ coord_init_last_unit(&coord, shift->target);
54896+ coord.unit_pos = 0;
54897+
54898+ assert("vs-279", shift->pend == 1);
54899+ for (i = 0; i < shifted; i++) {
54900+ unsigned from, count;
54901+
54902+ iplug = item_plugin_by_coord(&coord);
54903+ if (i == 0 && shift->part_units) {
54904+ assert("vs-277",
54905+ coord_num_units(&coord) ==
54906+ shift->part_units);
54907+ count = shift->part_units;
54908+ from = 0;
54909+ } else if (i == shifted - 1 && shift->merging_units) {
54910+ count = shift->merging_units;
54911+ from = coord_num_units(&coord) - count;
54912+ } else {
54913+ count = coord_num_units(&coord);
54914+ from = 0;
54915+ }
54916+
54917+ if (iplug->b.shift_hook) {
54918+ iplug->b.shift_hook(&coord, from, count,
54919+ shift->wish_stop.node);
54920+ }
54921+ coord_add_item_pos(&coord, -shift->pend);
54922+ }
54923+ } else {
54924+ /* moved items are at the beginning */
54925+ coord_init_first_unit(&coord, shift->target);
54926+
54927+ assert("vs-278", shift->pend == -1);
54928+ for (i = 0; i < shifted; i++) {
54929+ unsigned from, count;
54930+
54931+ iplug = item_plugin_by_coord(&coord);
54932+ if (i == 0 && shift->part_units) {
54933+ assert("vs-277",
54934+ coord_num_units(&coord) ==
54935+ shift->part_units);
54936+ count = coord_num_units(&coord);
54937+ from = 0;
54938+ } else if (i == shifted - 1 && shift->merging_units) {
54939+ count = shift->merging_units;
54940+ from = 0;
54941+ } else {
54942+ count = coord_num_units(&coord);
54943+ from = 0;
54944+ }
54945+
54946+ if (iplug->b.shift_hook) {
54947+ iplug->b.shift_hook(&coord, from, count,
54948+ shift->wish_stop.node);
54949+ }
54950+ coord_add_item_pos(&coord, -shift->pend);
54951+ }
54952+ }
54953+
54954+ return 0;
54955+}
54956+
54957+/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */
54958+static int
54959+unit_moved_left(const struct shift_params *shift, const coord_t * old)
54960+{
54961+ assert("vs-944", shift->real_stop.node == old->node);
54962+
54963+ if (shift->real_stop.item_pos < old->item_pos)
54964+ return 0;
54965+ if (shift->real_stop.item_pos == old->item_pos) {
54966+ if (shift->real_stop.unit_pos < old->unit_pos)
54967+ return 0;
54968+ }
54969+ return 1;
54970+}
54971+
54972+/* shift to right is completed. Return 1 if unit @old was moved to right
54973+ neighbor */
54974+static int
54975+unit_moved_right(const struct shift_params *shift, const coord_t * old)
54976+{
54977+ assert("vs-944", shift->real_stop.node == old->node);
54978+
54979+ if (shift->real_stop.item_pos > old->item_pos)
54980+ return 0;
54981+ if (shift->real_stop.item_pos == old->item_pos) {
54982+ if (shift->real_stop.unit_pos > old->unit_pos)
54983+ return 0;
54984+ }
54985+ return 1;
54986+}
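+
+/* Both predicates above are lexicographic comparisons on (item_pos,
+   unit_pos); the left-shift test written compactly (sketch, hypothetical
+   name): */
+#if 0
+static int toy_at_or_before(int stop_item, int stop_unit, int item, int unit)
+{
+        /* 1 iff (item, unit) <= (stop_item, stop_unit) lexicographically,
+           i.e. the unit was covered by a shift to the left */
+        return item < stop_item || (item == stop_item && unit <= stop_unit);
+}
+#endif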
54987+
54988+/* coord @old was set in the node from which the shift was performed. What was
54989+ shifted is stored in @shift. Set @new to where @old points after the shift */
54990+static coord_t *adjust_coord2(const struct shift_params *shift,
54991+ const coord_t * old, coord_t * new)
54992+{
54993+ coord_clear_iplug(new);
54994+ new->between = old->between;
54995+
54997+ if (old->node == shift->target) {
54998+ if (shift->pend == SHIFT_LEFT) {
54999+ /* coord which is set inside of left neighbor does not
55000+ change during shift to left */
55001+ coord_dup(new, old);
55002+ return new;
55003+ }
55004+ new->node = old->node;
55005+ coord_set_item_pos(new,
55006+ old->item_pos + shift->entire +
55007+ (shift->part_units ? 1 : 0));
55008+ new->unit_pos = old->unit_pos;
55009+ if (old->item_pos == 0 && shift->merging_units)
55010+ new->unit_pos += shift->merging_units;
55011+ return new;
55012+ }
55013+
55014+ assert("vs-977", old->node == shift->wish_stop.node);
55015+ if (shift->pend == SHIFT_LEFT) {
55016+ if (unit_moved_left(shift, old)) {
55017+ /* unit @old moved to left neighbor. Calculate its
55018+ coordinate there */
55019+ new->node = shift->target;
55020+ coord_set_item_pos(new,
55021+ node_num_items(shift->target) -
55022+ shift->entire -
55023+ (shift->part_units ? 1 : 0) +
55024+ old->item_pos);
55025+
55026+ new->unit_pos = old->unit_pos;
55027+ if (shift->merging_units) {
55028+ coord_dec_item_pos(new);
55029+ if (old->item_pos == 0) {
55030+ /* unit_pos only changes if item got
55031+ merged */
55032+ new->unit_pos =
55033+ coord_num_units(new) -
55034+ (shift->merging_units -
55035+ old->unit_pos);
55036+ }
55037+ }
55038+ } else {
55039+ /* unit @old did not move to left neighbor.
55040+
55041+ Use _nocheck, because @old is outside of its node.
55042+ */
55043+ coord_dup_nocheck(new, old);
55044+ coord_add_item_pos(new,
55045+ -shift->u.future_first.item_pos);
55046+ if (new->item_pos == 0)
55047+ new->unit_pos -= shift->u.future_first.unit_pos;
55048+ }
55049+ } else {
55050+ if (unit_moved_right(shift, old)) {
55051+ /* unit @old moved to right neighbor */
55052+ new->node = shift->target;
55053+ coord_set_item_pos(new,
55054+ old->item_pos -
55055+ shift->real_stop.item_pos);
55056+ if (new->item_pos == 0) {
55057+ /* unit @old might change unit pos */
55058+ coord_set_item_pos(new,
55059+ old->unit_pos -
55060+ shift->real_stop.unit_pos);
55061+ }
55062+ } else {
55063+ /* unit @old did not move to right neighbor, therefore
55064+ it did not change */
55065+ coord_dup(new, old);
55066+ }
55067+ }
55068+ coord_set_iplug(new, item_plugin_by_coord(new));
55069+ return new;
55070+}
55071+
55072+/* this is called when shift is completed (something of source node is copied
55073+ to target and deleted in source) to update all taps set in current
55074+ context */
55075+static void update_taps(const struct shift_params *shift)
55076+{
55077+ tap_t *tap;
55078+ coord_t new;
55079+
55080+ for_all_taps(tap) {
55081+ /* update only taps set to nodes participating in shift */
55082+ if (tap->coord->node == shift->wish_stop.node
55083+ || tap->coord->node == shift->target)
55084+ tap_to_coord(tap,
55085+ adjust_coord2(shift, tap->coord, &new));
55086+ }
55087+}
55088+
55089+#if REISER4_DEBUG
55090+
55091+struct shift_check {
55092+ reiser4_key key;
55093+ __u16 plugin_id;
55094+ union {
55095+ __u64 bytes;
55096+ __u64 entries;
55097+ void *unused;
55098+ } u;
55099+};
55100+
55101+void *shift_check_prepare(const znode * left, const znode * right)
55102+{
55103+ pos_in_node_t i, nr_items;
55104+ int mergeable;
55105+ struct shift_check *data;
55106+ item_header40 *ih;
55107+
55108+ if (node_is_empty(left) || node_is_empty(right))
55109+ mergeable = 0;
55110+ else {
55111+ coord_t l, r;
55112+
55113+ coord_init_last_unit(&l, left);
55114+ coord_init_first_unit(&r, right);
55115+ mergeable = are_items_mergeable(&l, &r);
55116+ }
55117+ nr_items =
55118+ node40_num_of_items_internal(left) +
55119+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
55120+ data =
55121+ kmalloc(sizeof(struct shift_check) * nr_items,
55122+ reiser4_ctx_gfp_mask_get());
55123+ if (data != NULL) {
55124+ coord_t coord;
55125+ pos_in_node_t item_pos;
55126+
55127+ coord_init_first_unit(&coord, left);
55128+ i = 0;
55129+
55130+ for (item_pos = 0;
55131+ item_pos < node40_num_of_items_internal(left);
55132+ item_pos++) {
55133+
55134+ coord_set_item_pos(&coord, item_pos);
55135+ ih = node40_ih_at_coord(&coord);
55136+
55137+ data[i].key = ih->key;
55138+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
55139+ switch (data[i].plugin_id) {
55140+ case CTAIL_ID:
55141+ case FORMATTING_ID:
55142+ data[i].u.bytes = coord_num_units(&coord);
55143+ break;
55144+ case EXTENT_POINTER_ID:
55145+ data[i].u.bytes =
55146+ reiser4_extent_size(&coord,
55147+ coord_num_units(&coord));
55148+ break;
55149+ case COMPOUND_DIR_ID:
55150+ data[i].u.entries = coord_num_units(&coord);
55151+ break;
55152+ default:
55153+ data[i].u.unused = NULL;
55154+ break;
55155+ }
55156+ i++;
55157+ }
55158+
55159+ coord_init_first_unit(&coord, right);
55160+
55161+ if (mergeable) {
55162+ assert("vs-1609", i != 0);
55163+
55164+ ih = node40_ih_at_coord(&coord);
55165+
55166+ assert("vs-1589",
55167+ data[i - 1].plugin_id ==
55168+ le16_to_cpu(get_unaligned(&ih->plugin_id)));
55169+ switch (data[i - 1].plugin_id) {
55170+ case CTAIL_ID:
55171+ case FORMATTING_ID:
55172+ data[i - 1].u.bytes += coord_num_units(&coord);
55173+ break;
55174+ case EXTENT_POINTER_ID:
55175+ data[i - 1].u.bytes +=
55176+ reiser4_extent_size(&coord,
55177+ coord_num_units(&coord));
55178+ break;
55179+ case COMPOUND_DIR_ID:
55180+ data[i - 1].u.entries +=
55181+ coord_num_units(&coord);
55182+ break;
55183+ default:
55184+ impossible("vs-1605", "wrong mergeable item");
55185+ break;
55186+ }
55187+ item_pos = 1;
55188+ } else
55189+ item_pos = 0;
55190+ for (; item_pos < node40_num_of_items_internal(right);
55191+ item_pos++) {
55192+
55193+ assert("vs-1604", i < nr_items);
55194+ coord_set_item_pos(&coord, item_pos);
55195+ ih = node40_ih_at_coord(&coord);
55196+
55197+ data[i].key = ih->key;
55198+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
55199+ switch (data[i].plugin_id) {
55200+ case CTAIL_ID:
55201+ case FORMATTING_ID:
55202+ data[i].u.bytes = coord_num_units(&coord);
55203+ break;
55204+ case EXTENT_POINTER_ID:
55205+ data[i].u.bytes =
55206+ reiser4_extent_size(&coord,
55207+ coord_num_units(&coord));
55208+ break;
55209+ case COMPOUND_DIR_ID:
55210+ data[i].u.entries = coord_num_units(&coord);
55211+ break;
55212+ default:
55213+ data[i].u.unused = NULL;
55214+ break;
55215+ }
55216+ i++;
55217+ }
55218+ assert("vs-1606", i == nr_items);
55219+ }
55220+ return data;
55221+}
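+
+/* The array built above holds one entry per item of the combined
+   left-then-right sequence, with the boundary pair collapsed into a single
+   entry when mergeable; shift_check() below replays the same walk after the
+   shift and compares keys, plugin ids and sizes. */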
55222+
55223+void shift_check(void *vp, const znode * left, const znode * right)
55224+{
55225+ pos_in_node_t i, nr_items;
55226+ coord_t coord;
55227+ __u64 last_bytes;
55228+ int mergeable;
55229+ item_header40 *ih;
55230+ pos_in_node_t item_pos;
55231+ struct shift_check *data;
55232+
55233+ data = (struct shift_check *)vp;
55234+
55235+ if (data == NULL)
55236+ return;
55237+
55238+ if (node_is_empty(left) || node_is_empty(right))
55239+ mergeable = 0;
55240+ else {
55241+ coord_t l, r;
55242+
55243+ coord_init_last_unit(&l, left);
55244+ coord_init_first_unit(&r, right);
55245+ mergeable = are_items_mergeable(&l, &r);
55246+ }
55247+
55248+ nr_items =
55249+ node40_num_of_items_internal(left) +
55250+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
55251+
55252+ i = 0;
55253+ last_bytes = 0;
55254+
55255+ coord_init_first_unit(&coord, left);
55256+
55257+ for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
55258+ item_pos++) {
55259+
55260+ coord_set_item_pos(&coord, item_pos);
55261+ ih = node40_ih_at_coord(&coord);
55262+
55263+ assert("vs-1611", i == item_pos);
55264+ assert("vs-1590", keyeq(&ih->key, &data[i].key));
55265+ assert("vs-1591",
55266+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
55267+ if ((i < (node40_num_of_items_internal(left) - 1))
55268+ || !mergeable) {
55269+ switch (data[i].plugin_id) {
55270+ case CTAIL_ID:
55271+ case FORMATTING_ID:
55272+ assert("vs-1592",
55273+ data[i].u.bytes ==
55274+ coord_num_units(&coord));
55275+ break;
55276+ case EXTENT_POINTER_ID:
55277+ assert("vs-1593",
55278+ data[i].u.bytes ==
55279+ reiser4_extent_size(&coord,
55280+ coord_num_units
55281+ (&coord)));
55282+ break;
55283+ case COMPOUND_DIR_ID:
55284+ assert("vs-1594",
55285+ data[i].u.entries ==
55286+ coord_num_units(&coord));
55287+ break;
55288+ default:
55289+ break;
55290+ }
55291+ }
55292+ if (item_pos == (node40_num_of_items_internal(left) - 1)
55293+ && mergeable) {
55294+ switch (data[i].plugin_id) {
55295+ case CTAIL_ID:
55296+ case FORMATTING_ID:
55297+ last_bytes = coord_num_units(&coord);
55298+ break;
55299+ case EXTENT_POINTER_ID:
55300+ last_bytes =
55301+ reiser4_extent_size(&coord,
55302+ coord_num_units(&coord));
55303+ break;
55304+ case COMPOUND_DIR_ID:
55305+ last_bytes = coord_num_units(&coord);
55306+ break;
55307+ default:
55308+ impossible("vs-1595", "wrong mergeable item");
55309+ break;
55310+ }
55311+ }
55312+ i++;
55313+ }
55314+
55315+ coord_init_first_unit(&coord, right);
55316+ if (mergeable) {
55317+ ih = node40_ih_at_coord(&coord);
55318+
55319+ assert("vs-1589",
55320+ data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
55321+ assert("vs-1608", last_bytes != 0);
55322+ switch (data[i - 1].plugin_id) {
55323+ case CTAIL_ID:
55324+ case FORMATTING_ID:
55325+ assert("vs-1596",
55326+ data[i - 1].u.bytes ==
55327+ last_bytes + coord_num_units(&coord));
55328+ break;
55329+
55330+ case EXTENT_POINTER_ID:
55331+ assert("vs-1597",
55332+ data[i - 1].u.bytes ==
55333+ last_bytes + reiser4_extent_size(&coord,
55334+ coord_num_units
55335+ (&coord)));
55336+ break;
55337+
55338+ case COMPOUND_DIR_ID:
55339+ assert("vs-1598",
55340+ data[i - 1].u.bytes ==
55341+ last_bytes + coord_num_units(&coord));
55342+ break;
55343+ default:
55344+ impossible("vs-1599", "wrong mergeable item");
55345+ break;
55346+ }
55347+ item_pos = 1;
55348+ } else
55349+ item_pos = 0;
55350+
55351+ for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
55352+
55353+ coord_set_item_pos(&coord, item_pos);
55354+ ih = node40_ih_at_coord(&coord);
55355+
55356+ assert("vs-1612", keyeq(&ih->key, &data[i].key));
55357+ assert("vs-1613",
55358+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
55359+ switch (data[i].plugin_id) {
55360+ case CTAIL_ID:
55361+ case FORMATTING_ID:
55362+ assert("vs-1600",
55363+ data[i].u.bytes == coord_num_units(&coord));
55364+ break;
55365+ case EXTENT_POINTER_ID:
55366+ assert("vs-1601",
55367+ data[i].u.bytes ==
55368+ reiser4_extent_size(&coord,
55369+ coord_num_units
55370+ (&coord)));
55371+ break;
55372+ case COMPOUND_DIR_ID:
55373+ assert("vs-1602",
55374+ data[i].u.entries == coord_num_units(&coord));
55375+ break;
55376+ default:
55377+ break;
55378+ }
55379+ i++;
55380+ }
55381+
55382+ assert("vs-1603", i == nr_items);
55383+ kfree(data);
55384+}
55385+
55386+#endif
55387+
55388+/* plugin->u.node.shift
55389+ look for description of this method in plugin/node/node.h */
55390+int shift_node40(coord_t * from, znode * to, shift_direction pend,
55391+ int delete_child, /* if set to 1 and @from->node becomes empty, it will be deleted from the tree */
55392+ int including_stop_coord, carry_plugin_info * info)
55393+{
55394+ struct shift_params shift;
55395+ int result;
55396+ znode *left, *right;
55397+ znode *source;
55398+ int target_empty;
55399+
55400+ assert("nikita-2161", coord_check(from));
55401+
55402+ memset(&shift, 0, sizeof(shift));
55403+ shift.pend = pend;
55404+ shift.wish_stop = *from;
55405+ shift.target = to;
55406+
55407+ assert("nikita-1473", znode_is_write_locked(from->node));
55408+ assert("nikita-1474", znode_is_write_locked(to));
55409+
55410+ source = from->node;
55411+
55412+ /* set @shift.wish_stop to rightmost/leftmost unit among units we want
55413+ shifted */
55414+ if (pend == SHIFT_LEFT) {
55415+ result = coord_set_to_left(&shift.wish_stop);
55416+ left = to;
55417+ right = from->node;
55418+ } else {
55419+ result = coord_set_to_right(&shift.wish_stop);
55420+ left = from->node;
55421+ right = to;
55422+ }
55423+
55424+ if (result) {
55425+ /* move insertion coord even if there is nothing to move */
55426+ if (including_stop_coord) {
55427+ /* move insertion coord (@from) */
55428+ if (pend == SHIFT_LEFT) {
55429+ /* after last item in target node */
55430+ coord_init_after_last_item(from, to);
55431+ } else {
55432+ /* before first item in target node */
55433+ coord_init_before_first_item(from, to);
55434+ }
55435+ }
55436+
55437+ if (delete_child && node_is_empty(shift.wish_stop.node))
55438+ result =
55439+ prepare_removal_node40(shift.wish_stop.node, info);
55440+ else
55441+ result = 0;
55442+ /* there is nothing to shift */
55443+ assert("nikita-2078", coord_check(from));
55444+ return result;
55445+ }
55446+
55447+ target_empty = node_is_empty(to);
55448+
55449+ /* when first node plugin with item body compression is implemented,
55450+ this must be changed to call node specific plugin */
55451+
55452+	/* shift->stop_coord is updated to the last unit which will really be
55453+	   shifted */
55454+ estimate_shift(&shift, get_current_context());
55455+ if (!shift.shift_bytes) {
55456+ /* we could not shift anything */
55457+ assert("nikita-2079", coord_check(from));
55458+ return 0;
55459+ }
55460+
55461+ copy(&shift);
55462+
55463+	/* the result of this call is important: it is used by adjust_coord() below */
55464+ result = delete_copied(&shift);
55465+
55466+ assert("vs-1610", result >= 0);
55467+ assert("vs-1471",
55468+ ((reiser4_context *) current->journal_info)->magic ==
55469+ context_magic);
55470+
55471+	/* an item which has been moved from one node to another might want
55472+	   to do something on that event. This can be done by the item's
55473+	   shift_hook method, which is now called for every moved item */
55474+ call_shift_hooks(&shift);
55475+
55476+ assert("vs-1472",
55477+ ((reiser4_context *) current->journal_info)->magic ==
55478+ context_magic);
55479+
55480+ update_taps(&shift);
55481+
55482+ assert("vs-1473",
55483+ ((reiser4_context *) current->journal_info)->magic ==
55484+ context_magic);
55485+
55486+	/* adjust the @from pointer in accordance with the @including_stop_coord
55487+	   flag and the amount of data that was actually shifted */
55488+ adjust_coord(from, &shift, result, including_stop_coord);
55489+
55490+ if (target_empty)
55491+ /*
55492+ * items were shifted into empty node. Update delimiting key.
55493+ */
55494+ result = prepare_for_update(NULL, left, info);
55495+
55496+ /* add update operation to @info, which is the list of operations to
55497+ be performed on a higher level */
55498+ result = prepare_for_update(left, right, info);
55499+ if (!result && node_is_empty(source) && delete_child) {
55500+		/* all contents of @from->node are moved to @to and @from->node
55501+		   has to be removed from the tree, so, on the higher level, we
55502+		   will be removing the pointer to the node @from->node */
55503+ result = prepare_removal_node40(source, info);
55504+ }
55505+ assert("nikita-2080", coord_check(from));
55506+ return result ? result : (int)shift.shift_bytes;
55507+}
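/* Illustrative summary added by the editor (not part of the patch):
   shift_node40() above is a pipeline. estimate_shift() decides how much
   data fits into the target, copy() moves it, delete_copied() removes it
   from the source, call_shift_hooks() and update_taps() notify moved
   items and trackers, adjust_coord() repositions @from, and finally
   prepare_for_update()/prepare_removal_node40() queue the delimiting-key
   update and, if needed, source-node removal for the parent level. */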
55508+
55509+/* plugin->u.node.fast_insert()
55510+ look for description of this method in plugin/node/node.h */
55511+int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55512+{
55513+ return 1;
55514+}
55515+
55516+/* plugin->u.node.fast_paste()
55517+ look for description of this method in plugin/node/node.h */
55518+int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55519+{
55520+ return 1;
55521+}
55522+
55523+/* plugin->u.node.fast_cut()
55524+ look for description of this method in plugin/node/node.h */
55525+int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55526+{
55527+ return 1;
55528+}
55529+
55530+/* plugin->u.node.modify - not defined */
55531+
55532+/* plugin->u.node.max_item_size */
55533+int max_item_size_node40(void)
55534+{
55535+ return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) -
55536+ sizeof(item_header40);
55537+}
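/* Illustrative sketch added by the editor (not from the reiser4 sources):
   a worked instance of the max_item_size_node40() formula above, assuming
   a 4096-byte block, the packed 28-byte node40_header declared in node40.h
   below (2+2+2+2+4+4+8+2+1+1), and the 30-byte item_header40 implied by
   its offset annotations (24-byte key plus three 16-bit fields). */
#include <assert.h>

int main(void)
{
	const int block_size = 4096;	/* assumed s_blocksize */
	const int node_header = 28;	/* sizeof(node40_header) */
	const int item_header = 30;	/* sizeof(item_header40) */

	/* the largest item that fits beside its header in one node */
	assert(block_size - node_header - item_header == 4038);
	return 0;
}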
55538+
55539+/* plugin->u.node.set_item_plugin */
55540+int set_item_plugin_node40(coord_t *coord, item_id id)
55541+{
55542+ item_header40 *ih;
55543+
55544+ ih = node40_ih_at_coord(coord);
55545+ put_unaligned(cpu_to_le16(id), &ih->plugin_id);
55546+ coord->iplugid = id;
55547+ return 0;
55548+}
55549+
55550+/*
55551+ Local variables:
55552+ c-indentation-style: "K&R"
55553+ mode-name: "LC"
55554+ c-basic-offset: 8
55555+ tab-width: 8
55556+ fill-column: 120
55557+ scroll-step: 1
55558+ End:
55559+*/
55560diff -urN linux-2.6.23.orig/fs/reiser4/plugin/node/node40.h linux-2.6.23/fs/reiser4/plugin/node/node40.h
55561--- linux-2.6.23.orig/fs/reiser4/plugin/node/node40.h 1970-01-01 03:00:00.000000000 +0300
55562+++ linux-2.6.23/fs/reiser4/plugin/node/node40.h 2007-12-04 16:49:30.000000000 +0300
55563@@ -0,0 +1,125 @@
55564+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55565+
55566+#if !defined( __REISER4_NODE40_H__ )
55567+#define __REISER4_NODE40_H__
55568+
55569+#include "../../forward.h"
55570+#include "../../dformat.h"
55571+#include "node.h"
55572+
55573+#include <linux/types.h>
55574+
55575+/* format of node header for 40 node layouts. Keep bloat out of this struct. */
55576+typedef struct node40_header {
55577+ /* identifier of node plugin. Must be located at the very beginning
55578+ of a node. */
55579+ common_node_header common_header; /* this is 16 bits */
55580+	/* number of items. Should be the first element in the node header,
55581+	   because we have not yet finally decided whether it should go into
55582+	   common_header instead.
55583+	 */
55584+/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
55585+ * node format at compile time, and it is this one, accesses to these fields
55586+ * do not go through a function dereference (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */
55587+ d16 nr_items;
55588+ /* free space in node measured in bytes */
55589+ d16 free_space;
55590+ /* offset to start of free space in node */
55591+ d16 free_space_start;
55592+ /* for reiser4_fsck. When information about what is a free
55593+ block is corrupted, and we try to recover everything even
55594+ if marked as freed, then old versions of data may
55595+ duplicate newer versions, and this field allows us to
55596+ restore the newer version. Also useful for when users
55597+ who don't have the new trashcan installed on their linux distro
55598+ delete the wrong files and send us desperate emails
55599+ offering $25 for them back. */
55600+
55601+ /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */
55602+ d32 magic;
55603+ /* flushstamp is made of mk_id and write_counter. mk_id is an
55604+ id generated randomly at mkreiserfs time. So we can just
55605+	   skip all nodes with different mk_id. write_counter is a d64
55606+	   incrementing counter of writes to disk. It is used for
55607+ choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */
55608+
55609+ d32 mkfs_id;
55610+ d64 flush_id;
55611+ /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
55612+ and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
55613+ d16 flags;
55614+
55615+ /* 1 is leaf level, 2 is twig level, root is the numerically
55616+ largest level */
55617+ d8 level;
55618+
55619+ d8 pad;
55620+} PACKED node40_header;
55621+
55622+/* item headers are not standard across all node layouts, pass
55623+ pos_in_node to functions instead */
55624+typedef struct item_header40 {
55625+ /* key of item */
55626+ /* 0 */ reiser4_key key;
55627+ /* offset from start of a node measured in 8-byte chunks */
55628+ /* 24 */ d16 offset;
55629+ /* 26 */ d16 flags;
55630+ /* 28 */ d16 plugin_id;
55631+} PACKED item_header40;
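/* Illustrative sketch added by the editor (hypothetical helper, not from
   the reiser4 sources): per the layout comments in node.h, item headers
   sit at the tail of the node, with the header of item 0 occupying the
   last sizeof(item_header40) bytes, so the header of item @pos is found
   by indexing backwards from the node's end: */
static inline item_header40 *sketch_ih_at(char *node_data,
					  unsigned node_size, int pos)
{
	return (item_header40 *)(node_data + node_size) - pos - 1;
}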
55632+
55633+size_t item_overhead_node40(const znode * node, flow_t * aflow);
55634+size_t free_space_node40(znode * node);
55635+node_search_result lookup_node40(znode * node, const reiser4_key * key,
55636+ lookup_bias bias, coord_t * coord);
55637+int num_of_items_node40(const znode * node);
55638+char *item_by_coord_node40(const coord_t * coord);
55639+int length_by_coord_node40(const coord_t * coord);
55640+item_plugin *plugin_by_coord_node40(const coord_t * coord);
55641+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
55642+size_t estimate_node40(znode * node);
55643+int check_node40(const znode * node, __u32 flags, const char **error);
55644+int parse_node40(znode * node);
55645+int init_node40(znode * node);
55646+#ifdef GUESS_EXISTS
55647+int guess_node40(const znode * node);
55648+#endif
55649+void change_item_size_node40(coord_t * coord, int by);
55650+int create_item_node40(coord_t * target, const reiser4_key * key,
55651+ reiser4_item_data * data, carry_plugin_info * info);
55652+void update_item_key_node40(coord_t * target, const reiser4_key * key,
55653+ carry_plugin_info * info);
55654+int kill_node40(struct carry_kill_data *, carry_plugin_info *);
55655+int cut_node40(struct carry_cut_data *, carry_plugin_info *);
55656+int shift_node40(coord_t * from, znode * to, shift_direction pend,
55657+ /* if @from->node becomes
55658+ empty - it will be deleted from
55659+ the tree if this is set to 1
55660+ */
55661+ int delete_child, int including_stop_coord,
55662+ carry_plugin_info * info);
55663+
55664+int fast_insert_node40(const coord_t * coord);
55665+int fast_paste_node40(const coord_t * coord);
55666+int fast_cut_node40(const coord_t * coord);
55667+int max_item_size_node40(void);
55668+int prepare_removal_node40(znode * empty, carry_plugin_info * info);
55669+int set_item_plugin_node40(coord_t * coord, item_id id);
55670+int shrink_item_node40(coord_t * coord, int delta);
55671+
55672+#if REISER4_DEBUG
55673+void *shift_check_prepare(const znode *left, const znode *right);
55674+void shift_check(void *vp, const znode *left, const znode *right);
55675+#endif
55676+
55677+/* __REISER4_NODE40_H__ */
55678+#endif
55679+/*
55680+ Local variables:
55681+ c-indentation-style: "K&R"
55682+ mode-name: "LC"
55683+ c-basic-offset: 8
55684+ tab-width: 8
55685+ fill-column: 120
55686+ scroll-step: 1
55687+ End:
55688+*/
55689diff -urN linux-2.6.23.orig/fs/reiser4/plugin/node/node.c linux-2.6.23/fs/reiser4/plugin/node/node.c
55690--- linux-2.6.23.orig/fs/reiser4/plugin/node/node.c 1970-01-01 03:00:00.000000000 +0300
55691+++ linux-2.6.23/fs/reiser4/plugin/node/node.c 2007-12-04 16:49:30.000000000 +0300
55692@@ -0,0 +1,131 @@
55693+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55694+
55695+/* Node plugin interface.
55696+
55697+ Description: The tree provides the abstraction of flows, which it
55698+ internally fragments into items which it stores in nodes.
55699+
55700+ A key_atom is a piece of data bound to a single key.
55701+
55702+ For reasonable space efficiency to be achieved it is often
55703+ necessary to store key_atoms in the nodes in the form of items, where
55704+ an item is a sequence of key_atoms of the same or similar type. It is
55705+ more space-efficient, because the item can implement (very)
55706+ efficient compression of key_atom bodies using internal knowledge
55707+ about their semantics, and it can often avoid having a key for each
55708+ key_atom. Each type of item has specific operations implemented by its
55709+ item handler (see balance.c).
55710+
55711+ Rationale: the rest of the code (specifically balancing routines)
55712+ accesses leaf level nodes through this interface. This way we can
55713+ implement various block layouts and even combine various layouts
55714+ within the same tree. Balancing/allocating algorithms should not
55715+ care about peculiarities of splitting/merging specific item types,
55716+ but rather should leave that to the item's item handler.
55717+
55718+ Items, including those that provide the abstraction of flows, have
55719+ the property that if you move them in part or in whole to another
55720+ node, the balancing code invokes their is_left_mergeable()
55721+ item_operation to determine if they are mergeable with their new
55722+ neighbor in the node you have moved them to. For some items the
55723+ is_left_mergeable() function always returns zero (never mergeable).
55724+
55725+ When moving the bodies of items from one node to another:
55726+
55727+ if a partial item is shifted to another node the balancing code invokes
55728+ an item handler method to handle the item splitting.
55729+
55730+ if the balancing code needs to merge with an item in the node it
55731+ is shifting to, it will invoke an item handler method to handle
55732+ the item merging.
55733+
55734+ if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy(),
55735+ adjusting the item headers after the move via the node handler.
55736+*/
55737+
55738+#include "../../forward.h"
55739+#include "../../debug.h"
55740+#include "../../key.h"
55741+#include "../../coord.h"
55742+#include "../plugin_header.h"
55743+#include "../item/item.h"
55744+#include "node.h"
55745+#include "../plugin.h"
55746+#include "../../znode.h"
55747+#include "../../tree.h"
55748+#include "../../super.h"
55749+#include "../../reiser4.h"
55750+
55751+/**
55752+ * leftmost_key_in_node - get the smallest key in node
55753+ * @node: node to get the leftmost key of
55754+ * @key: store result here
55755+ *
55756+ * Stores the leftmost key of @node in @key.
55757+ */
55758+reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
55759+{
55760+ assert("nikita-1634", node != NULL);
55761+ assert("nikita-1635", key != NULL);
55762+
55763+ if (!node_is_empty(node)) {
55764+ coord_t first_item;
55765+
55766+ coord_init_first_unit(&first_item, (znode *) node);
55767+ item_key_by_coord(&first_item, key);
55768+ } else
55769+ *key = *reiser4_max_key();
55770+ return key;
55771+}
55772+
55773+node_plugin node_plugins[LAST_NODE_ID] = {
55774+ [NODE40_ID] = {
55775+ .h = {
55776+ .type_id = REISER4_NODE_PLUGIN_TYPE,
55777+ .id = NODE40_ID,
55778+ .pops = NULL,
55779+ .label = "unified",
55780+ .desc = "unified node layout",
55781+ .linkage = {NULL, NULL}
55782+ },
55783+ .item_overhead = item_overhead_node40,
55784+ .free_space = free_space_node40,
55785+ .lookup = lookup_node40,
55786+ .num_of_items = num_of_items_node40,
55787+ .item_by_coord = item_by_coord_node40,
55788+ .length_by_coord = length_by_coord_node40,
55789+ .plugin_by_coord = plugin_by_coord_node40,
55790+ .key_at = key_at_node40,
55791+ .estimate = estimate_node40,
55792+ .check = check_node40,
55793+ .parse = parse_node40,
55794+ .init = init_node40,
55795+#ifdef GUESS_EXISTS
55796+ .guess = guess_node40,
55797+#endif
55798+ .change_item_size = change_item_size_node40,
55799+ .create_item = create_item_node40,
55800+ .update_item_key = update_item_key_node40,
55801+ .cut_and_kill = kill_node40,
55802+ .cut = cut_node40,
55803+ .shift = shift_node40,
55804+ .shrink_item = shrink_item_node40,
55805+ .fast_insert = fast_insert_node40,
55806+ .fast_paste = fast_paste_node40,
55807+ .fast_cut = fast_cut_node40,
55808+ .max_item_size = max_item_size_node40,
55809+ .prepare_removal = prepare_removal_node40,
55810+ .set_item_plugin = set_item_plugin_node40
55811+ }
55812+};
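/* Illustrative sketch added by the editor (not from the reiser4 sources):
   tree code is expected to reach a node's layout only through the ops
   table above. node_plugin_by_node() is assumed here to be the accessor
   provided elsewhere in this patch; for a NODE40_ID node the call below
   dispatches to num_of_items_node40(). */
static inline int sketch_count_items(const znode *node)
{
	node_plugin *nplug = node_plugin_by_node(node);

	return nplug->num_of_items(node);
}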
55813+
55814+/*
55815+ Local variables:
55816+ c-indentation-style: "K&R"
55817+ mode-name: "LC"
55818+ c-basic-offset: 8
55819+ tab-width: 8
55820+ fill-column: 120
55821+ scroll-step: 1
55822+ End:
55823+*/
55824diff -urN linux-2.6.23.orig/fs/reiser4/plugin/node/node.h linux-2.6.23/fs/reiser4/plugin/node/node.h
55825--- linux-2.6.23.orig/fs/reiser4/plugin/node/node.h 1970-01-01 03:00:00.000000000 +0300
55826+++ linux-2.6.23/fs/reiser4/plugin/node/node.h 2007-12-04 16:49:30.000000000 +0300
55827@@ -0,0 +1,272 @@
55828+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55829+
55830+/* We need a definition of the default node layout here. */
55831+
55832+/* Generally speaking, it is best to have free space in the middle of the
55833+ node so that two sets of things can grow towards it, and to have the
55834+ item bodies on the left so that the last one of them grows into free
55835+ space. We optimize for the case where we append new items to the end
55836+ of the node, or grow the last item, because it hurts nothing to so
55837+ optimize and it is a common special case to do massive insertions in
55838+ increasing key order (and one of the cases in which a real user is
55839+ more likely to notice the delay).
55840+
55841+ formatted leaf default layout: (leaf1)
55842+
55843+ |node header:item bodies:free space:key + pluginid + item offset|
55844+
55845+ We grow towards the middle, optimizing layout for the case where we
55846+ append new items to the end of the node. The node header is fixed
55847+ length. Keys, and item offsets plus pluginids for the items
55848+ corresponding to them are in increasing key order, and are fixed
55849+ length. Item offsets are relative to start of node (16 bits creating
55850+ a node size limit of 64k, 12 bits might be a better choice....). Item
55851+ bodies are in decreasing key order. Item bodies have a variable size.
55852+ There is a one to one to one mapping of keys to item offsets to item
55853+ bodies. Item offsets consist of pointers to the zeroth byte of the
55854+ item body. Item length equals the start of the next item minus the
55855+ start of this item, except the zeroth item whose length equals the end
55856+ of the node minus the start of that item (plus a byte). In other
55857+ words, the item length is not recorded anywhere, and it does not need
55858+ to be since it is computable.
55859+
55860+ Leaf variable length items and keys layout : (lvar)
55861+
55862+ |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
55863+
55864+ We grow towards the middle, optimizing layout for the case where we
55865+ append new items to the end of the node. The node header is fixed
55866+ length. Keys and item offsets for the items corresponding to them are
55867+ in increasing key order, and keys are variable length. Item offsets
55868+ are relative to start of node (16 bits). Item bodies are in
55869+ decreasing key order. Item bodies have a variable size. There is a
55870+ one to one to one mapping of keys to item offsets to item bodies.
55871+ Item offsets consist of pointers to the zeroth byte of the item body.
55872+ Item length equals the start of the next item's key minus the start of
55873+ this item, except the zeroth item whose length equals the end of the
55874+ node minus the start of that item (plus a byte).
55875+
55876+ leaf compressed keys layout: (lcomp)
55877+
55878+ |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
55879+
55880+ We grow towards the middle, optimizing layout for the case where we
55881+ append new items to the end of the node. The node header is fixed
55882+ length. Keys and item offsets for the items corresponding to them are
55883+ in increasing key order, and keys are variable length. The "key
55884+ inherit" field indicates how much of the key prefix is identical to
55885+ the previous key (stem compression as described in "Managing
55886+ Gigabytes" is used). key_inherit is a one byte integer. The
55887+ intra-node searches performed through this layout are linear searches,
55888+ and this is theorized to not hurt performance much due to the high
55889+ cost of processor stalls on modern CPUs, and the small number of keys
55890+ in a single node. Item offsets are relative to start of node (16
55891+ bits). Item bodies are in decreasing key order. Item bodies have a
55892+ variable size. There is a one to one to one mapping of keys to item
55893+ offsets to item bodies. Item offsets consist of pointers to the
55894+ zeroth byte of the item body. Item length equals the start of the
55895+ next item minus the start of this item, except the zeroth item whose
55896+ length equals the end of the node minus the start of that item (plus a
55897+ byte). In other words, item length and key length are not recorded
55898+ anywhere, and they do not need to be since they are computable.
55899+
55900+ internal node default layout: (idef1)
55901+
55902+ just like leaf1 except that item bodies are either blocknrs of
55903+ children or extents, and moving them may require updating parent
55904+ pointers in the nodes that they point to.
55905+*/
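/* Illustrative sketch added by the editor (hypothetical names, not from
   the reiser4 sources): the "item length is computable" claim above, as
   code. Given per-item offsets from the start of the node and the point
   where item bodies end, no length field is needed; the real node40 code
   derives lengths from consecutive item_header40 offsets the same way. */
static inline int sketch_item_length(const unsigned short *offsets,
				     int nr_items,
				     unsigned short bodies_end, int pos)
{
	/* the last item runs up to where the bodies end; every other
	   item ends where the next one begins */
	if (pos == nr_items - 1)
		return bodies_end - offsets[pos];
	return offsets[pos + 1] - offsets[pos];
}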
55906+
55907+/* There is an inherent 3-way tradeoff between optimization,
55908+ exchanging disks between different architectures, and code
55909+ complexity. This layout is optimal and simple, but not exchangeable.
55910+ Someone else can do the code for exchanging disks and make it
55911+ complex. It would not be that hard. Using a node size other than
55912+ PAGE_SIZE might be suboptimal.
55913+*/
55914+
55915+#if !defined( __REISER4_NODE_H__ )
55916+#define __REISER4_NODE_H__
55917+
55918+#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
55919+
55920+#include "../../dformat.h"
55921+#include "../plugin_header.h"
55922+
55923+#include <linux/types.h>
55924+
55925+typedef enum {
55926+ NS_FOUND = 0,
55927+ NS_NOT_FOUND = -ENOENT
55928+} node_search_result;
55929+
55930+/* Maximal possible space overhead for creation of new item in a node */
55931+#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
55932+
55933+typedef enum {
55934+ REISER4_NODE_DKEYS = (1 << 0),
55935+ REISER4_NODE_TREE_STABLE = (1 << 1)
55936+} reiser4_node_check_flag;
55937+
55938+/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some stack space */
55939+struct cut_list {
55940+ coord_t *from;
55941+ coord_t *to;
55942+ const reiser4_key *from_key;
55943+ const reiser4_key *to_key;
55944+ reiser4_key *smallest_removed;
55945+ carry_plugin_info *info;
55946+ __u32 flags;
55947+ struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */
55948+ lock_handle *left;
55949+ lock_handle *right;
55950+};
55951+
55952+struct carry_cut_data;
55953+struct carry_kill_data;
55954+
55955+/* The responsibility of the node plugin is to store and give access
55956+ to the sequence of items within the node. */
55957+typedef struct node_plugin {
55958+ /* generic plugin fields */
55959+ plugin_header h;
55960+
55961+ /* calculates the amount of space that will be required to store an
55962+ item which is in addition to the space consumed by the item body.
55963+	   (the space consumed by the item body can be obtained by calling
55964+ item->estimate) */
55965+ size_t(*item_overhead) (const znode * node, flow_t * f);
55966+
55967+ /* returns free space by looking into node (i.e., without using
55968+ znode->free_space). */
55969+ size_t(*free_space) (znode * node);
55970+ /* search within the node for the one item which might
55971+ contain the key, invoking item->search_within to search within
55972+ that item to see if it is in there */
55973+ node_search_result(*lookup) (znode * node, const reiser4_key * key,
55974+ lookup_bias bias, coord_t * coord);
55975+ /* number of items in node */
55976+ int (*num_of_items) (const znode * node);
55977+
55978+ /* store information about item in @coord in @data */
55979+ /* break into several node ops, don't add any more uses of this before doing so */
55980+ /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
55981+ char *(*item_by_coord) (const coord_t * coord);
55982+ int (*length_by_coord) (const coord_t * coord);
55983+ item_plugin *(*plugin_by_coord) (const coord_t * coord);
55984+
55985+ /* store item key in @key */
55986+ reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
55987+	/* conservatively estimate what size of unit can fit
55988+	   into the node. This estimation should be performed without
55989+ actually looking into the node's content (free space is saved in
55990+ znode). */
55991+ size_t(*estimate) (znode * node);
55992+
55993+ /* performs every consistency check the node plugin author could
55994+ imagine. Optional. */
55995+ int (*check) (const znode * node, __u32 flags, const char **error);
55996+
55997+ /* Called when node is read into memory and node plugin is
55998+ already detected. This should read some data into znode (like free
55999+ space counter) and, optionally, check data consistency.
56000+ */
56001+ int (*parse) (znode * node);
56002+ /* This method is called on a new node to initialise plugin specific
56003+ data (header, etc.) */
56004+ int (*init) (znode * node);
56005+ /* Check whether @node content conforms to this plugin format.
56006+ Probably only useful after support for old V3.x formats is added.
56007+ Uncomment after 4.0 only.
56008+ */
56009+ /* int ( *guess )( const znode *node ); */
56010+#if REISER4_DEBUG
56011+ void (*print) (const char *prefix, const znode * node, __u32 flags);
56012+#endif
56013+ /* change size of @item by @by bytes. @item->node has enough free
56014+	   space. When @by > 0 - free space is appended to the end of the item. When
56015+	   @by < 0 - the item is truncated - it is assumed that the last @by bytes of
56016+	   the item are freed already */
56017+ void (*change_item_size) (coord_t * item, int by);
56018+
56019+ /* create new item @length bytes long in coord @target */
56020+ int (*create_item) (coord_t * target, const reiser4_key * key,
56021+ reiser4_item_data * data, carry_plugin_info * info);
56022+
56023+ /* update key of item. */
56024+ void (*update_item_key) (coord_t * target, const reiser4_key * key,
56025+ carry_plugin_info * info);
56026+
56027+ int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
56028+ int (*cut) (struct carry_cut_data *, carry_plugin_info *);
56029+
56030+ /*
56031+ * shrink item pointed to by @coord by @delta bytes.
56032+ */
56033+ int (*shrink_item) (coord_t * coord, int delta);
56034+
56035+	/* copy as much as possible, but not past @stop, from
56036+ @stop->node to @target. If (pend == append) then data from beginning of
56037+ @stop->node are copied to the end of @target. If (pend == prepend) then
56038+ data from the end of @stop->node are copied to the beginning of
56039+ @target. Copied data are removed from @stop->node. Information
56040+ about what to do on upper level is stored in @todo */
56041+ int (*shift) (coord_t * stop, znode * target, shift_direction pend,
56042+ int delete_node, int including_insert_coord,
56043+ carry_plugin_info * info);
56044+	/* return true if this node allows skipping carry() in some situations
56045+	   (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
56046+	   emulation doesn't.
56047+
56048+	   This will speed up insertions that don't require updates to the
56049+	   parent, by bypassing initialisation of carry() structures. It's
56050+	   believed that the majority of insertions will fit this case.
56051+
56052+ */
56053+ int (*fast_insert) (const coord_t * coord);
56054+ int (*fast_paste) (const coord_t * coord);
56055+ int (*fast_cut) (const coord_t * coord);
56056+ /* this limits max size of item which can be inserted into a node and
56057+ number of bytes item in a node may be appended with */
56058+ int (*max_item_size) (void);
56059+ int (*prepare_removal) (znode * empty, carry_plugin_info * info);
56060+	/* change plugin id of items which are in a node already. Currently it is used in tail conversion for regular
56061+ * files */
56062+ int (*set_item_plugin) (coord_t * coord, item_id);
56063+} node_plugin;
56064+
56065+typedef enum {
56066+ /* standard unified node layout used for both leaf and internal
56067+ nodes */
56068+ NODE40_ID,
56069+ LAST_NODE_ID
56070+} reiser4_node_id;
56071+
56072+extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
56073+#if REISER4_DEBUG
56074+extern void print_node_content(const char *prefix, const znode * node,
56075+ __u32 flags);
56076+#endif
56077+
56078+extern void indent_znode(const znode * node);
56079+
56080+typedef struct common_node_header {
56081+ /*
56082+ * identifier of node plugin. Must be located at the very beginning of
56083+ * a node.
56084+ */
56085+ __le16 plugin_id;
56086+} common_node_header;
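/* Illustrative sketch added by the editor (hypothetical helper, not from
   the reiser4 sources): because every node format must begin with
   common_node_header, the node plugin can be identified before any
   format-specific parse() runs, using the same get_unaligned/le16_to_cpu
   idiom the rest of the patch uses for on-disk 16-bit fields: */
static inline __u16 sketch_node_plugin_id(const common_node_header *ch)
{
	return le16_to_cpu(get_unaligned(&ch->plugin_id));
}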
56087+
56088+/* __REISER4_NODE_H__ */
56089+#endif
56090+/*
56091+ * Local variables:
56092+ * c-indentation-style: "K&R"
56093+ * mode-name: "LC"
56094+ * c-basic-offset: 8
56095+ * tab-width: 8
56096+ * fill-column: 79
56097+ * scroll-step: 1
56098+ * End:
56099+ */
56100diff -urN linux-2.6.23.orig/fs/reiser4/plugin/object.c linux-2.6.23/fs/reiser4/plugin/object.c
56101--- linux-2.6.23.orig/fs/reiser4/plugin/object.c 1970-01-01 03:00:00.000000000 +0300
56102+++ linux-2.6.23/fs/reiser4/plugin/object.c 2007-12-04 18:49:45.000000000 +0300
56103@@ -0,0 +1,531 @@
56104+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
56105+ * reiser4/README */
56106+
56107+/*
56108+ * Examples of object plugins: file, directory, symlink, special file.
56109+ *
56110+ * Plugins associated with inode:
56111+ *
56112+ * Plugin of inode is plugin referenced by plugin-id field of on-disk
56113+ * stat-data. How we store this plugin in in-core inode is not
56114+ * important. Currently pointers are used, another variant is to store offsets
56115+ * and do array lookup on each access.
56116+ *
56117+ * Now, each inode has one selected plugin: object plugin that
56118+ * determines what type of file this object is: directory, regular etc.
56119+ *
56120+ * This main plugin can use other plugins that are thus subordinated to
56121+ * it. Directory instance of object plugin uses hash; regular file
56122+ * instance uses tail policy plugin.
56123+ *
56124+ * Object plugin is either taken from id in stat-data or guessed from
56125+ * i_mode bits. Once it is established we ask it to install its
56126+ * subordinate plugins, by looking again in stat-data or inheriting them
56127+ * from parent.
56128+ *
56129+ * How new inode is initialized during ->read_inode():
56130+ * 1 read stat-data and initialize inode fields: i_size, i_mode,
56131+ * i_generation, capabilities etc.
56132+ * 2 read plugin id from stat data or try to guess plugin id
56133+ * from inode->i_mode bits if plugin id is missing.
56134+ * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
56135+ *
56136+ * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What
56137+ * if stat data does contain i_size, etc., due to it being an unusual plugin?
56138+ *
56139+ * 4 Call ->activate() method of object's plugin. Plugin is either read
56140+ *   from stat-data or guessed from mode bits.
56141+ * 5 Call ->inherit() method of object plugin to inherit as yet uninitialized
56142+ *   plugins from the parent.
56143+ *
56144+ * Easy induction proves that by the last step all plugins of the inode will
56145+ * be initialized.
56146+ *
56147+ * When creating new object:
56148+ * 1 obtain object plugin id (see next period)
56149+ * NIKITA-FIXME-HANS: period?
56150+ * 2 ->install() this plugin
56151+ * 3 ->inherit() the rest from the parent
56152+ *
56153+ * We need some examples of creating an object with default and non-default
56154+ * plugin ids. Nikita, please create them.
56155+ */
56156+
56157+#include "../inode.h"
56158+
56159+static int _bugop(void)
56160+{
56161+ BUG_ON(1);
56162+ return 0;
56163+}
56164+
56165+#define bugop ((void *)_bugop)
56166+
56167+static int _dummyop(void)
56168+{
56169+ return 0;
56170+}
56171+
56172+#define dummyop ((void *)_dummyop)
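/* Editor's descriptive note (not part of the patch): these two stubs fill
   slots in the method tables below. bugop marks operations that must never
   be reached (e.g. writepage on a directory address space is a bug), while
   dummyop satisfies callers for which doing nothing is the correct answer. */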
56173+
56174+static int change_file(struct inode *inode,
56175+ reiser4_plugin * plugin,
56176+ pset_member memb)
56177+{
56178+ /* cannot change object plugin of already existing object */
56179+ if (memb == PSET_FILE)
56180+ return RETERR(-EINVAL);
56181+
56182+ /* Change PSET_CREATE */
56183+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin);
56184+}
56185+
56186+static reiser4_plugin_ops file_plugin_ops = {
56187+ .change = change_file
56188+};
56189+
56190+static struct inode_operations null_i_ops = {.create = NULL};
56191+static struct file_operations null_f_ops = {.owner = NULL};
56192+static struct address_space_operations null_a_ops = {.writepage = NULL};
56193+
56194+/* VFS methods for regular files */
56195+static struct inode_operations regular_file_i_ops = {
56196+ .permission = reiser4_permission_common,
56197+ .setattr = reiser4_setattr_careful,
56198+ .getattr = reiser4_getattr_common
56199+};
56200+static struct file_operations regular_file_f_ops = {
56201+ .llseek = generic_file_llseek,
56202+ .read = reiser4_read_careful,
56203+ .write = reiser4_write_careful,
56204+ .aio_read = generic_file_aio_read,
56205+ .ioctl = reiser4_ioctl_careful,
56206+ .mmap = reiser4_mmap_careful,
56207+ .open = reiser4_open_careful,
56208+ .release = reiser4_release_careful,
56209+ .fsync = reiser4_sync_file_common,
56210+ .splice_read = generic_file_splice_read,
56211+ .splice_write = generic_file_splice_write
56212+};
56213+static struct address_space_operations regular_file_a_ops = {
56214+ .writepage = reiser4_writepage,
56215+ .readpage = reiser4_readpage,
56216+ .sync_page = block_sync_page,
56217+ .writepages = reiser4_writepages,
56218+ .set_page_dirty = reiser4_set_page_dirty,
56219+ .readpages = reiser4_readpages,
56220+ .prepare_write = reiser4_prepare_write,
56221+ .commit_write = reiser4_commit_write,
56222+ .bmap = reiser4_bmap_careful,
56223+ .invalidatepage = reiser4_invalidatepage,
56224+ .releasepage = reiser4_releasepage
56225+};
56226+
56227+/* VFS methods for symlink files */
56228+static struct inode_operations symlink_file_i_ops = {
56229+ .readlink = generic_readlink,
56230+ .follow_link = reiser4_follow_link_common,
56231+ .permission = reiser4_permission_common,
56232+ .setattr = reiser4_setattr_common,
56233+ .getattr = reiser4_getattr_common
56234+};
56235+
56236+/* VFS methods for special files */
56237+static struct inode_operations special_file_i_ops = {
56238+ .permission = reiser4_permission_common,
56239+ .setattr = reiser4_setattr_common,
56240+ .getattr = reiser4_getattr_common
56241+};
56242+
56243+/* VFS methods for directories */
56244+static struct inode_operations directory_i_ops = {
56245+ .create = reiser4_create_common,
56246+ .lookup = reiser4_lookup_common,
56247+ .link = reiser4_link_common,
56248+ .unlink = reiser4_unlink_common,
56249+ .symlink = reiser4_symlink_common,
56250+ .mkdir = reiser4_mkdir_common,
56251+ .rmdir = reiser4_unlink_common,
56252+ .mknod = reiser4_mknod_common,
56253+ .rename = reiser4_rename_common,
56254+ .permission = reiser4_permission_common,
56255+ .setattr = reiser4_setattr_common,
56256+ .getattr = reiser4_getattr_common
56257+};
56258+static struct file_operations directory_f_ops = {
56259+ .llseek = reiser4_llseek_dir_common,
56260+ .read = generic_read_dir,
56261+ .readdir = reiser4_readdir_common,
56262+ .release = reiser4_release_dir_common,
56263+ .fsync = reiser4_sync_common
56264+};
56265+static struct address_space_operations directory_a_ops = {
56266+ .writepage = bugop,
56267+ .sync_page = bugop,
56268+ .writepages = dummyop,
56269+ .set_page_dirty = bugop,
56270+ .readpages = bugop,
56271+ .prepare_write = bugop,
56272+ .commit_write = bugop,
56273+ .bmap = bugop,
56274+ .invalidatepage = bugop,
56275+ .releasepage = bugop
56276+};
56277+
56278+/*
56279+ * Definitions of object plugins.
56280+ */
56281+
56282+file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
56283+ [UNIX_FILE_PLUGIN_ID] = {
56284+ .h = {
56285+ .type_id = REISER4_FILE_PLUGIN_TYPE,
56286+ .id = UNIX_FILE_PLUGIN_ID,
56287+ .groups = (1 << REISER4_REGULAR_FILE),
56288+ .pops = &file_plugin_ops,
56289+ .label = "reg",
56290+ .desc = "regular file",
56291+ .linkage = {NULL, NULL},
56292+ },
56293+ /*
56294+ * invariant vfs ops
56295+ */
56296+ .inode_ops = &regular_file_i_ops,
56297+ .file_ops = &regular_file_f_ops,
56298+ .as_ops = &regular_file_a_ops,
56299+ /*
56300+ * private i_ops
56301+ */
56302+ .setattr = setattr_unix_file,
56303+ .open = open_unix_file,
56304+ .read = read_unix_file,
56305+ .write = write_unix_file,
56306+ .ioctl = ioctl_unix_file,
56307+ .mmap = mmap_unix_file,
56308+ .release = release_unix_file,
56309+ /*
56310+ * private f_ops
56311+ */
56312+ .readpage = readpage_unix_file,
56313+ .readpages = readpages_unix_file,
56314+ .writepages = writepages_unix_file,
56315+ .prepare_write = prepare_write_unix_file,
56316+ .commit_write = commit_write_unix_file,
56317+ /*
56318+ * private a_ops
56319+ */
56320+ .bmap = bmap_unix_file,
56321+ /*
56322+ * other private methods
56323+ */
56324+ .write_sd_by_inode = write_sd_by_inode_common,
56325+ .flow_by_inode = flow_by_inode_unix_file,
56326+ .key_by_inode = key_by_inode_and_offset_common,
56327+ .set_plug_in_inode = set_plug_in_inode_common,
56328+ .adjust_to_parent = adjust_to_parent_common,
56329+ .create_object = reiser4_create_object_common,
56330+ .delete_object = delete_object_unix_file,
56331+ .add_link = reiser4_add_link_common,
56332+ .rem_link = reiser4_rem_link_common,
56333+ .owns_item = owns_item_unix_file,
56334+ .can_add_link = can_add_link_common,
56335+ .detach = dummyop,
56336+ .bind = dummyop,
56337+ .safelink = safelink_common,
56338+ .estimate = {
56339+ .create = estimate_create_common,
56340+ .update = estimate_update_common,
56341+ .unlink = estimate_unlink_common
56342+ },
56343+ .init_inode_data = init_inode_data_unix_file,
56344+ .cut_tree_worker = cut_tree_worker_common,
56345+ .wire = {
56346+ .write = wire_write_common,
56347+ .read = wire_read_common,
56348+ .get = wire_get_common,
56349+ .size = wire_size_common,
56350+ .done = wire_done_common
56351+ }
56352+ },
56353+ [DIRECTORY_FILE_PLUGIN_ID] = {
56354+ .h = {
56355+ .type_id = REISER4_FILE_PLUGIN_TYPE,
56356+ .id = DIRECTORY_FILE_PLUGIN_ID,
56357+ .groups = (1 << REISER4_DIRECTORY_FILE),
56358+ .pops = &file_plugin_ops,
56359+ .label = "dir",
56360+ .desc = "directory",
56361+ .linkage = {NULL, NULL}
56362+ },
56363+ .inode_ops = &null_i_ops,
56364+ .file_ops = &null_f_ops,
56365+ .as_ops = &null_a_ops,
56366+
56367+ .write_sd_by_inode = write_sd_by_inode_common,
56368+ .flow_by_inode = bugop,
56369+ .key_by_inode = bugop,
56370+ .set_plug_in_inode = set_plug_in_inode_common,
56371+ .adjust_to_parent = adjust_to_parent_common_dir,
56372+ .create_object = reiser4_create_object_common,
56373+ .delete_object = reiser4_delete_dir_common,
56374+ .add_link = reiser4_add_link_common,
56375+ .rem_link = rem_link_common_dir,
56376+ .owns_item = owns_item_common_dir,
56377+ .can_add_link = can_add_link_common,
56378+ .can_rem_link = can_rem_link_common_dir,
56379+ .detach = reiser4_detach_common_dir,
56380+ .bind = reiser4_bind_common_dir,
56381+ .safelink = safelink_common,
56382+ .estimate = {
56383+ .create = estimate_create_common_dir,
56384+ .update = estimate_update_common,
56385+ .unlink = estimate_unlink_common_dir
56386+ },
56387+ .wire = {
56388+ .write = wire_write_common,
56389+ .read = wire_read_common,
56390+ .get = wire_get_common,
56391+ .size = wire_size_common,
56392+ .done = wire_done_common
56393+ },
56394+ .init_inode_data = init_inode_ordering,
56395+ .cut_tree_worker = cut_tree_worker_common,
56396+ },
56397+ [SYMLINK_FILE_PLUGIN_ID] = {
56398+ .h = {
56399+ .type_id = REISER4_FILE_PLUGIN_TYPE,
56400+ .id = SYMLINK_FILE_PLUGIN_ID,
56401+ .groups = (1 << REISER4_SYMLINK_FILE),
56402+ .pops = &file_plugin_ops,
56403+ .label = "symlink",
56404+ .desc = "symbolic link",
56405+			.linkage = {NULL, NULL}
56406+ },
56407+ .inode_ops = &symlink_file_i_ops,
56408+ /* inode->i_fop of symlink is initialized
56409+ by NULL in setup_inode_ops */
56410+ .file_ops = &null_f_ops,
56411+ .as_ops = &null_a_ops,
56412+
56413+ .write_sd_by_inode = write_sd_by_inode_common,
56414+ .set_plug_in_inode = set_plug_in_inode_common,
56415+ .adjust_to_parent = adjust_to_parent_common,
56416+ .create_object = reiser4_create_symlink,
56417+ .delete_object = reiser4_delete_object_common,
56418+ .add_link = reiser4_add_link_common,
56419+ .rem_link = reiser4_rem_link_common,
56420+ .can_add_link = can_add_link_common,
56421+ .detach = dummyop,
56422+ .bind = dummyop,
56423+ .safelink = safelink_common,
56424+ .estimate = {
56425+ .create = estimate_create_common,
56426+ .update = estimate_update_common,
56427+ .unlink = estimate_unlink_common
56428+ },
56429+ .init_inode_data = init_inode_ordering,
56430+ .cut_tree_worker = cut_tree_worker_common,
56431+ .destroy_inode = destroy_inode_symlink,
56432+ .wire = {
56433+ .write = wire_write_common,
56434+ .read = wire_read_common,
56435+ .get = wire_get_common,
56436+ .size = wire_size_common,
56437+ .done = wire_done_common
56438+ }
56439+ },
56440+ [SPECIAL_FILE_PLUGIN_ID] = {
56441+ .h = {
56442+ .type_id = REISER4_FILE_PLUGIN_TYPE,
56443+ .id = SPECIAL_FILE_PLUGIN_ID,
56444+ .groups = (1 << REISER4_SPECIAL_FILE),
56445+ .pops = &file_plugin_ops,
56446+ .label = "special",
56447+ .desc =
56448+ "special: fifo, device or socket",
56449+ .linkage = {NULL, NULL}
56450+ },
56451+ .inode_ops = &special_file_i_ops,
56452+ /* file_ops of special files (sockets, block, char, fifo) are
56453+ initialized by init_special_inode. */
56454+ .file_ops = &null_f_ops,
56455+ .as_ops = &null_a_ops,
56456+
56457+ .write_sd_by_inode = write_sd_by_inode_common,
56458+ .set_plug_in_inode = set_plug_in_inode_common,
56459+ .adjust_to_parent = adjust_to_parent_common,
56460+ .create_object = reiser4_create_object_common,
56461+ .delete_object = reiser4_delete_object_common,
56462+ .add_link = reiser4_add_link_common,
56463+ .rem_link = reiser4_rem_link_common,
56464+ .owns_item = owns_item_common,
56465+ .can_add_link = can_add_link_common,
56466+ .detach = dummyop,
56467+ .bind = dummyop,
56468+ .safelink = safelink_common,
56469+ .estimate = {
56470+ .create = estimate_create_common,
56471+ .update = estimate_update_common,
56472+ .unlink = estimate_unlink_common
56473+ },
56474+ .init_inode_data = init_inode_ordering,
56475+ .cut_tree_worker = cut_tree_worker_common,
56476+ .wire = {
56477+ .write = wire_write_common,
56478+ .read = wire_read_common,
56479+ .get = wire_get_common,
56480+ .size = wire_size_common,
56481+ .done = wire_done_common
56482+ }
56483+ },
56484+ [CRYPTCOMPRESS_FILE_PLUGIN_ID] = {
56485+ .h = {
56486+ .type_id = REISER4_FILE_PLUGIN_TYPE,
56487+ .id = CRYPTCOMPRESS_FILE_PLUGIN_ID,
56488+ .groups = (1 << REISER4_REGULAR_FILE),
56489+ .pops = &file_plugin_ops,
56490+ .label = "cryptcompress",
56491+ .desc = "cryptcompress file",
56492+ .linkage = {NULL, NULL}
56493+ },
56494+ .inode_ops = &regular_file_i_ops,
56495+ .file_ops = &regular_file_f_ops,
56496+ .as_ops = &regular_file_a_ops,
56497+
56498+ .setattr = setattr_cryptcompress,
56499+ .open = open_cryptcompress,
56500+ .read = read_cryptcompress,
56501+ .write = write_cryptcompress,
56502+ .ioctl = ioctl_cryptcompress,
56503+ .mmap = mmap_cryptcompress,
56504+ .release = release_cryptcompress,
56505+
56506+ .readpage = readpage_cryptcompress,
56507+ .readpages = readpages_cryptcompress,
56508+ .writepages = writepages_cryptcompress,
56509+ .prepare_write = prepare_write_cryptcompress,
56510+ .commit_write = commit_write_cryptcompress,
56511+
56512+ .bmap = bmap_cryptcompress,
56513+
56514+ .write_sd_by_inode = write_sd_by_inode_common,
56515+ .flow_by_inode = flow_by_inode_cryptcompress,
56516+ .key_by_inode = key_by_inode_cryptcompress,
56517+ .set_plug_in_inode = set_plug_in_inode_common,
56518+ .adjust_to_parent = adjust_to_parent_cryptcompress,
56519+ .create_object = create_object_cryptcompress,
56520+ .delete_object = delete_object_cryptcompress,
56521+ .add_link = reiser4_add_link_common,
56522+ .rem_link = reiser4_rem_link_common,
56523+ .owns_item = owns_item_common,
56524+ .can_add_link = can_add_link_common,
56525+ .detach = dummyop,
56526+ .bind = dummyop,
56527+ .safelink = safelink_common,
56528+ .estimate = {
56529+ .create = estimate_create_common,
56530+ .update = estimate_update_common,
56531+ .unlink = estimate_unlink_common
56532+ },
56533+ .init_inode_data = init_inode_data_cryptcompress,
56534+ .cut_tree_worker = cut_tree_worker_cryptcompress,
56535+ .destroy_inode = destroy_inode_cryptcompress,
56536+ .wire = {
56537+ .write = wire_write_common,
56538+ .read = wire_read_common,
56539+ .get = wire_get_common,
56540+ .size = wire_size_common,
56541+ .done = wire_done_common
56542+ }
56543+ }
56544+};
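/* Editor's descriptive note (not part of the patch): the table above is
   indexed by the object plugin id recorded in stat-data. Once an inode is
   bound to an entry, e.g. &file_plugins[UNIX_FILE_PLUGIN_ID], its VFS
   behaviour comes from the .inode_ops/.file_ops/.as_ops fields, while the
   remaining methods are reiser4-internal hooks used by tree and
   transaction code. */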
56545+
56546+static int change_dir(struct inode *inode,
56547+ reiser4_plugin * plugin,
56548+ pset_member memb)
56549+{
56550+ /* cannot change dir plugin of already existing object */
56551+ return RETERR(-EINVAL);
56552+}
56553+
56554+static reiser4_plugin_ops dir_plugin_ops = {
56555+ .change = change_dir
56556+};
56557+
56558+/*
56559+ * definition of directory plugins
56560+ */
56561+
56562+dir_plugin dir_plugins[LAST_DIR_ID] = {
56563+ /* standard hashed directory plugin */
56564+ [HASHED_DIR_PLUGIN_ID] = {
56565+ .h = {
56566+ .type_id = REISER4_DIR_PLUGIN_TYPE,
56567+ .id = HASHED_DIR_PLUGIN_ID,
56568+ .pops = &dir_plugin_ops,
56569+ .label = "dir",
56570+ .desc = "hashed directory",
56571+ .linkage = {NULL, NULL}
56572+ },
56573+ .inode_ops = &directory_i_ops,
56574+ .file_ops = &directory_f_ops,
56575+ .as_ops = &directory_a_ops,
56576+
56577+ .get_parent = get_parent_common,
56578+ .is_name_acceptable = is_name_acceptable_common,
56579+ .build_entry_key = build_entry_key_hashed,
56580+ .build_readdir_key = build_readdir_key_common,
56581+ .add_entry = reiser4_add_entry_common,
56582+ .rem_entry = reiser4_rem_entry_common,
56583+ .init = reiser4_dir_init_common,
56584+ .done = reiser4_dir_done_common,
56585+ .attach = reiser4_attach_common,
56586+ .detach = reiser4_detach_common,
56587+ .estimate = {
56588+ .add_entry = estimate_add_entry_common,
56589+ .rem_entry = estimate_rem_entry_common,
56590+ .unlink = dir_estimate_unlink_common
56591+ }
56592+ },
56593+ /* hashed directory for which seekdir/telldir are guaranteed to
56594+ * work. Brain-damage. */
56595+ [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
56596+ .h = {
56597+ .type_id = REISER4_DIR_PLUGIN_TYPE,
56598+ .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
56599+ .pops = &dir_plugin_ops,
56600+ .label = "dir32",
56601+ .desc = "directory hashed with 31 bit hash",
56602+ .linkage = {NULL, NULL}
56603+ },
56604+ .inode_ops = &directory_i_ops,
56605+ .file_ops = &directory_f_ops,
56606+ .as_ops = &directory_a_ops,
56607+
56608+ .get_parent = get_parent_common,
56609+ .is_name_acceptable = is_name_acceptable_common,
56610+ .build_entry_key = build_entry_key_seekable,
56611+ .build_readdir_key = build_readdir_key_common,
56612+ .add_entry = reiser4_add_entry_common,
56613+ .rem_entry = reiser4_rem_entry_common,
56614+ .init = reiser4_dir_init_common,
56615+ .done = reiser4_dir_done_common,
56616+ .attach = reiser4_attach_common,
56617+ .detach = reiser4_detach_common,
56618+ .estimate = {
56619+ .add_entry = estimate_add_entry_common,
56620+ .rem_entry = estimate_rem_entry_common,
56621+ .unlink = dir_estimate_unlink_common
56622+ }
56623+ }
56624+};
56625+
56626+/* Make Linus happy.
56627+ Local variables:
56628+ c-indentation-style: "K&R"
56629+ mode-name: "LC"
56630+ c-basic-offset: 8
56631+ tab-width: 8
56632+ fill-column: 120
56633+ End:
56634+*/
56635diff -urN linux-2.6.23.orig/fs/reiser4/plugin/object.h linux-2.6.23/fs/reiser4/plugin/object.h
56636--- linux-2.6.23.orig/fs/reiser4/plugin/object.h 1970-01-01 03:00:00.000000000 +0300
56637+++ linux-2.6.23/fs/reiser4/plugin/object.h 2007-12-04 16:49:30.000000000 +0300
56638@@ -0,0 +1,121 @@
56639+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
56640+ * reiser4/README */
56641+
56642+/* Declaration of object plugin functions. */
56643+
56644+#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ )
56645+#define __FS_REISER4_PLUGIN_OBJECT_H__
56646+
56647+#include "../type_safe_hash.h"
56648+
56649+/* common implementations of inode operations */
56650+int reiser4_create_common(struct inode *parent, struct dentry *dentry,
56651+ int mode, struct nameidata *);
56652+struct dentry * reiser4_lookup_common(struct inode *parent,
56653+ struct dentry *dentry,
56654+ struct nameidata *nameidata);
56655+int reiser4_link_common(struct dentry *existing, struct inode *parent,
56656+ struct dentry *newname);
56657+int reiser4_unlink_common(struct inode *parent, struct dentry *victim);
56658+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
56659+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
56660+ const char *linkname);
56661+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
56662+ int mode, dev_t rdev);
56663+int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name,
56664+ struct inode *new_dir, struct dentry *new_name);
56665+void *reiser4_follow_link_common(struct dentry *, struct nameidata *data);
56666+int reiser4_permission_common(struct inode *, int mask,
56667+ struct nameidata *nameidata);
56668+int reiser4_setattr_common(struct dentry *, struct iattr *);
56669+int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *,
56670+ struct kstat *);
56671+
56672+/* common implementations of file operations */
56673+loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin);
56674+int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
56675+int reiser4_release_dir_common(struct inode *, struct file *);
56676+int reiser4_sync_common(struct file *, struct dentry *, int datasync);
56677+
56678+/* common implementations of address space operations */
56679+int prepare_write_common(struct file *, struct page *, unsigned from,
56680+ unsigned to);
56681+
56682+/* file plugin operations: common implementations */
56683+int write_sd_by_inode_common(struct inode *);
56684+int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
56685+int set_plug_in_inode_common(struct inode *object, struct inode *parent,
56686+ reiser4_object_create_data *);
56687+int adjust_to_parent_common(struct inode *object, struct inode *parent,
56688+ struct inode *root);
56689+int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
56690+ struct inode *root);
56691+int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
56692+ struct inode *root);
56693+int reiser4_create_object_common(struct inode *object, struct inode *parent,
56694+ reiser4_object_create_data *);
56695+int reiser4_delete_object_common(struct inode *);
56696+int reiser4_delete_dir_common(struct inode *);
56697+int reiser4_add_link_common(struct inode *object, struct inode *parent);
56698+int reiser4_rem_link_common(struct inode *object, struct inode *parent);
56699+int rem_link_common_dir(struct inode *object, struct inode *parent);
56700+int owns_item_common(const struct inode *, const coord_t *);
56701+int owns_item_common_dir(const struct inode *, const coord_t *);
56702+int can_add_link_common(const struct inode *);
56703+int can_rem_link_common_dir(const struct inode *);
56704+int reiser4_detach_common_dir(struct inode *child, struct inode *parent);
56705+int reiser4_bind_common_dir(struct inode *child, struct inode *parent);
56706+int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
56707+reiser4_block_nr estimate_create_common(const struct inode *);
56708+reiser4_block_nr estimate_create_common_dir(const struct inode *);
56709+reiser4_block_nr estimate_update_common(const struct inode *);
56710+reiser4_block_nr estimate_unlink_common(const struct inode *,
56711+ const struct inode *);
56712+reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
56713+ const struct inode *);
56714+char *wire_write_common(struct inode *, char *start);
56715+char *wire_read_common(char *addr, reiser4_object_on_wire *);
56716+struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
56717+int wire_size_common(struct inode *);
56718+void wire_done_common(reiser4_object_on_wire *);
56719+
56720+/* dir plugin operations: common implementations */
56721+struct dentry *get_parent_common(struct inode *child);
56722+int is_name_acceptable_common(const struct inode *, const char *name, int len);
56723+void build_entry_key_common(const struct inode *,
56724+ const struct qstr *qname, reiser4_key *);
56725+int build_readdir_key_common(struct file *dir, reiser4_key *);
56726+int reiser4_add_entry_common(struct inode *object, struct dentry *where,
56727+ reiser4_object_create_data *, reiser4_dir_entry_desc *);
56728+int reiser4_rem_entry_common(struct inode *object, struct dentry *where,
56729+ reiser4_dir_entry_desc *);
56730+int reiser4_dir_init_common(struct inode *object, struct inode *parent,
56731+ reiser4_object_create_data *);
56732+int reiser4_dir_done_common(struct inode *);
56733+int reiser4_attach_common(struct inode *child, struct inode *parent);
56734+int reiser4_detach_common(struct inode *object, struct inode *parent);
56735+reiser4_block_nr estimate_add_entry_common(const struct inode *);
56736+reiser4_block_nr estimate_rem_entry_common(const struct inode *);
56737+reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
56738+ const struct inode *);
56739+
56740+/* these are essential parts of common implementations, they are to make
56741+ customized implementations easier */
56742+int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
56743+
56744+/* merely useful functions */
56745+int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *,
56746+ const reiser4_key *, int silent);
56747+
56748+/* __FS_REISER4_PLUGIN_OBJECT_H__ */
56749+#endif
56750+
56751+/* Make Linus happy.
56752+ Local variables:
56753+ c-indentation-style: "K&R"
56754+ mode-name: "LC"
56755+ c-basic-offset: 8
56756+ tab-width: 8
56757+ fill-column: 120
56758+ End:
56759+*/
56760diff -urN linux-2.6.23.orig/fs/reiser4/plugin/plugin.c linux-2.6.23/fs/reiser4/plugin/plugin.c
56761--- linux-2.6.23.orig/fs/reiser4/plugin/plugin.c 1970-01-01 03:00:00.000000000 +0300
56762+++ linux-2.6.23/fs/reiser4/plugin/plugin.c 2007-12-04 16:49:30.000000000 +0300
56763@@ -0,0 +1,559 @@
56764+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
56765+ * reiser4/README */
56766+
56767+/* Basic plugin infrastructure, lookup etc. */
56768+
56769+/* PLUGINS:
56770+
56771+ Plugins are internal Reiser4 "modules" or "objects" used to increase
56772+ extensibility and allow external users to easily adapt reiser4 to
56773+ their needs.
56774+
56775+ Plugins are classified into several disjoint "types". Plugins
56776+ belonging to a particular plugin type are termed "instances" of
56777+ this type. Existing types are listed by enum reiser4_plugin_type
56778+ (see plugin/plugin_header.h).
56779+
56780+NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
56781+
56782+ Object (file) plugin determines how given file-system object serves
56783+ standard VFS requests for read, write, seek, mmap etc. Instances of
56784+ file plugins are: regular file, directory, symlink. Another example
56785+ of file plugin is the audit plugin, which optionally records accesses to
56786+ the underlying object and forwards requests to it.
56787+
56788+ Hash plugins compute hashes used by reiser4 to store and locate
56789+ files within directories. Instances of hash plugin type are: r5,
56790+ tea, rupasov.
56791+
56792+ Tail plugins (or, more precisely, tail policy plugins) determine
56793+ when last part of the file should be stored in a formatted item.
56794+
56795+ Scope and lookup:
56796+
56797+ Each plugin type and each plugin carry a label such that the pair ( type_label, plugin_label ) is unique. This
56798+ pair is a globally persistent and user-visible plugin
56799+ identifier. Internally the kernel maintains plugins and plugin types in
56800+ arrays, using an index into those arrays as plugin and plugin type
56801+ identifiers. The file-system, in turn, also maintains a persistent
56802+ "dictionary" which is a mapping from plugin label to the numerical
56803+ identifier which is stored in file-system objects. That is, we
56804+ store the offset into the plugin array for that plugin type as the
56805+ plugin id in the stat data of the filesystem object.
56806+
56807+ Internal kernel plugin type identifier (index in plugins[] array) is
56808+ of type reiser4_plugin_type. Set of available plugin types is
56809+ currently static, but dynamic loading doesn't seem to pose
56810+ insurmountable problems.
56811+
56812+ Within each type plugins are addressed by the identifiers of type
56813+ reiser4_plugin_id (indices in reiser4_plugin_type_data.builtin[]).
56814+ Such identifiers are only required to be unique within one type,
56815+ not globally.
56816+
56817+ Thus, plugin in memory is uniquely identified by the pair (type_id,
56818+ id).
56819+
56820+ Usage:
56821+
56822+ There exists only one instance of each plugin instance, but this
56823+ single instance can be associated with many entities (file-system
56824+ objects, items, nodes, transactions, file-descriptors etc.). The entity
56825+ to which a plugin of a given type is attached is termed (due to the lack of
56826+ imagination) the "subject" of this plugin type and, by abuse of
56827+ terminology, the subject of the particular instance of this type to which
56828+ it's currently attached. For example, an inode is a subject of the object
56829+ plugin type. An inode representing a directory is a subject of the directory
56830+ plugin, the hash plugin type, and some particular instance of the hash plugin
56831+ type. An inode representing a regular file is a subject of the "regular file"
56832+ plugin, the tail-policy plugin type, etc.
56833+
56834+ With each subject the plugin possibly stores some state. For example,
56835+ the state of a directory plugin (an instance of the object plugin type) is
56836+ a pointer to a hash plugin (if directories always use hashing, that is).
56837+
56838+ Interface:
56839+
56840+ In addition to a scalar identifier, each plugin type and plugin
56841+ proper has a "label": short string and a "description"---longer
56842+ descriptive string. Labels and descriptions of plugin types are
56843+ hard-coded into plugins[] array, declared and defined in
56844+ plugin.c. Label and description of plugin are stored in .label and
56845+ .desc fields of reiser4_plugin_header respectively. It's possible to
56846+ locate plugin by the pair of labels.
56847+
56848+ Features (not implemented):
56849+
56850+ . user-level plugin manipulations:
56851+ + reiser4("filename/..file_plugin<='audit'");
56852+ + write(open("filename/..file_plugin"), "audit", 8);
56853+
56854+ . user level utilities lsplug and chplug to manipulate plugins.
56855+ Utilities are not of primary priority. Possibly they will not
56856+ work on v4.0.
56857+
56858+ NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount
56859+ option, do you agree? I don't think that specifying it at mount time,
56860+ and then changing it with each mount, is a good model for usage.
56861+
56862+ . mount option "plug" to set-up plugins of root-directory.
56863+ "plug=foo:bar" will set "bar" as default plugin of type "foo".
56864+
56865+ Limitations:
56866+
56867+ . each plugin type has to provide at least one builtin
56868+ plugin. This is a technical limitation and it can be lifted in the
56869+ future.
56870+
56871+ TODO:
56872+
56873+ New plugin types/plugins:
56874+ Things we should be able to separately choose to inherit:
56875+
56876+ security plugins
56877+
56878+ stat data
56879+
56880+ file bodies
56881+
56882+ file plugins
56883+
56884+ dir plugins
56885+
56886+ . perm:acl
56887+
56888+ . audi---audit plugin intercepting and possibly logging all
56889+ accesses to an object. Requires putting stub functions in file_operations
56890+ instead of generic_file_*.
56891+
56892+NIKITA-FIXME-HANS: why make overflows a plugin?
56893+ . over---handle hash overflows
56894+
56895+ . sqnt---handle different access patterns and instruments read-ahead
56896+
56897+NIKITA-FIXME-HANS: describe the line below in more detail.
56898+
56899+ . hier---handle inheritance of plugins along file-system hierarchy
56900+
56901+ Different kinds of inheritance: on creation vs. on access.
56902+ Compatible/incompatible plugins.
56903+ Inheritance for multi-linked files.
56904+ Layered plugins.
56905+ Notion of plugin context is abandoned.
56906+
56907+Each file is associated
56908+ with one plugin, and dependent plugins (hash, etc.) are stored as
56909+ main plugin state. Now, if we have plugins used for regular files
56910+ but not for directories, how would such plugins be inherited?
56911+ . always store them with directories also
56912+
56913+NIKITA-FIXME-HANS: Do the line above. It is not exclusive of doing
56914+the line below which is also useful.
56915+
56916+ . use inheritance hierarchy, independent of file-system namespace
56917+*/
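+/*
+ * To make the (type_id, id) addressing above concrete, here is a minimal
+ * lookup sketch (illustration only, using helpers defined later in this
+ * file and ids from plugin.h):
+ *
+ *     reiser4_plugin *plug;
+ *
+ *     plug = plugin_by_unsafe_id(REISER4_HASH_PLUGIN_TYPE, R5_HASH_ID);
+ *     if (plug != NULL)
+ *             printk("found plugin: %s\n", plug->h.label);
+ */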
56918+
56919+#include "../debug.h"
56920+#include "../dformat.h"
56921+#include "plugin_header.h"
56922+#include "item/static_stat.h"
56923+#include "node/node.h"
56924+#include "security/perm.h"
56925+#include "space/space_allocator.h"
56926+#include "disk_format/disk_format.h"
56927+#include "plugin.h"
56928+#include "../reiser4.h"
56929+#include "../jnode.h"
56930+#include "../inode.h"
56931+
56932+#include <linux/fs.h> /* for struct super_block */
56933+
56934+/*
56935+ * init_plugins - initialize plugin sub-system.
56936+ * Just call this once on reiser4 startup.
56937+ *
56938+ * Initializes plugin sub-system. It is part of reiser4 module
56939+ * initialization. For each plugin of each type the init method is called and
56940+ * each plugin is put into the list of plugins.
56941+ */
56942+int init_plugins(void)
56943+{
56944+ reiser4_plugin_type type_id;
56945+
56946+ for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
56947+ struct reiser4_plugin_type_data *ptype;
56948+ int i;
56949+
56950+ ptype = &plugins[type_id];
56951+ assert("nikita-3508", ptype->label != NULL);
56952+ assert("nikita-3509", ptype->type_id == type_id);
56953+
56954+ INIT_LIST_HEAD(&ptype->plugins_list);
56955+/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */
56956+ for (i = 0; i < ptype->builtin_num; ++i) {
56957+ reiser4_plugin *plugin;
56958+
56959+ plugin = plugin_at(ptype, i);
56960+
56961+ if (plugin->h.label == NULL)
56962+ /* uninitialized slot encountered */
56963+ continue;
56964+ assert("nikita-3445", plugin->h.type_id == type_id);
56965+ plugin->h.id = i;
56966+ if (plugin->h.pops != NULL &&
56967+ plugin->h.pops->init != NULL) {
56968+ int result;
56969+
56970+ result = plugin->h.pops->init(plugin);
56971+ if (result != 0)
56972+ return result;
56973+ }
56974+ INIT_LIST_HEAD(&plugin->h.linkage);
56975+ list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
56976+ }
56977+ }
56978+ return 0;
56979+}
56980+
56981+/* true if plugin type id is valid */
56982+int is_plugin_type_valid(reiser4_plugin_type type)
56983+{
56984+ /* "type" is unsigned, so no comparison with 0 is
56985+ necessary */
56986+ return (type < REISER4_PLUGIN_TYPES);
56987+}
56988+
56989+/* true if plugin id is valid */
56990+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id)
56991+{
56992+ assert("nikita-1653", is_plugin_type_valid(type));
56993+ return id < plugins[type].builtin_num;
56994+}
56995+
56996+/* return plugin by its @type and @id.
56997+
56998+ Both arguments are checked for validity: this is supposed to be called
56999+ with values coming from user level.
57000+
57001+NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
57002+user space, and passed to the filesystem by use of method files? Your
57003+comment really confused me on the first reading....
57004+
57005+*/
57006+reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type
57007+ * unchecked */,
57008+ reiser4_plugin_id id /* plugin id,
57009+ * unchecked */)
57010+{
57011+ if (is_plugin_type_valid(type)) {
57012+ if (is_plugin_id_valid(type, id))
57013+ return plugin_at(&plugins[type], id);
57014+ else
57015+ /* id out of bounds */
57016+ warning("nikita-2913",
57017+ "Invalid plugin id: [%i:%i]", type, id);
57018+ } else
57019+ /* type_id out of bounds */
57020+ warning("nikita-2914", "Invalid type_id: %i", type);
57021+ return NULL;
57022+}
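+/*
+ * A typical call-site pattern for the unsafe lookup above (sketch only):
+ *
+ *     reiser4_plugin *plug;
+ *
+ *     plug = plugin_by_unsafe_id(type, id);
+ *     if (plug == NULL)
+ *             return RETERR(-EINVAL);
+ */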
57023+
57024+/**
57025+ * save_plugin_id - store plugin id in disk format
57026+ * @plugin: plugin to convert
57027+ * @area: where to store result
57028+ *
57029+ * Puts id of @plugin in little endian format to address @area.
57030+ */
57031+int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
57032+ d16 *area /* where to store result */ )
57033+{
57034+ assert("nikita-1261", plugin != NULL);
57035+ assert("nikita-1262", area != NULL);
57036+
57037+ put_unaligned(cpu_to_le16(plugin->h.id), area);
57038+ return 0;
57039+}
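+/*
+ * The reverse direction goes through plugin_by_disk_id() (wrapped by the
+ * PLUGIN_BY_ID macro in plugin.h), which is expected to decode the little
+ * endian d16 written here. A sketch, where sd_area is a hypothetical
+ * on-disk stat-data field:
+ *
+ *     hash_plugin *hplug = hash_plugin_by_disk_id(tree, &sd_area);
+ */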
57040+
57041+/* list of all plugins of given type */
57042+struct list_head *get_plugin_list(reiser4_plugin_type type)
57043+{
57044+ assert("nikita-1056", is_plugin_type_valid(type));
57045+ return &plugins[type].plugins_list;
57046+}
57047+
57048+static void update_pset_mask(reiser4_inode * info, pset_member memb)
57049+{
57050+ struct dentry *rootdir;
57051+ reiser4_inode *root;
57052+
57053+ assert("edward-1443", memb != PSET_FILE);
57054+
57055+ rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
57056+ if (rootdir != NULL) {
57057+ root = reiser4_inode_data(rootdir->d_inode);
57058+ /*
57059+ * if the plugin differs from the default one, or we are
57060+ * changing a plugin of the root directory, update plugin_mask
57061+ */
57062+ if (aset_get(info->pset, memb) !=
57063+ aset_get(root->pset, memb) ||
57064+ info == root)
57065+ info->plugin_mask |= (1 << memb);
57066+ else
57067+ info->plugin_mask &= ~(1 << memb);
57068+ }
57069+}
57070+
57071+/* Get specified plugin set member from parent,
57072+ or from fs-defaults (if no parent is given) and
57073+ install the result into the pset of @self */
57074+int grab_plugin_pset(struct inode *self,
57075+ struct inode *ancestor,
57076+ pset_member memb)
57077+{
57078+ reiser4_plugin *plug;
57079+ reiser4_inode *info;
57080+ int result = 0;
57081+
57082+ /* Do not grab if initialised already. */
57083+ info = reiser4_inode_data(self);
57084+ if (aset_get(info->pset, memb) != NULL)
57085+ return 0;
57086+ if (ancestor) {
57087+ reiser4_inode *parent;
57088+
57089+ parent = reiser4_inode_data(ancestor);
57090+ plug = aset_get(parent->hset, memb) ? :
57091+ aset_get(parent->pset, memb);
57092+ }
57093+ else
57094+ plug = get_default_plugin(memb);
57095+
57096+ result = set_plugin(&info->pset, memb, plug);
57097+ if (result == 0) {
57098+ if (!ancestor || self->i_sb->s_root->d_inode != self)
57099+ update_pset_mask(info, memb);
57100+ }
57101+ return result;
57102+}
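+/*
+ * Sketch of a call site (illustration only): let a newly created child
+ * inherit its hash plugin from @parent, falling back to fs defaults when
+ * @parent is NULL:
+ *
+ *     result = grab_plugin_pset(child, parent, PSET_HASH);
+ */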
57103+
57104+/* Take missing pset members from root inode */
57105+int finish_pset(struct inode *inode)
57106+{
57107+ reiser4_plugin *plug;
57108+ reiser4_inode *root;
57109+ reiser4_inode *info;
57110+ pset_member memb;
57111+ int result = 0;
57112+
57113+ root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
57114+ info = reiser4_inode_data(inode);
57115+
57116+ assert("edward-1455", root != NULL);
57117+ assert("edward-1456", info != NULL);
57118+
57119+ /* file and directory plugins are already initialized. */
57120+ for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) {
57121+
57122+ /* Do not grab if initialised already. */
57123+ if (aset_get(info->pset, memb) != NULL)
57124+ continue;
57125+
57126+ plug = aset_get(root->pset, memb);
57127+ result = set_plugin(&info->pset, memb, plug);
57128+ if (result != 0)
57129+ break;
57130+ }
57131+ if (result != 0) {
57132+ warning("nikita-3447",
57133+ "Cannot set up plugins for %lli",
57134+ (unsigned long long)
57135+ get_inode_oid(inode));
57136+ }
57137+ return result;
57138+}
57139+
57140+int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin * plug)
57141+{
57142+ reiser4_inode *info;
57143+ int result = 0;
57144+
57145+ if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) {
57146+ /* Changing pset in the root object. */
57147+ return RETERR(-EINVAL);
57148+ }
57149+
57150+ info = reiser4_inode_data(self);
57151+ if (plug->h.pops != NULL && plug->h.pops->change != NULL)
57152+ result = plug->h.pops->change(self, plug, memb);
57153+ else
57154+ result = aset_set_unsafe(&info->pset, memb, plug);
57155+ if (result == 0) {
57156+ __u16 oldmask = info->plugin_mask;
57157+
57158+ update_pset_mask(info, memb);
57159+ if (oldmask != info->plugin_mask)
57160+ reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN);
57161+ }
57162+ return result;
57163+}
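+/*
+ * Example (sketch only): forcing a tail policy on a non-root object,
+ * using ids defined in plugin.h:
+ *
+ *     reiser4_plugin *plug;
+ *
+ *     plug = plugin_by_unsafe_id(REISER4_FORMATTING_PLUGIN_TYPE,
+ *                                NEVER_TAILS_FORMATTING_ID);
+ *     if (plug != NULL)
+ *             result = force_plugin_pset(inode, PSET_FORMATTING, plug);
+ */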
57164+
57165+struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
57166+ /* C99 designated initializers */
57167+ [REISER4_FILE_PLUGIN_TYPE] = {
57168+ .type_id = REISER4_FILE_PLUGIN_TYPE,
57169+ .label = "file",
57170+ .desc = "Object plugins",
57171+ .builtin_num = sizeof_array(file_plugins),
57172+ .builtin = file_plugins,
57173+ .plugins_list = {NULL, NULL},
57174+ .size = sizeof(file_plugin)
57175+ },
57176+ [REISER4_DIR_PLUGIN_TYPE] = {
57177+ .type_id = REISER4_DIR_PLUGIN_TYPE,
57178+ .label = "dir",
57179+ .desc = "Directory plugins",
57180+ .builtin_num = sizeof_array(dir_plugins),
57181+ .builtin = dir_plugins,
57182+ .plugins_list = {NULL, NULL},
57183+ .size = sizeof(dir_plugin)
57184+ },
57185+ [REISER4_HASH_PLUGIN_TYPE] = {
57186+ .type_id = REISER4_HASH_PLUGIN_TYPE,
57187+ .label = "hash",
57188+ .desc = "Directory hashes",
57189+ .builtin_num = sizeof_array(hash_plugins),
57190+ .builtin = hash_plugins,
57191+ .plugins_list = {NULL, NULL},
57192+ .size = sizeof(hash_plugin)
57193+ },
57194+ [REISER4_FIBRATION_PLUGIN_TYPE] = {
57195+ .type_id =
57196+ REISER4_FIBRATION_PLUGIN_TYPE,
57197+ .label = "fibration",
57198+ .desc = "Directory fibrations",
57199+ .builtin_num = sizeof_array(fibration_plugins),
57200+ .builtin = fibration_plugins,
57201+ .plugins_list = {NULL, NULL},
57202+ .size = sizeof(fibration_plugin)
57203+ },
57204+ [REISER4_CIPHER_PLUGIN_TYPE] = {
57205+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
57206+ .label = "cipher",
57207+ .desc = "Cipher plugins",
57208+ .builtin_num = sizeof_array(cipher_plugins),
57209+ .builtin = cipher_plugins,
57210+ .plugins_list = {NULL, NULL},
57211+ .size = sizeof(cipher_plugin)
57212+ },
57213+ [REISER4_DIGEST_PLUGIN_TYPE] = {
57214+ .type_id = REISER4_DIGEST_PLUGIN_TYPE,
57215+ .label = "digest",
57216+ .desc = "Digest plugins",
57217+ .builtin_num = sizeof_array(digest_plugins),
57218+ .builtin = digest_plugins,
57219+ .plugins_list = {NULL, NULL},
57220+ .size = sizeof(digest_plugin)
57221+ },
57222+ [REISER4_COMPRESSION_PLUGIN_TYPE] = {
57223+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
57224+ .label = "compression",
57225+ .desc = "Compression plugins",
57226+ .builtin_num = sizeof_array(compression_plugins),
57227+ .builtin = compression_plugins,
57228+ .plugins_list = {NULL, NULL},
57229+ .size = sizeof(compression_plugin)
57230+ },
57231+ [REISER4_FORMATTING_PLUGIN_TYPE] = {
57232+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
57233+ .label = "formatting",
57234+ .desc = "Tail inlining policies",
57235+ .builtin_num = sizeof_array(formatting_plugins),
57236+ .builtin = formatting_plugins,
57237+ .plugins_list = {NULL, NULL},
57238+ .size = sizeof(formatting_plugin)
57239+ },
57240+ [REISER4_PERM_PLUGIN_TYPE] = {
57241+ .type_id = REISER4_PERM_PLUGIN_TYPE,
57242+ .label = "perm",
57243+ .desc = "Permission checks",
57244+ .builtin_num = sizeof_array(perm_plugins),
57245+ .builtin = perm_plugins,
57246+ .plugins_list = {NULL, NULL},
57247+ .size = sizeof(perm_plugin)
57248+ },
57249+ [REISER4_ITEM_PLUGIN_TYPE] = {
57250+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
57251+ .label = "item",
57252+ .desc = "Item handlers",
57253+ .builtin_num = sizeof_array(item_plugins),
57254+ .builtin = item_plugins,
57255+ .plugins_list = {NULL, NULL},
57256+ .size = sizeof(item_plugin)
57257+ },
57258+ [REISER4_NODE_PLUGIN_TYPE] = {
57259+ .type_id = REISER4_NODE_PLUGIN_TYPE,
57260+ .label = "node",
57261+ .desc = "node layout handlers",
57262+ .builtin_num = sizeof_array(node_plugins),
57263+ .builtin = node_plugins,
57264+ .plugins_list = {NULL, NULL},
57265+ .size = sizeof(node_plugin)
57266+ },
57267+ [REISER4_SD_EXT_PLUGIN_TYPE] = {
57268+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
57269+ .label = "sd_ext",
57270+ .desc = "Parts of stat-data",
57271+ .builtin_num = sizeof_array(sd_ext_plugins),
57272+ .builtin = sd_ext_plugins,
57273+ .plugins_list = {NULL, NULL},
57274+ .size = sizeof(sd_ext_plugin)
57275+ },
57276+ [REISER4_FORMAT_PLUGIN_TYPE] = {
57277+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
57278+ .label = "disk_layout",
57279+ .desc = "defines filesystem on disk layout",
57280+ .builtin_num = sizeof_array(format_plugins),
57281+ .builtin = format_plugins,
57282+ .plugins_list = {NULL, NULL},
57283+ .size = sizeof(disk_format_plugin)
57284+ },
57285+ [REISER4_JNODE_PLUGIN_TYPE] = {
57286+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
57287+ .label = "jnode",
57288+ .desc = "defines kind of jnode",
57289+ .builtin_num = sizeof_array(jnode_plugins),
57290+ .builtin = jnode_plugins,
57291+ .plugins_list = {NULL, NULL},
57292+ .size = sizeof(jnode_plugin)
57293+ },
57294+ [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
57295+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
57296+ .label = "compression_mode",
57297+ .desc = "Defines compression mode",
57298+ .builtin_num = sizeof_array(compression_mode_plugins),
57299+ .builtin = compression_mode_plugins,
57300+ .plugins_list = {NULL, NULL},
57301+ .size = sizeof(compression_mode_plugin)
57302+ },
57303+ [REISER4_CLUSTER_PLUGIN_TYPE] = {
57304+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
57305+ .label = "cluster",
57306+ .desc = "Defines cluster size",
57307+ .builtin_num = sizeof_array(cluster_plugins),
57308+ .builtin = cluster_plugins,
57309+ .plugins_list = {NULL, NULL},
57310+ .size = sizeof(cluster_plugin)
57311+ }
57312+};
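+/*
+ * The table above is indexed directly by reiser4_plugin_type, so generic
+ * code reaches any builtin instance in two steps (sketch):
+ *
+ *     struct reiser4_plugin_type_data *ptype;
+ *     reiser4_plugin *plug;
+ *
+ *     ptype = &plugins[REISER4_HASH_PLUGIN_TYPE];
+ *     plug = plugin_at(ptype, R5_HASH_ID);
+ */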
57313+
57314+/*
57315+ * Local variables:
57316+ * c-indentation-style: "K&R"
57317+ * mode-name: "LC"
57318+ * c-basic-offset: 8
57319+ * tab-width: 8
57320+ * fill-column: 120
57321+ * End:
57322+ */
57323diff -urN linux-2.6.23.orig/fs/reiser4/plugin/plugin.h linux-2.6.23/fs/reiser4/plugin/plugin.h
57324--- linux-2.6.23.orig/fs/reiser4/plugin/plugin.h 1970-01-01 03:00:00.000000000 +0300
57325+++ linux-2.6.23/fs/reiser4/plugin/plugin.h 2007-12-04 16:49:30.000000000 +0300
57326@@ -0,0 +1,936 @@
57327+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
57328+
57329+/* Basic plugin data-types.
57330+ see fs/reiser4/plugin/plugin.c for details */
57331+
57332+#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ )
57333+#define __FS_REISER4_PLUGIN_TYPES_H__
57334+
57335+#include "../forward.h"
57336+#include "../debug.h"
57337+#include "../dformat.h"
57338+#include "../key.h"
57339+#include "compress/compress.h"
57340+#include "crypto/cipher.h"
57341+#include "plugin_header.h"
57342+#include "item/static_stat.h"
57343+#include "item/internal.h"
57344+#include "item/sde.h"
57345+#include "item/cde.h"
57346+#include "item/item.h"
57347+#include "node/node.h"
57348+#include "node/node40.h"
57349+#include "security/perm.h"
57350+#include "fibration.h"
57351+
57352+#include "space/bitmap.h"
57353+#include "space/space_allocator.h"
57354+
57355+#include "disk_format/disk_format40.h"
57356+#include "disk_format/disk_format.h"
57357+
57358+#include <linux/fs.h> /* for struct super_block, address_space */
57359+#include <linux/mm.h> /* for struct page */
57360+#include <linux/buffer_head.h> /* for struct buffer_head */
57361+#include <linux/dcache.h> /* for struct dentry */
57362+#include <linux/types.h>
57363+#include <linux/crypto.h>
57364+
57365+typedef struct reiser4_object_on_wire reiser4_object_on_wire;
57366+
57367+/*
57368+ * File plugin. Defines the set of methods that file plugins implement, some
57369+ * of which are optional.
57370+ *
57371+ * A file plugin offers to the caller an interface for IO (writing to and/or
57372+ * reading from) to what the caller sees as one sequence of bytes. An IO to it
57373+ * may affect more than one physical sequence of bytes, or no physical sequence
57374+ * of bytes; it may affect sequences of bytes offered by other file plugins to
57375+ * the semantic layer, and the file plugin may invoke other plugins and
57376+ * delegate work to them, but its interface is structured for offering the
57377+ * caller the ability to read and/or write what the caller sees as being a
57378+ * single sequence of bytes.
57379+ *
57380+ * The file plugin must present a sequence of bytes to the caller, but it does
57381+ * not necessarily have to store a sequence of bytes, nor does it necessarily
57382+ * have to support efficient tree traversal to any offset in the sequence of
57383+ * bytes (tail and extent items, whose keys contain offsets, do however provide
57384+ * efficient non-sequential lookup of any offset in the sequence of bytes).
57385+ *
57386+ * Directory plugins provide methods for selecting file plugins by resolving a
57387+ * name for them.
57388+ *
57389+ * The functionality other filesystems call an attribute, and rigidly tie
57390+ * together, we decompose into orthogonal selectable features of files. Using
57391+ * the terminology we will define next, an attribute is a perhaps constrained,
57392+ * perhaps static length, file whose parent has a uni-count-intra-link to it,
57393+ * which might be grandparent-major-packed, and whose parent has a deletion
57394+ * method that deletes it.
57395+ *
57396+ * File plugins can implement constraints.
57397+ *
57398+ * Files can be of variable length (e.g. regular unix files), or of static
57399+ * length (e.g. static sized attributes).
57400+ *
57401+ * An object may have many sequences of bytes, and many file plugins, but it
57402+ * has exactly one objectid. It is usually desirable that an object has a
57403+ * deletion method which deletes every item with that objectid. Items cannot
57404+ * in general be found by just their objectids. This means that an object must
57405+ * have either a method built into its deletion plugin method for knowing what
57406+ * items need to be deleted, or links stored with the object that provide the
57407+ * plugin with a method for finding those items. Deleting a file within an
57408+ * object may or may not have the effect of deleting the entire object,
57409+ * depending on the file plugin's deletion method.
57410+ *
57411+ * LINK TAXONOMY:
57412+ *
57413+ * Many objects have a reference count, and when the reference count reaches 0
57414+ * the object's deletion method is invoked. Some links embody a reference
57415+ * count increase ("countlinks"), and others do not ("nocountlinks").
57416+ *
57417+ * Some links are bi-directional links ("bilinks"), and some are
57418+ * uni-directional("unilinks").
57419+ *
57420+ * Some links are between parts of the same object ("intralinks"), and some are
57421+ * between different objects ("interlinks").
57422+ *
57423+ * PACKING TAXONOMY:
57424+ *
57425+ * Some items of an object are stored with a major packing locality based on
57426+ * their object's objectid (e.g. unix directory items in plan A), and these are
57427+ * called "self-major-packed".
57428+ *
57429+ * Some items of an object are stored with a major packing locality based on
57430+ * their semantic parent object's objectid (e.g. unix file bodies in plan A),
57431+ * and these are called "parent-major-packed".
57432+ *
57433+ * Some items of an object are stored with a major packing locality based on
57434+ * their semantic grandparent, and these are called "grandparent-major-packed".
57435+ * Now carefully notice that we run into trouble with key length if we have to
57436+ * store an 8 byte major+minor grandparent based packing locality, an 8 byte
57437+ * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
57438+ * a 24 byte key. One of these fields must be sacrificed if an item is to be
57439+ * grandparent-major-packed, and which to sacrifice is left to the item author
57440+ * choosing to make the item grandparent-major-packed. You cannot make tail
57441+ * items and extent items grandparent-major-packed, though you could make them
57442+ * self-major-packed (usually they are parent-major-packed).
57443+ *
57444+ * In the case of ACLs (which are composed of fixed length ACEs which consist
57445+ * of {subject-type, subject, and permission bitmask} triples), it makes sense
57446+ * to not have an offset field in the ACE item key, and to allow duplicate keys
57447+ * for ACEs. Thus, the set of ACEs for a given file is found by looking for a
57448+ * key consisting of the objectid of the grandparent (thus grouping all ACLs in
57449+ * a directory together), the minor packing locality of ACE, the objectid of
57450+ * the file, and 0.
57451+ *
57452+ * IO involves moving data from one location to another, which means that two
57453+ * locations must be specified, source and destination.
57454+ *
57455+ * This source and destination can be in the filesystem, or they can be a
57456+ * pointer in the user process address space plus a byte count.
57457+ *
57458+ * If both source and destination are in the filesystem, then at least one of
57459+ * them must be representable as a pure stream of bytes (which we call a flow,
57460+ * and define as a struct containing a key, a data pointer, and a length).
57461+ * This may mean converting one of them into a flow. We provide a generic
57462+ * cast_into_flow() method, which will work for any plugin supporting
57463+ * read_flow(), though it is inefficiently implemented in that it temporarily
57464+ * stores the flow in a buffer (Question: what to do with huge flows that
57465+ * cannot fit into memory? Answer: we must not convert them all at once.)
57466+ *
57467+ * Performing a write requires resolving the write request into a flow defining
57468+ * the source, and a method that performs the write, and a key that defines
57469+ * where in the tree the write is to go.
57470+ *
57471+ * Performing a read requires resolving the read request into a flow defining
57472+ * the target, and a method that performs the read, and a key that defines
57473+ * where in the tree the read is to come from.
57474+ *
57475+ * There will exist file plugins which have no pluginid stored on the disk for
57476+ * them, and which are only invoked by other plugins.
57477+ */
57478+
57479+/* This should be incremented with each new contributed
57480+ pair (plugin type, plugin id).
57481+ NOTE: Make sure there is a release of reiser4progs
57482+ with the corresponding version number */
57483+#define PLUGIN_LIBRARY_VERSION 0
57484+
57485+ /* enumeration of fields within plugin_set */
57486+typedef enum {
57487+ PSET_FILE,
57488+ PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements:
57489+ * inode.c:read_inode() depends on this. */
57490+ PSET_PERM,
57491+ PSET_FORMATTING,
57492+ PSET_HASH,
57493+ PSET_FIBRATION,
57494+ PSET_SD,
57495+ PSET_DIR_ITEM,
57496+ PSET_CIPHER,
57497+ PSET_DIGEST,
57498+ PSET_COMPRESSION,
57499+ PSET_COMPRESSION_MODE,
57500+ PSET_CLUSTER,
57501+ PSET_CREATE,
57502+ PSET_LAST
57503+} pset_member;
57504+
57505+/* builtin file-plugins */
57506+typedef enum {
57507+ /* regular file */
57508+ UNIX_FILE_PLUGIN_ID,
57509+ /* directory */
57510+ DIRECTORY_FILE_PLUGIN_ID,
57511+ /* symlink */
57512+ SYMLINK_FILE_PLUGIN_ID,
57513+ /* for objects completely handled by the VFS: fifos, devices,
57514+ sockets */
57515+ SPECIAL_FILE_PLUGIN_ID,
57516+ /* regular cryptcompress file */
57517+ CRYPTCOMPRESS_FILE_PLUGIN_ID,
57518+ /* number of file plugins. Used as size of arrays to hold
57519+ file plugins. */
57520+ LAST_FILE_PLUGIN_ID
57521+} reiser4_file_id;
57522+
57523+typedef struct file_plugin {
57524+
57525+ /* generic fields */
57526+ plugin_header h;
57527+
57528+ /* VFS methods.
57529+ * Must be invariant with respect to plugin conversion.
57530+ * It can be achieved by using "common" methods, which
57531+ * are the same for all plugins that participate in
57532+ * conversion, or by using "generic" or "careful" methods,
57533+ * which provide automatic redirection to proper private
57534+ * plugin methods ("careful" are the same as "generic",
57535+ * but with protection of pset and other disk structures
57536+ * from being rebuilt during conversion).
57537+ */
57538+ struct inode_operations * inode_ops;
57539+ struct file_operations * file_ops;
57540+ struct address_space_operations * as_ops;
57541+ /**
57542+ * Private methods. These are optional. If used they will allow you
57543+ * to minimize the amount of code needed to implement a deviation
57544+ * from some other method that also uses them.
57545+ */
57546+ /*
57547+ * private inode_ops
57548+ */
57549+ int (*setattr)(struct dentry *, struct iattr *);
57550+ /*
57551+ * private file_ops
57552+ */
57553+ /* do whatever is necessary to do when object is opened */
57554+ int (*open) (struct inode * inode, struct file * file);
57555+ ssize_t (*read) (struct file *, char __user *buf, size_t read_amount,
57556+ loff_t *off);
57557+ /* write a file;
57558+ * perform file plugin conversion (if needed);
57559+ * set @*conv to 1, if the conversion occurred */
57560+ ssize_t (*write) (struct file *, const char __user *buf,
57561+ size_t write_amount, loff_t * off, int * conv);
57562+ int (*ioctl) (struct inode *inode, struct file *filp,
57563+ unsigned int cmd, unsigned long arg);
57564+ int (*mmap) (struct file *, struct vm_area_struct *);
57565+ int (*release) (struct inode *, struct file *);
57566+ /*
57567+ * private a_ops
57568+ */
57569+ int (*readpage) (struct file *file, struct page *page);
57570+ int (*readpages)(struct file *file, struct address_space *mapping,
57571+ struct list_head *pages, unsigned nr_pages);
57572+ int (*writepages)(struct address_space *mapping,
57573+ struct writeback_control *wbc);
57574+ int (*prepare_write)(struct file *file, struct page *page,
57575+ unsigned from, unsigned to);
57576+ int (*commit_write)(struct file *file, struct page *page,
57577+ unsigned from, unsigned to);
57578+ sector_t (*bmap) (struct address_space * mapping, sector_t lblock);
57579+ /* other private methods */
57580+ /* save inode cached stat-data onto disk. It was called
57581+ reiserfs_update_sd() in 3.x */
57582+ int (*write_sd_by_inode) (struct inode *);
57583+ /*
57584+ * Construct flow into @flow according to user-supplied data.
57585+ *
57586+ * This is used by read/write methods to construct a flow to
57587+ * write/read. ->flow_by_inode() is a plugin method, rather than a single
57588+ * global implementation, because the key in a flow used by a plugin may
57589+ * depend on data in @buf.
57590+ *
57591+ * NIKITA-FIXME-HANS: please create statistics on what functions are
57592+ * dereferenced how often for the mongo benchmark. You can supervise
57593+ * Elena doing this for you if that helps. Email me the list of the
57594+ * top 10, with their counts, and an estimate of the total number of
57595+ * CPU cycles spent dereferencing as a percentage of CPU cycles spent
57596+ * processing (non-idle processing). If the total percent is, say,
57597+ * less than 1%, it will make our coding discussions much easier, and
57598+ * keep me from questioning whether functions like the below are too
57599+ * frequently called to be dereferenced. If the total percent is more
57600+ * than 1%, perhaps private methods should be listed in a "required"
57601+ * comment at the top of each plugin (with stern language about how if
57602+ * the comment is missing it will not be accepted by the maintainer),
57603+ * and implemented using macros not dereferenced functions. How about
57604+ * replacing this whole private methods part of the struct with a
57605+ * thorough documentation of what the standard helper functions are for
57606+ * use in constructing plugins? I think users have been asking for
57607+ * that, though not in so many words.
57608+ */
57609+ int (*flow_by_inode) (struct inode *, const char __user *buf,
57610+ int user, loff_t size,
57611+ loff_t off, rw_op op, flow_t *);
57612+ /*
57613+ * Return the key used to retrieve an offset of a file. It is used by
57614+ * default implementation of ->flow_by_inode() method
57615+ * (common_build_flow()) and, among other things, to get to the extent
57616+ * from jnode of unformatted node.
57617+ */
57618+ int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
57619+
57620+ /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */
57621+ /*
57622+ * set the plugin for a file. Called during file creation in creat()
57623+ * but not reiser4() unless an inode already exists for the file.
57624+ */
57625+ int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
57626+ reiser4_object_create_data *);
57627+
57628+ /* NIKITA-FIXME-HANS: comment and name seem to say different things,
57629+ * are you setting up the object itself also or just adjusting the
57630+ * parent?.... */
57631+ /* set up plugins for new @object created in @parent. @root is root
57632+ directory. */
57633+ int (*adjust_to_parent) (struct inode *object, struct inode *parent,
57634+ struct inode *root);
57635+ /*
57636+ * this does whatever is necessary when an object is created. For
57637+ * instance, for unix files stat data is inserted. It is supposed to be
57638+ * called by create of struct inode_operations.
57639+ */
57640+ int (*create_object) (struct inode *object, struct inode *parent,
57641+ reiser4_object_create_data *);
57642+ /*
57643+ * this method should check REISER4_NO_SD and set REISER4_NO_SD on
57644+ * success. Deletion of an object usually includes removal of items
57645+ * building file body (for directories this is removal of "." and "..")
57646+ * and removal of stat-data item.
57647+ */
57648+ int (*delete_object) (struct inode *);
57649+
57650+ /* add link from @parent to @object */
57651+ int (*add_link) (struct inode *object, struct inode *parent);
57652+
57653+ /* remove link from @parent to @object */
57654+ int (*rem_link) (struct inode *object, struct inode *parent);
57655+
57656+ /*
57657+ * return true if item addressed by @coord belongs to @inode. This is
57658+ * used by read/write to properly slice flow into items in presence of
57659+ * multiple key assignment policies, because items of a file are not
57660+ * necessarily contiguous in a key space, for example, in a plan-b.
57661+ */
57662+ int (*owns_item) (const struct inode *, const coord_t *);
57663+
57664+ /* checks whether yet another hard link to this object can be
57665+ added */
57666+ int (*can_add_link) (const struct inode *);
57667+
57668+ /* checks whether hard links to this object can be removed */
57669+ int (*can_rem_link) (const struct inode *);
57670+
57671+ /* currently not empty only for DIRECTORY_FILE_PLUGIN_ID. It calls
57672+ detach of the directory plugin to remove ".." */
57673+ int (*detach) (struct inode * child, struct inode * parent);
57674+
57675+ /* called when @child was just looked up in the @parent. It is not
57676+ empty only for DIRECTORY_FILE_PLUGIN_ID, where it calls attach of
57677+ the directory plugin */
57678+ int (*bind) (struct inode * child, struct inode * parent);
57679+
57680+ /* process safe-link during mount */
57681+ int (*safelink) (struct inode * object, reiser4_safe_link_t link,
57682+ __u64 value);
57683+
57684+ /* The couple of estimate methods for all file operations */
57685+ struct {
57686+ reiser4_block_nr(*create) (const struct inode *);
57687+ reiser4_block_nr(*update) (const struct inode *);
57688+ reiser4_block_nr(*unlink) (const struct inode *,
57689+ const struct inode *);
57690+ } estimate;
57691+
57692+ /*
57693+ * reiser4 specific part of inode has a union of structures which are
57694+ * specific to a plugin. This method is called when inode is read
57695+ * (read_inode) and when file is created (common_create_child) so that
57696+ * file plugin could initialize its inode data
57697+ */
57698+ void (*init_inode_data) (struct inode *, reiser4_object_create_data *,
57699+ int);
57700+
57701+ /*
57702+ * This method performs progressive deletion of items and whole nodes
57703+ * from right to left.
57704+ *
57705+ * @tap: the point deletion process begins from,
57706+ * @from_key: the beginning of the deleted key range,
57707+ * @to_key: the end of the deleted key range,
57708+ * @smallest_removed: the smallest removed key,
57709+ *
57710+ * @return: 0 on success, an error code otherwise; -E_REPEAT means that a long
57711+ * cut_tree operation was interrupted to allow an atom commit.
57712+ */
57713+ int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
57714+ const reiser4_key * to_key,
57715+ reiser4_key * smallest_removed, struct inode *,
57716+ int, int *);
57717+
57718+ /* called from ->destroy_inode() */
57719+ void (*destroy_inode) (struct inode *);
57720+
57721+ /*
57722+ * methods to serialize object identity. This is used, for example, by
57723+ * reiser4_{en,de}code_fh().
57724+ */
57725+ struct {
57726+ /* store object's identity at @area */
57727+ char *(*write) (struct inode * inode, char *area);
57728+ /* parse object from wire to the @obj */
57729+ char *(*read) (char *area, reiser4_object_on_wire * obj);
57730+ /* given object identity in @obj, find or create its dentry */
57731+ struct dentry *(*get) (struct super_block * s,
57732+ reiser4_object_on_wire * obj);
57733+ /* how many bytes ->wire.write() consumes */
57734+ int (*size) (struct inode * inode);
57735+ /* finish with object identity */
57736+ void (*done) (reiser4_object_on_wire * obj);
57737+ } wire;
57738+} file_plugin;
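+/*
+ * All file activity is dispatched indirectly through a method table like
+ * the one above. A sketch of a caller-side check (illustration only;
+ * file_plugin_by_id() is generated by the PLUGIN_BY_ID macro below):
+ *
+ *     file_plugin *fplug = file_plugin_by_id(UNIX_FILE_PLUGIN_ID);
+ *
+ *     if (fplug->can_add_link != NULL && !fplug->can_add_link(inode))
+ *             return RETERR(-EMLINK);
+ */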
57739+
57740+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
57741+
57742+struct reiser4_object_on_wire {
57743+ file_plugin *plugin;
57744+ union {
57745+ struct {
57746+ obj_key_id key_id;
57747+ } std;
57748+ void *generic;
57749+ } u;
57750+};
57751+
57752+/* builtin dir-plugins */
57753+typedef enum {
57754+ HASHED_DIR_PLUGIN_ID,
57755+ SEEKABLE_HASHED_DIR_PLUGIN_ID,
57756+ LAST_DIR_ID
57757+} reiser4_dir_id;
57758+
57759+typedef struct dir_plugin {
57760+ /* generic fields */
57761+ plugin_header h;
57762+
57763+ struct inode_operations * inode_ops;
57764+ struct file_operations * file_ops;
57765+ struct address_space_operations * as_ops;
57766+
57767+ /*
57768+ * private methods: These are optional. If used they will allow you to
57769+ * minimize the amount of code needed to implement a deviation from
57770+ * some other method that uses them. You could logically argue that
57771+ * they should be a separate type of plugin.
57772+ */
57773+
57774+ struct dentry *(*get_parent) (struct inode * childdir);
57775+
57776+ /*
57777+ * check whether "name" is acceptable name to be inserted into this
57778+ * object. Optionally implemented by directory-like objects. Can check
57779+ * for maximal length, reserved symbols etc
57780+ */
57781+ int (*is_name_acceptable) (const struct inode * inode, const char *name,
57782+ int len);
57783+
57784+ void (*build_entry_key) (const struct inode * dir /* directory where
57785+ * entry is (or will
57786+ * be) in.*/ ,
57787+ const struct qstr * name /* name of file
57788+ * referenced by this
57789+ * entry */ ,
57790+ reiser4_key * result /* resulting key of
57791+ * directory entry */ );
57792+ int (*build_readdir_key) (struct file * dir, reiser4_key * result);
57793+ int (*add_entry) (struct inode * object, struct dentry * where,
57794+ reiser4_object_create_data * data,
57795+ reiser4_dir_entry_desc * entry);
57796+ int (*rem_entry) (struct inode * object, struct dentry * where,
57797+ reiser4_dir_entry_desc * entry);
57798+
57799+ /*
57800+ * initialize directory structure for newly created object. For normal
57801+ * unix directories, insert dot and dotdot.
57802+ */
57803+ int (*init) (struct inode * object, struct inode * parent,
57804+ reiser4_object_create_data * data);
57805+
57806+ /* destroy directory */
57807+ int (*done) (struct inode * child);
57808+
57809+ /* called when @subdir was just looked up in the @dir */
57810+ int (*attach) (struct inode * subdir, struct inode * dir);
57811+ int (*detach) (struct inode * subdir, struct inode * dir);
57812+
57813+ struct {
57814+ reiser4_block_nr(*add_entry) (const struct inode *);
57815+ reiser4_block_nr(*rem_entry) (const struct inode *);
57816+ reiser4_block_nr(*unlink) (const struct inode *,
57817+ const struct inode *);
57818+ } estimate;
57819+} dir_plugin;
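+/*
+ * Sketch of how the directory method table is used to build the key of a
+ * directory entry, given a directory inode dir and a dentry (illustration
+ * only):
+ *
+ *     reiser4_key key;
+ *     dir_plugin *dplug = dir_plugin_by_id(HASHED_DIR_PLUGIN_ID);
+ *
+ *     dplug->build_entry_key(dir, &dentry->d_name, &key);
+ */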
57820+
57821+extern dir_plugin dir_plugins[LAST_DIR_ID];
57822+
57823+typedef struct formatting_plugin {
57824+ /* generic fields */
57825+ plugin_header h;
57826+ /* returns non-zero iff file's tail has to be stored
57827+ in a direct item. */
57828+ int (*have_tail) (const struct inode * inode, loff_t size);
57829+} formatting_plugin;
57830+
57831+typedef struct hash_plugin {
57832+ /* generic fields */
57833+ plugin_header h;
57834+ /* computes hash of the given name */
57835+ __u64(*hash) (const unsigned char *name, int len);
57836+} hash_plugin;
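+/*
+ * For example (sketch), hashing a directory entry name with the builtin
+ * r5 hash:
+ *
+ *     hash_plugin *hplug = hash_plugin_by_id(R5_HASH_ID);
+ *     __u64 h = hplug->hash(name, len);
+ */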
57837+
57838+typedef struct cipher_plugin {
57839+ /* generic fields */
57840+ plugin_header h;
57841+ struct crypto_blkcipher * (*alloc) (void);
57842+ void (*free) (struct crypto_blkcipher * tfm);
57843+ /* Offset translator. For each offset this returns (k * offset), where
57844+ k (k >= 1) is an expansion factor of the cipher algorithm.
57845+ For all symmetric algorithms k == 1. For asymmetric algorithms (which
57846+ inflate data) offset translation guarantees that all disk cluster's
57847+ units will have keys smaller than the next cluster's ones.
57848+ */
57849+ loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src);
57850+ /* Cipher algorithms can accept data only by chunks of cipher block
57851+ size. This method is to align any flow up to cipher block size when
57852+ we pass it to cipher algorithm. To align means to append padding of
57853+ special format specific to the cipher algorithm */
57854+ int (*align_stream) (__u8 * tail, int clust_size, int blocksize);
57855+ /* low-level key manager (check, install, etc..) */
57856+ int (*setkey) (struct crypto_tfm * tfm, const __u8 * key,
57857+ unsigned int keylen);
57858+ /* main text processing procedures */
57859+ void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
57860+ void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
57861+} cipher_plugin;
57862+
57863+typedef struct digest_plugin {
57864+ /* generic fields */
57865+ plugin_header h;
57866+ /* fingerprint size in bytes */
57867+ int fipsize;
57868+ struct crypto_hash * (*alloc) (void);
57869+ void (*free) (struct crypto_hash * tfm);
57870+} digest_plugin;
57871+
57872+typedef struct compression_plugin {
57873+ /* generic fields */
57874+ plugin_header h;
57875+ int (*init) (void);
57876+ /* the maximum number of bytes by which the size of the "compressed"
57877+ * data can exceed that of the uncompressed data. */
57878+ int (*overrun) (unsigned src_len);
57879+ coa_t(*alloc) (tfm_action act);
57880+ void (*free) (coa_t coa, tfm_action act);
57881+ /* minimal size of the flow we still try to compress */
57882+ int (*min_size_deflate) (void);
57883+ __u32(*checksum) (char *data, __u32 length);
57884+ /* main transform procedures */
57885+ void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len,
57886+ __u8 * dst_first, unsigned *dst_len);
57887+ void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len,
57888+ __u8 * dst_first, unsigned *dst_len);
57889+} compression_plugin;
57890+
57891+typedef struct compression_mode_plugin {
57892+ /* generic fields */
57893+ plugin_header h;
57894+ /* this is called when estimating compressibility
57895+ of a logical cluster by its content */
57896+ int (*should_deflate) (struct inode * inode, cloff_t index);
57897+ /* this is called when results of compression should be saved */
57898+ int (*accept_hook) (struct inode * inode, cloff_t index);
57899+ /* this is called when results of compression should be discarded */
57900+ int (*discard_hook) (struct inode * inode, cloff_t index);
57901+} compression_mode_plugin;
57902+
57903+typedef struct cluster_plugin {
57904+ /* generic fields */
57905+ plugin_header h;
57906+ int shift;
57907+} cluster_plugin;
57908+
57909+typedef struct sd_ext_plugin {
57910+ /* generic fields */
57911+ plugin_header h;
57912+ int (*present) (struct inode * inode, char **area, int *len);
57913+ int (*absent) (struct inode * inode);
57914+ int (*save_len) (struct inode * inode);
57915+ int (*save) (struct inode * inode, char **area);
57916+ /* alignment requirement for this stat-data part */
57917+ int alignment;
57918+} sd_ext_plugin;
57919+
57920+/* this plugin contains methods to allocate objectid for newly created files,
57921+ to deallocate objectid when file gets removed, to report number of used and
57922+ free objectids */
57923+typedef struct oid_allocator_plugin {
57924+ /* generic fields */
57925+ plugin_header h;
57926+ int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
57927+ __u64 oids);
57928+ /* used to report statfs->f_files */
57929+ __u64(*oids_used) (reiser4_oid_allocator * map);
57930+ /* get next oid to use */
57931+ __u64(*next_oid) (reiser4_oid_allocator * map);
57932+ /* used to report statfs->f_ffree */
57933+ __u64(*oids_free) (reiser4_oid_allocator * map);
57934+ /* allocate new objectid */
57935+ int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
57936+ /* release objectid */
57937+ int (*release_oid) (reiser4_oid_allocator * map, oid_t);
57938+ /* how many pages to reserve in transaction for allocation of new
57939+ objectid */
57940+ int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
57941+ /* how many pages to reserve in transaction for freeing of an
57942+ objectid */
57943+ int (*oid_reserve_release) (reiser4_oid_allocator * map);
57944+ void (*print_info) (const char *, reiser4_oid_allocator *);
57945+} oid_allocator_plugin;
57946+
57947+/* disk layout plugin: this specifies super block, journal, bitmap (if there
57948+ are any) locations, etc */
57949+typedef struct disk_format_plugin {
57950+ /* generic fields */
57951+ plugin_header h;
57952+ /* replay journal, initialize super_info_data, etc */
57953+ int (*init_format) (struct super_block *, void *data);
57954+
57955+ /* key of root directory stat data */
57956+ const reiser4_key *(*root_dir_key) (const struct super_block *);
57957+
57958+ int (*release) (struct super_block *);
57959+ jnode *(*log_super) (struct super_block *);
57960+ int (*check_open) (const struct inode * object);
57961+ int (*version_update) (struct super_block *);
57962+} disk_format_plugin;
57963+
57964+struct jnode_plugin {
57965+ /* generic fields */
57966+ plugin_header h;
57967+ int (*init) (jnode * node);
57968+ int (*parse) (jnode * node);
57969+ struct address_space *(*mapping) (const jnode * node);
57970+ unsigned long (*index) (const jnode * node);
57971+ jnode *(*clone) (jnode * node);
57972+};
57973+
57974+/* plugin instance. */
57975+/* */
57976+/* This is "wrapper" union for all types of plugins. Most of the code uses */
57977+/* plugins of particular type (file_plugin, dir_plugin, etc.) rather than */
57978+/* operates with pointers to reiser4_plugin. This union is only used in */
57979+/* some generic code in plugin/plugin.c that operates on all */
57980+/* plugins. Technically speaking purpose of this union is to add type */
57981+/* safety to said generic code: each plugin type (file_plugin, for */
57982+/* example), contains plugin_header as its first memeber. This first member */
57983+/* is located at the same place in memory as .h member of */
57984+/* reiser4_plugin. Generic code, obtains pointer to reiser4_plugin and */
57985+/* looks in the .h which is header of plugin type located in union. This */
57986+/* allows to avoid type-casts. */
57987+union reiser4_plugin {
57988+ /* generic fields */
57989+ plugin_header h;
57990+ /* file plugin */
57991+ file_plugin file;
57992+ /* directory plugin */
57993+ dir_plugin dir;
57994+ /* hash plugin, used by directory plugin */
57995+ hash_plugin hash;
57996+ /* fibration plugin used by directory plugin */
57997+ fibration_plugin fibration;
57998+ /* cipher transform plugin, used by file plugin */
57999+ cipher_plugin cipher;
58000+ /* digest transform plugin, used by file plugin */
58001+ digest_plugin digest;
58002+ /* compression transform plugin, used by file plugin */
58003+ compression_plugin compression;
58004+ /* tail plugin, used by file plugin */
58005+ formatting_plugin formatting;
58006+ /* permission plugin */
58007+ perm_plugin perm;
58008+ /* node plugin */
58009+ node_plugin node;
58010+ /* item plugin */
58011+ item_plugin item;
58012+ /* stat-data extension plugin */
58013+ sd_ext_plugin sd_ext;
58014+ /* disk layout plugin */
58015+ disk_format_plugin format;
58016+ /* object id allocator plugin */
58017+ oid_allocator_plugin oid_allocator;
58018+ /* plugin for different jnode types */
58019+ jnode_plugin jnode;
58020+ /* compression mode plugin, used by object plugin */
58021+ compression_mode_plugin compression_mode;
58022+ /* cluster plugin, used by object plugin */
58023+ cluster_plugin clust;
58024+ /* place-holder for new plugin types that can be registered
58025+ dynamically, and used by other dynamically loaded plugins. */
58026+ void *generic;
58027+};
58028+
58029+struct reiser4_plugin_ops {
58030+ /* called when plugin is initialized */
58031+ int (*init) (reiser4_plugin * plugin);
58032+ /* called when plugin is unloaded */
58033+ int (*done) (reiser4_plugin * plugin);
58034+ /* load given plugin from disk */
58035+ int (*load) (struct inode * inode,
58036+ reiser4_plugin * plugin, char **area, int *len);
58037+ /* how much space is required to store this plugin's state
58038+ in stat-data */
58039+ int (*save_len) (struct inode * inode, reiser4_plugin * plugin);
58040+ /* save persistent plugin-data to disk */
58041+ int (*save) (struct inode * inode, reiser4_plugin * plugin,
58042+ char **area);
58043+ /* alignment requirement for on-disk state of this plugin
58044+ in number of bytes */
58045+ int alignment;
58046+ /* install itself into given inode. This can return error
58047+ (e.g., you cannot change hash of non-empty directory). */
58048+ int (*change) (struct inode * inode, reiser4_plugin * plugin,
58049+ pset_member memb);
58050+ /* inherit plugin state from the @parent inode into @inode.
58051+ This can return an error. */
58052+ int (*inherit) (struct inode * inode, struct inode * parent,
58053+ reiser4_plugin * plugin);
58054+};
58055+
58056+/* functions implemented in fs/reiser4/plugin/plugin.c */
58057+
58058+/* stores plugin reference in reiser4-specific part of inode */
58059+extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
58060+extern int init_plugins(void);
58061+
58062+/* builtin plugins */
58063+
58064+/* builtin hash-plugins */
58065+
58066+typedef enum {
58067+ RUPASOV_HASH_ID,
58068+ R5_HASH_ID,
58069+ TEA_HASH_ID,
58070+ FNV1_HASH_ID,
58071+ DEGENERATE_HASH_ID,
58072+ LAST_HASH_ID
58073+} reiser4_hash_id;
58074+
58075+/* builtin cipher plugins */
58076+
58077+typedef enum {
58078+ NONE_CIPHER_ID,
58079+ LAST_CIPHER_ID
58080+} reiser4_cipher_id;
58081+
58082+/* builtin digest plugins */
58083+
58084+typedef enum {
58085+ SHA256_32_DIGEST_ID,
58086+ LAST_DIGEST_ID
58087+} reiser4_digest_id;
58088+
58089+/* builtin compression mode plugins */
58090+typedef enum {
58091+ NONE_COMPRESSION_MODE_ID,
58092+ LATTD_COMPRESSION_MODE_ID,
58093+ ULTIM_COMPRESSION_MODE_ID,
58094+ FORCE_COMPRESSION_MODE_ID,
58095+ CONVX_COMPRESSION_MODE_ID,
58096+ LAST_COMPRESSION_MODE_ID
58097+} reiser4_compression_mode_id;
58098+
58099+/* builtin cluster plugins */
58100+typedef enum {
58101+ CLUSTER_64K_ID,
58102+ CLUSTER_32K_ID,
58103+ CLUSTER_16K_ID,
58104+ CLUSTER_8K_ID,
58105+ CLUSTER_4K_ID,
58106+ LAST_CLUSTER_ID
58107+} reiser4_cluster_id;
58108+
58109+/* builtin tail-plugins */
58110+
58111+typedef enum {
58112+ NEVER_TAILS_FORMATTING_ID,
58113+ ALWAYS_TAILS_FORMATTING_ID,
58114+ SMALL_FILE_FORMATTING_ID,
58115+ LAST_TAIL_FORMATTING_ID
58116+} reiser4_formatting_id;
58117+
58118+/* data type used to pack parameters that we pass to vfs object creation
58119+ function create_object() */
58120+struct reiser4_object_create_data {
58121+ /* plugin to control created object */
58122+ reiser4_file_id id;
58123+ /* mode of regular file, directory or special file */
58124+/* what happens if some other sort of perm plugin is in use? */
58125+ int mode;
58126+ /* rdev of special file */
58127+ dev_t rdev;
58128+ /* symlink target */
58129+ const char *name;
58130+ /* add here something for non-standard objects you invent, like
58131+ query for interpolation file etc. */
58132+
58133+ struct reiser4_crypto_info * crypto;
58134+
58135+ struct inode *parent;
58136+ struct dentry *dentry;
58137+};
58138+
58139+/* description of directory entry being created/destroyed/sought for
58140+
58141+ It is passed down to the directory plugin and further to the
58142+ directory item plugin methods. Creation of a new directory entry is
58143+ done in several stages: first we search for an entry with the same
58144+ name, then create a new one. reiser4_dir_entry_desc is used to store
58145+ some information collected at some stage of this process and required
58146+ later: the key of the item that we want to insert/delete and a pointer
58147+ to an object that will be bound by the new directory entry. Probably
58148+ some more fields will be added there.
58149+
58150+*/
58151+struct reiser4_dir_entry_desc {
58152+ /* key of directory entry */
58153+ reiser4_key key;
58154+ /* object bound by this entry. */
58155+ struct inode *obj;
58156+};
58157+
58158+#define MAX_PLUGIN_TYPE_LABEL_LEN 32
58159+#define MAX_PLUGIN_PLUG_LABEL_LEN 32
58160+
58161+#define PLUGIN_BY_ID(TYPE,ID,FIELD) \
58162+static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \
58163+{ \
58164+ reiser4_plugin *plugin = plugin_by_id ( ID, id ); \
58165+ return plugin ? & plugin -> FIELD : NULL; \
58166+} \
58167+static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \
58168+{ \
58169+ reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \
58170+ return plugin ? & plugin -> FIELD : NULL; \
58171+} \
58172+static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \
58173+{ \
58174+ reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \
58175+ return plugin ? & plugin -> FIELD : NULL; \
58176+} \
58177+static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \
58178+{ \
58179+ return ( reiser4_plugin * ) plugin; \
58180+} \
58181+static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \
58182+{ \
58183+ return TYPE ## _to_plugin (plugin) -> h.id; \
58184+} \
58185+typedef struct { int foo; } TYPE ## _plugin_dummy
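+/*
+ * For illustration, PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE,
+ * hash) below expands to the following inline helpers:
+ *
+ *     hash_plugin *hash_plugin_by_id(reiser4_plugin_id id);
+ *     hash_plugin *hash_plugin_by_disk_id(reiser4_tree *tree, d16 *id);
+ *     hash_plugin *hash_plugin_by_unsafe_id(reiser4_plugin_id id);
+ *     reiser4_plugin *hash_plugin_to_plugin(hash_plugin *plugin);
+ *     reiser4_plugin_id hash_plugin_id(hash_plugin *plugin);
+ */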
58186+
58187+PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
58188+PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
58189+PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
58190+PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
58191+PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
58192+PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
58193+PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
58194+PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
58195+PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
58196+PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
58197+PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
58198+PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
58199+PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
58200+PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
58201+PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
58202+ compression_mode);
58203+PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
58204+
58205+extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
58206+
58207+extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
58208+
58209+#define for_all_plugins(ptype, plugin) \
58210+for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \
58211+ get_plugin_list(ptype) != &plugin->h.linkage; \
58212+ plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
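+/*
+ * Usage sketch: walk every registered plugin of one type and print its
+ * label:
+ *
+ *     reiser4_plugin *plug;
+ *
+ *     for_all_plugins(REISER4_HASH_PLUGIN_TYPE, plug)
+ *             printk("%s\n", plug->h.label);
+ */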
58213+
58214+
58215+extern int grab_plugin_pset(struct inode *self, struct inode *ancestor, pset_member memb);
58216+extern int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin *plug);
58217+extern int finish_pset(struct inode *inode);
58218+
58219+/* defined in fs/reiser4/plugin/object.c */
58220+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
58221+/* defined in fs/reiser4/plugin/object.c */
58222+extern dir_plugin dir_plugins[LAST_DIR_ID];
58223+/* defined in fs/reiser4/plugin/item/static_stat.c */
58224+extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
58225+/* defined in fs/reiser4/plugin/hash.c */
58226+extern hash_plugin hash_plugins[LAST_HASH_ID];
58227+/* defined in fs/reiser4/plugin/fibration.c */
58228+extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
58229+/* defined in fs/reiser4/plugin/crypt.c */
58230+extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
58231+/* defined in fs/reiser4/plugin/digest.c */
58232+extern digest_plugin digest_plugins[LAST_DIGEST_ID];
58233+/* defined in fs/reiser4/plugin/compress/compress.c */
58234+extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
58235+/* defined in fs/reiser4/plugin/compress/compression_mode.c */
58236+extern compression_mode_plugin
58237+compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
58238+/* defined in fs/reiser4/plugin/cluster.c */
58239+extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
58240+/* defined in fs/reiser4/plugin/tail.c */
58241+extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
58242+/* defined in fs/reiser4/plugin/security/security.c */
58243+extern perm_plugin perm_plugins[LAST_PERM_ID];
58244+/* defined in fs/reiser4/plugin/item/item.c */
58245+extern item_plugin item_plugins[LAST_ITEM_ID];
58246+/* defined in fs/reiser4/plugin/node/node.c */
58247+extern node_plugin node_plugins[LAST_NODE_ID];
58248+/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
58249+extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
58250+
58251+/* __FS_REISER4_PLUGIN_TYPES_H__ */
58252+#endif
58253+
58254+/* Make Linus happy.
58255+ Local variables:
58256+ c-indentation-style: "K&R"
58257+ mode-name: "LC"
58258+ c-basic-offset: 8
58259+ tab-width: 8
58260+ fill-column: 120
58261+ End:
58262+*/
58263diff -urN linux-2.6.23.orig/fs/reiser4/plugin/plugin_header.h linux-2.6.23/fs/reiser4/plugin/plugin_header.h
58264--- linux-2.6.23.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 03:00:00.000000000 +0300
58265+++ linux-2.6.23/fs/reiser4/plugin/plugin_header.h 2007-12-04 16:49:30.000000000 +0300
58266@@ -0,0 +1,155 @@
58267+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58268+
58269+/* plugin header. Data structures required by all plugin types. */
58270+
58271+#if !defined( __PLUGIN_HEADER_H__ )
58272+#define __PLUGIN_HEADER_H__
58273+
58274+/* plugin data-types and constants */
58275+
58276+#include "../debug.h"
58277+#include "../dformat.h"
58278+
58279+/* Every plugin type can be considered as a class of virtual objects
58280+ {(type, i) | i = 0, 1, ...}, which has one of the following categories
58281+ of virtualization:
58282+ A - no virtualization;
58283+ F - per-file virtualization;
58284+ S - per-superblock virtualization;
58285+ FIXME-EDWARD: Define every such category */
58286+
58287+/* Supported plugin types: (id, (virtualization category), short description) */
58288+typedef enum {
58289+ REISER4_FILE_PLUGIN_TYPE, /* (F) service VFS entry-points */
58290+ REISER4_DIR_PLUGIN_TYPE, /* (F) service VFS entry-points */
58291+ REISER4_ITEM_PLUGIN_TYPE, /* (F) manage items */
58292+ REISER4_NODE_PLUGIN_TYPE, /* (S) manage formatted nodes */
58293+ REISER4_HASH_PLUGIN_TYPE, /* (F) compute hash */
58294+ REISER4_FIBRATION_PLUGIN_TYPE, /* (F) directory fibrations */
58295+ REISER4_FORMATTING_PLUGIN_TYPE, /* (F) tail-packing policy */
58296+ REISER4_PERM_PLUGIN_TYPE, /* stub (vacancy) */
58297+ REISER4_SD_EXT_PLUGIN_TYPE, /* (A) stat-data extensions */
58298+ REISER4_FORMAT_PLUGIN_TYPE, /* (S) specify disk format */
58299+ REISER4_JNODE_PLUGIN_TYPE, /* (A) in-memory node headers */
58300+ REISER4_CIPHER_PLUGIN_TYPE, /* (F) cipher transform algs */
58301+ REISER4_DIGEST_PLUGIN_TYPE, /* (F) digest transform algs */
58302+ REISER4_COMPRESSION_PLUGIN_TYPE, /* (F) compression tfm algs */
58303+ REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* (F) compression heuristic */
58304+ REISER4_CLUSTER_PLUGIN_TYPE, /* (F) size of logical cluster */
58305+ REISER4_PLUGIN_TYPES
58306+} reiser4_plugin_type;
58307+
58308+/* Supported plugin groups */
58309+typedef enum {
58310+ REISER4_DIRECTORY_FILE,
58311+ REISER4_REGULAR_FILE,
58312+ REISER4_SYMLINK_FILE,
58313+ REISER4_SPECIAL_FILE,
58314+} file_plugin_group;
58315+
58316+struct reiser4_plugin_ops;
58317+/* generic plugin operations, supported by each
58318+ plugin type. */
58319+typedef struct reiser4_plugin_ops reiser4_plugin_ops;
58320+
58321+/* the common part of all plugin instances. */
58322+typedef struct plugin_header {
58323+ /* plugin type */
58324+ reiser4_plugin_type type_id;
58325+ /* id of this plugin */
58326+ reiser4_plugin_id id;
58327+ /* bitmask of groups the plugin belongs to. */
58328+ reiser4_plugin_groups groups;
58329+ /* plugin operations */
58330+ reiser4_plugin_ops *pops;
58331+/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */
58332+ /* short label of this plugin */
58333+ const char *label;
58334+ /* descriptive string. */
58335+ const char *desc;
58336+ /* list linkage */
58337+ struct list_head linkage;
58338+} plugin_header;
58339+
58340+#define plugin_of_group(plug, group) (plug->h.groups & (1 << group))
58341+
58342+/* PRIVATE INTERFACES */
58343+/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */
58344+/* plugin type representation. */
58345+struct reiser4_plugin_type_data {
58346+ /* internal plugin type identifier. Should coincide with
58347+ index of this item in plugins[] array. */
58348+ reiser4_plugin_type type_id;
58349+ /* short symbolic label of this plugin type. Should be no longer
58350+ than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
58351+ const char *label;
58352+ /* plugin type description longer than .label */
58353+ const char *desc;
58354+
58355+/* NIKITA-FIXME-HANS: define built-in */
58356+ /* number of built-in plugin instances of this type */
58357+ int builtin_num;
58358+ /* array of built-in plugins */
58359+ void *builtin;
58360+ struct list_head plugins_list;
58361+ size_t size;
58362+};
58363+
58364+extern struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
58365+
58366+int is_plugin_type_valid(reiser4_plugin_type type);
58367+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
58368+
58369+static inline reiser4_plugin *plugin_at(struct reiser4_plugin_type_data * ptype,
58370+ int i)
58371+{
58372+ char *builtin;
58373+
58374+ builtin = ptype->builtin;
58375+ return (reiser4_plugin *) (builtin + i * ptype->size);
58376+}
58377+
58378+/* return plugin by its @type_id and @id */
58379+static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
58380+ reiser4_plugin_id id)
58381+{
58382+ assert("nikita-1651", is_plugin_type_valid(type));
58383+ assert("nikita-1652", is_plugin_id_valid(type, id));
58384+ return plugin_at(&plugins[type], id);
58385+}
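A usage sketch (illustrative, not part of the patch; assumes id 0 names a built-in hash plugin):

	reiser4_plugin *plug = plugin_by_id(REISER4_HASH_PLUGIN_TYPE, 0);
	const char *label = plug->h.label;	/* short label from plugin_header */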
58386+
58387+extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
58388+ reiser4_plugin_id id);
58389+
58390+/**
58391+ * plugin_by_disk_id - get reiser4_plugin
58392+ * @type_id: plugin type id
58393+ * @did: plugin id in disk format
58394+ *
58395+ * Returns reiser4_plugin by plugin type id and plugin id.
58396+ */
58397+static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
58398+ reiser4_plugin_type type_id,
58399+ __le16 *plugin_id)
58400+{
58401+ /*
58402+ * what we should do properly is to maintain within each file-system a
58403+ * dictionary that maps on-disk plugin ids to "universal" ids. This
58404+ * dictionary will be resolved at mount time, so that this function
58405+ * will perform just one additional array lookup.
58406+ */
58407+ return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
58408+}
58409+
58410+/* __PLUGIN_HEADER_H__ */
58411+#endif
58412+
58413+/*
58414+ * Local variables:
58415+ * c-indentation-style: "K&R"
58416+ * mode-name: "LC"
58417+ * c-basic-offset: 8
58418+ * tab-width: 8
58419+ * fill-column: 79
58420+ * End:
58421+ */
58422diff -urN linux-2.6.23.orig/fs/reiser4/plugin/plugin_set.c linux-2.6.23/fs/reiser4/plugin/plugin_set.c
58423--- linux-2.6.23.orig/fs/reiser4/plugin/plugin_set.c 1970-01-01 03:00:00.000000000 +0300
58424+++ linux-2.6.23/fs/reiser4/plugin/plugin_set.c 2007-12-04 16:49:30.000000000 +0300
58425@@ -0,0 +1,379 @@
58426+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58427+ * reiser4/README */
58428+/* This file contains Reiser4 plugin set operations */
58429+
58430+/* plugin sets
58431+ *
58432+ * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
58433+ * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
58434+ * assigned (inherited, deduced from mode bits, etc.) at creation time. This
58435+ * set of plugins (so called pset) is described by structure plugin_set (see
58436+ * plugin/plugin_set.h), which contains pointers to all required plugins.
58437+ *
58438+ * Children can inherit some pset members from their parent, but sometimes
58439+ * it is useful to specify members different from the parent's. Since an
58440+ * object's pset cannot be easily changed without fatal consequences, we use
58441+ * for this purpose another special plugin table (the so-called hset, or heir
58442+ * set) described by the same structure.
58443+ *
58444+ * An inode only stores pointers to its pset and hset. Different inodes with
58445+ * the same set of pset (hset) members point to the same pset (hset). This is
58446+ * achieved by storing psets and hsets in a global hash table. Races are
58447+ * avoided by the simple (and so far efficient) solution of never recycling
58448+ * psets, even when the last inode pointing to one is destroyed.
58449+ */
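A minimal usage sketch of the scheme just described (illustrative, not part of the patch; hplug stands for a previously looked-up hash plugin):

	plugin_set *pset = plugin_set_get_empty();
	int err;

	/* set_plugin() redirects pset to the interned plugin_set that
	 * differs from the empty set only in the PSET_HASH slot; the old
	 * set is left untouched and is never freed. */
	err = set_plugin(&pset, PSET_HASH, hash_plugin_to_plugin(hplug));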
58450+
58451+#include "../debug.h"
58452+#include "../super.h"
58453+#include "plugin_set.h"
58454+
58455+#include <linux/slab.h>
58456+#include <linux/stddef.h>
58457+
58458+/* slab for plugin sets */
58459+static struct kmem_cache *plugin_set_slab;
58460+
58461+static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
58462+ [0 ... 7] = SPIN_LOCK_UNLOCKED
58463+};
58464+
58465+/* hash table support */
58466+
58467+#define PS_TABLE_SIZE (32)
58468+
58469+static inline plugin_set *cast_to(const unsigned long *a)
58470+{
58471+ return container_of(a, plugin_set, hashval);
58472+}
58473+
58474+static inline int pseq(const unsigned long *a1, const unsigned long *a2)
58475+{
58476+ plugin_set *set1;
58477+ plugin_set *set2;
58478+
58479+ /* make sure fields are not missed in the code below */
58480+ cassert(sizeof *set1 ==
58481+ sizeof set1->hashval +
58482+ sizeof set1->link +
58483+ sizeof set1->file +
58484+ sizeof set1->dir +
58485+ sizeof set1->perm +
58486+ sizeof set1->formatting +
58487+ sizeof set1->hash +
58488+ sizeof set1->fibration +
58489+ sizeof set1->sd +
58490+ sizeof set1->dir_item +
58491+ sizeof set1->cipher +
58492+ sizeof set1->digest +
58493+ sizeof set1->compression +
58494+ sizeof set1->compression_mode +
58495+ sizeof set1->cluster +
58496+ sizeof set1->create);
58497+
58498+ set1 = cast_to(a1);
58499+ set2 = cast_to(a2);
58500+ return
58501+ set1->hashval == set2->hashval &&
58502+ set1->file == set2->file &&
58503+ set1->dir == set2->dir &&
58504+ set1->perm == set2->perm &&
58505+ set1->formatting == set2->formatting &&
58506+ set1->hash == set2->hash &&
58507+ set1->fibration == set2->fibration &&
58508+ set1->sd == set2->sd &&
58509+ set1->dir_item == set2->dir_item &&
58510+ set1->cipher == set2->cipher &&
58511+ set1->digest == set2->digest &&
58512+ set1->compression == set2->compression &&
58513+ set1->compression_mode == set2->compression_mode &&
58514+ set1->cluster == set2->cluster &&
58515+ set1->create == set2->create;
58516+}
58517+
58518+#define HASH_FIELD(hash, set, field) \
58519+({ \
58520+ (hash) += (unsigned long)(set)->field >> 2; \
58521+})
58522+
58523+static inline unsigned long calculate_hash(const plugin_set * set)
58524+{
58525+ unsigned long result;
58526+
58527+ result = 0;
58528+ HASH_FIELD(result, set, file);
58529+ HASH_FIELD(result, set, dir);
58530+ HASH_FIELD(result, set, perm);
58531+ HASH_FIELD(result, set, formatting);
58532+ HASH_FIELD(result, set, hash);
58533+ HASH_FIELD(result, set, fibration);
58534+ HASH_FIELD(result, set, sd);
58535+ HASH_FIELD(result, set, dir_item);
58536+ HASH_FIELD(result, set, cipher);
58537+ HASH_FIELD(result, set, digest);
58538+ HASH_FIELD(result, set, compression);
58539+ HASH_FIELD(result, set, compression_mode);
58540+ HASH_FIELD(result, set, cluster);
58541+ HASH_FIELD(result, set, create);
58542+ return result & (PS_TABLE_SIZE - 1);
58543+}
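The >> 2 in HASH_FIELD discards low pointer bits that are always zero for aligned plugin structures, and the final mask folds the sum into one of the PS_TABLE_SIZE buckets. A reduced sketch of the same computation for just two members (illustrative, not part of the patch):

static unsigned long hash_two_members(void *file_memb, void *dir_memb)
{
	unsigned long h = 0;

	h += (unsigned long)file_memb >> 2;	/* as HASH_FIELD does */
	h += (unsigned long)dir_memb >> 2;
	return h & (PS_TABLE_SIZE - 1);		/* bucket index in ps_table */
}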
58544+
58545+static inline unsigned long
58546+pshash(ps_hash_table * table, const unsigned long *a)
58547+{
58548+ return *a;
58549+}
58550+
58551+/* The hash table definition */
58552+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
58553+#define KFREE(ptr, size) kfree(ptr)
58554+TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
58555+ pseq);
58556+#undef KFREE
58557+#undef KMALLOC
58558+
58559+static ps_hash_table ps_table;
58560+static plugin_set empty_set = {
58561+ .hashval = 0,
58562+ .file = NULL,
58563+ .dir = NULL,
58564+ .perm = NULL,
58565+ .formatting = NULL,
58566+ .hash = NULL,
58567+ .fibration = NULL,
58568+ .sd = NULL,
58569+ .dir_item = NULL,
58570+ .cipher = NULL,
58571+ .digest = NULL,
58572+ .compression = NULL,
58573+ .compression_mode = NULL,
58574+ .cluster = NULL,
58575+ .create = NULL,
58576+ .link = {NULL}
58577+};
58578+
58579+plugin_set *plugin_set_get_empty(void)
58580+{
58581+ return &empty_set;
58582+}
58583+
58584+void plugin_set_put(plugin_set * set)
58585+{
58586+}
58587+
58588+static inline unsigned long *pset_field(plugin_set * set, int offset)
58589+{
58590+ return (unsigned long *)(((char *)set) + offset);
58591+}
58592+
58593+static int plugin_set_field(plugin_set ** set, const unsigned long val,
58594+ const int offset)
58595+{
58596+ unsigned long *spot;
58597+ spinlock_t *lock;
58598+ plugin_set replica;
58599+ plugin_set *twin;
58600+ plugin_set *psal;
58601+ plugin_set *orig;
58602+
58603+ assert("nikita-2902", set != NULL);
58604+ assert("nikita-2904", *set != NULL);
58605+
58606+ spot = pset_field(*set, offset);
58607+ if (unlikely(*spot == val))
58608+ return 0;
58609+
58610+ replica = *(orig = *set);
58611+ *pset_field(&replica, offset) = val;
58612+ replica.hashval = calculate_hash(&replica);
58613+ rcu_read_lock();
58614+ twin = ps_hash_find(&ps_table, &replica.hashval);
58615+ if (unlikely(twin == NULL)) {
58616+ rcu_read_unlock();
58617+ psal = kmem_cache_alloc(plugin_set_slab,
58618+ reiser4_ctx_gfp_mask_get());
58619+ if (psal == NULL)
58620+ return RETERR(-ENOMEM);
58621+ *psal = replica;
58622+ lock = &plugin_set_lock[replica.hashval & 7];
58623+ spin_lock(lock);
58624+ twin = ps_hash_find(&ps_table, &replica.hashval);
58625+ if (likely(twin == NULL)) {
58626+ *set = psal;
58627+ ps_hash_insert_rcu(&ps_table, psal);
58628+ } else {
58629+ *set = twin;
58630+ kmem_cache_free(plugin_set_slab, psal);
58631+ }
58632+ spin_unlock(lock);
58633+ } else {
58634+ rcu_read_unlock();
58635+ *set = twin;
58636+ }
58637+ return 0;
58638+}
58639+
58640+static struct {
58641+ int offset;
58642+ reiser4_plugin_groups groups;
58643+ reiser4_plugin_type type;
58644+} pset_descr[PSET_LAST] = {
58645+ [PSET_FILE] = {
58646+ .offset = offsetof(plugin_set, file),
58647+ .type = REISER4_FILE_PLUGIN_TYPE,
58648+ .groups = 0
58649+ },
58650+ [PSET_DIR] = {
58651+ .offset = offsetof(plugin_set, dir),
58652+ .type = REISER4_DIR_PLUGIN_TYPE,
58653+ .groups = 0
58654+ },
58655+ [PSET_PERM] = {
58656+ .offset = offsetof(plugin_set, perm),
58657+ .type = REISER4_PERM_PLUGIN_TYPE,
58658+ .groups = 0
58659+ },
58660+ [PSET_FORMATTING] = {
58661+ .offset = offsetof(plugin_set, formatting),
58662+ .type = REISER4_FORMATTING_PLUGIN_TYPE,
58663+ .groups = 0
58664+ },
58665+ [PSET_HASH] = {
58666+ .offset = offsetof(plugin_set, hash),
58667+ .type = REISER4_HASH_PLUGIN_TYPE,
58668+ .groups = 0
58669+ },
58670+ [PSET_FIBRATION] = {
58671+ .offset = offsetof(plugin_set, fibration),
58672+ .type = REISER4_FIBRATION_PLUGIN_TYPE,
58673+ .groups = 0
58674+ },
58675+ [PSET_SD] = {
58676+ .offset = offsetof(plugin_set, sd),
58677+ .type = REISER4_ITEM_PLUGIN_TYPE,
58678+ .groups = (1 << STAT_DATA_ITEM_TYPE)
58679+ },
58680+ [PSET_DIR_ITEM] = {
58681+ .offset = offsetof(plugin_set, dir_item),
58682+ .type = REISER4_ITEM_PLUGIN_TYPE,
58683+ .groups = (1 << DIR_ENTRY_ITEM_TYPE)
58684+ },
58685+ [PSET_CIPHER] = {
58686+ .offset = offsetof(plugin_set, cipher),
58687+ .type = REISER4_CIPHER_PLUGIN_TYPE,
58688+ .groups = 0
58689+ },
58690+ [PSET_DIGEST] = {
58691+ .offset = offsetof(plugin_set, digest),
58692+ .type = REISER4_DIGEST_PLUGIN_TYPE,
58693+ .groups = 0
58694+ },
58695+ [PSET_COMPRESSION] = {
58696+ .offset = offsetof(plugin_set, compression),
58697+ .type = REISER4_COMPRESSION_PLUGIN_TYPE,
58698+ .groups = 0
58699+ },
58700+ [PSET_COMPRESSION_MODE] = {
58701+ .offset = offsetof(plugin_set, compression_mode),
58702+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
58703+ .groups = 0
58704+ },
58705+ [PSET_CLUSTER] = {
58706+ .offset = offsetof(plugin_set, cluster),
58707+ .type = REISER4_CLUSTER_PLUGIN_TYPE,
58708+ .groups = 0
58709+ },
58710+ [PSET_CREATE] = {
58711+ .offset = offsetof(plugin_set, create),
58712+ .type = REISER4_FILE_PLUGIN_TYPE,
58713+ .groups = (1 << REISER4_REGULAR_FILE)
58714+ }
58715+};
58716+
58717+#define DEFINE_PSET_OPS(PREFIX) \
58718+ reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \
58719+{ \
58720+ if (memb > PSET_LAST) \
58721+ return REISER4_PLUGIN_TYPES; \
58722+ return pset_descr[memb].type; \
58723+} \
58724+ \
58725+int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb, \
58726+ reiser4_plugin * plugin) \
58727+{ \
58728+ assert("nikita-3492", set != NULL); \
58729+ assert("nikita-3493", *set != NULL); \
58730+ assert("nikita-3494", plugin != NULL); \
58731+ assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \
58732+ assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \
58733+ \
58734+ if (pset_descr[memb].groups) \
58735+ if (!(pset_descr[memb].groups & plugin->h.groups)) \
58736+ return -EINVAL; \
58737+ \
58738+ return plugin_set_field(set, \
58739+ (unsigned long)plugin, pset_descr[memb].offset); \
58740+} \
58741+ \
58742+reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \
58743+{ \
58744+ assert("nikita-3497", set != NULL); \
58745+ assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \
58746+ \
58747+ return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \
58748+}
58749+
58750+DEFINE_PSET_OPS(aset);
58751+
58752+int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin) {
58753+ return plugin_set_field(set,
58754+ (unsigned long)plugin, pset_descr[memb].offset);
58755+}
58756+
58757+/**
58758+ * init_plugin_set - create plugin set cache and hash table
58759+ *
58760+ * Initializes slab cache of plugin_set-s and their hash table. It is part of
58761+ * reiser4 module initialization.
58762+ */
58763+int init_plugin_set(void)
58764+{
58765+ int result;
58766+
58767+ result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
58768+ if (result == 0) {
58769+ plugin_set_slab = kmem_cache_create("plugin_set",
58770+ sizeof(plugin_set), 0,
58771+ SLAB_HWCACHE_ALIGN,
58772+ NULL);
58773+ if (plugin_set_slab == NULL)
58774+ result = RETERR(-ENOMEM);
58775+ }
58776+ return result;
58777+}
58778+
58779+/**
58780+ * done_plugin_set - delete plugin_set cache and plugin_set hash table
58781+ *
58782+ * This is called on reiser4 module unloading or system shutdown.
58783+ */
58784+void done_plugin_set(void)
58785+{
58786+ plugin_set *cur, *next;
58787+
58788+ for_all_in_htable(&ps_table, ps, cur, next) {
58789+ ps_hash_remove(&ps_table, cur);
58790+ kmem_cache_free(plugin_set_slab, cur);
58791+ }
58792+ destroy_reiser4_cache(&plugin_set_slab);
58793+ ps_hash_done(&ps_table);
58794+}
58795+
58796+/*
58797+ * Local variables:
58798+ * c-indentation-style: "K&R"
58799+ * mode-name: "LC"
58800+ * c-basic-offset: 8
58801+ * tab-width: 8
58802+ * fill-column: 120
58803+ * End:
58804+ */
58805diff -urN linux-2.6.23.orig/fs/reiser4/plugin/plugin_set.h linux-2.6.23/fs/reiser4/plugin/plugin_set.h
58806--- linux-2.6.23.orig/fs/reiser4/plugin/plugin_set.h 1970-01-01 03:00:00.000000000 +0300
58807+++ linux-2.6.23/fs/reiser4/plugin/plugin_set.h 2007-12-04 16:49:30.000000000 +0300
58808@@ -0,0 +1,77 @@
58809+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58810+
58811+/* Reiser4 plugin set definition.
58812+ See fs/reiser4/plugin/plugin_set.c for details */
58813+
58814+#if !defined( __PLUGIN_SET_H__ )
58815+#define __PLUGIN_SET_H__
58816+
58817+#include "../type_safe_hash.h"
58818+#include "plugin.h"
58819+
58820+#include <linux/rcupdate.h>
58821+
58822+struct plugin_set;
58823+typedef struct plugin_set plugin_set;
58824+
58825+TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
58826+
58827+struct plugin_set {
58828+ unsigned long hashval;
58829+ /* plugin of file */
58830+ file_plugin *file;
58831+ /* plugin of dir */
58832+ dir_plugin *dir;
58833+ /* perm plugin for this file */
58834+ perm_plugin *perm;
58835+ /* tail policy plugin. Only meaningful for regular files */
58836+ formatting_plugin *formatting;
58837+ /* hash plugin. Only meaningful for directories. */
58838+ hash_plugin *hash;
58839+ /* fibration plugin. Only meaningful for directories. */
58840+ fibration_plugin *fibration;
58841+ /* plugin of stat-data */
58842+ item_plugin *sd;
58843+ /* plugin of items a directory is built of */
58844+ item_plugin *dir_item;
58845+ /* cipher plugin */
58846+ cipher_plugin *cipher;
58847+ /* digest plugin */
58848+ digest_plugin *digest;
58849+ /* compression plugin */
58850+ compression_plugin *compression;
58851+ /* compression mode plugin */
58852+ compression_mode_plugin *compression_mode;
58853+ /* cluster plugin */
58854+ cluster_plugin *cluster;
58855+ /* this specifies file plugin of regular children.
58856+ only meaningful for directories */
58857+ file_plugin *create;
58858+ ps_hash_link link;
58859+};
58860+
58861+extern plugin_set *plugin_set_get_empty(void);
58862+extern void plugin_set_put(plugin_set * set);
58863+
58864+extern int init_plugin_set(void);
58865+extern void done_plugin_set(void);
58866+
58867+extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb);
58868+extern int set_plugin(plugin_set ** set, pset_member memb,
58869+ reiser4_plugin * plugin);
58870+extern int aset_set_unsafe(plugin_set ** set, pset_member memb,
58871+ reiser4_plugin * plugin);
58872+extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
58873+
58874+/* __PLUGIN_SET_H__ */
58875+#endif
58876+
58877+/* Make Linus happy.
58878+ Local variables:
58879+ c-indentation-style: "K&R"
58880+ mode-name: "LC"
58881+ c-basic-offset: 8
58882+ tab-width: 8
58883+ fill-column: 120
58884+ End:
58885+*/
58886diff -urN linux-2.6.23.orig/fs/reiser4/plugin/security/Makefile linux-2.6.23/fs/reiser4/plugin/security/Makefile
58887--- linux-2.6.23.orig/fs/reiser4/plugin/security/Makefile 1970-01-01 03:00:00.000000000 +0300
58888+++ linux-2.6.23/fs/reiser4/plugin/security/Makefile 2007-12-04 16:49:30.000000000 +0300
58889@@ -0,0 +1,4 @@
58890+obj-$(CONFIG_REISER4_FS) += security_plugins.o
58891+
58892+security_plugins-objs := \
58893+ perm.o
58894diff -urN linux-2.6.23.orig/fs/reiser4/plugin/security/perm.c linux-2.6.23/fs/reiser4/plugin/security/perm.c
58895--- linux-2.6.23.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 03:00:00.000000000 +0300
58896+++ linux-2.6.23/fs/reiser4/plugin/security/perm.c 2007-12-04 16:49:30.000000000 +0300
58897@@ -0,0 +1,33 @@
58898+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58899+
58900+/*
58901+ * This file contains implementation of permission plugins.
58902+ * See the comments in perm.h
58903+ */
58904+
58905+#include "../plugin.h"
58906+#include "../plugin_header.h"
58907+#include "../../debug.h"
58908+
58909+perm_plugin perm_plugins[LAST_PERM_ID] = {
58910+ [NULL_PERM_ID] = {
58911+ .h = {
58912+ .type_id = REISER4_PERM_PLUGIN_TYPE,
58913+ .id = NULL_PERM_ID,
58914+ .pops = NULL,
58915+ .label = "null",
58916+ .desc = "stub permission plugin",
58917+ .linkage = {NULL, NULL}
58918+ }
58919+ }
58920+};
58921+
58922+/*
58923+ * Local variables:
58924+ * c-indentation-style: "K&R"
58925+ * mode-name: "LC"
58926+ * c-basic-offset: 8
58927+ * tab-width: 8
58928+ * fill-column: 79
58929+ * End:
58930+ */
58931diff -urN linux-2.6.23.orig/fs/reiser4/plugin/security/perm.h linux-2.6.23/fs/reiser4/plugin/security/perm.h
58932--- linux-2.6.23.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 03:00:00.000000000 +0300
58933+++ linux-2.6.23/fs/reiser4/plugin/security/perm.h 2007-12-04 16:49:30.000000000 +0300
58934@@ -0,0 +1,38 @@
58935+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58936+
58937+/* Perm (short for "permissions") plugins common stuff. */
58938+
58939+#if !defined( __REISER4_PERM_H__ )
58940+#define __REISER4_PERM_H__
58941+
58942+#include "../../forward.h"
58943+#include "../plugin_header.h"
58944+
58945+#include <linux/types.h>
58946+
58947+/* Definition of permission plugin */
58948+/* NIKITA-FIXME-HANS: define what this is targeted for.
58949+ It does not seem to be intended for use with sys_reiser4. Explain. */
58950+
58951+/* NOTE-EDWARD: This seems to be intended for the deprecated sys_reiser4.
58952+ Consider it a temporary "seam" and a reserved pset member.
58953+ If you have something useful to add, rename this plugin and add it here */
58954+typedef struct perm_plugin {
58955+ /* generic plugin fields */
58956+ plugin_header h;
58957+} perm_plugin;
58958+
58959+typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
58960+
58961+/* __REISER4_PERM_H__ */
58962+#endif
58963+
58964+/* Make Linus happy.
58965+ Local variables:
58966+ c-indentation-style: "K&R"
58967+ mode-name: "LC"
58968+ c-basic-offset: 8
58969+ tab-width: 8
58970+ fill-column: 120
58971+ End:
58972+*/
58973diff -urN linux-2.6.23.orig/fs/reiser4/plugin/space/bitmap.c linux-2.6.23/fs/reiser4/plugin/space/bitmap.c
58974--- linux-2.6.23.orig/fs/reiser4/plugin/space/bitmap.c 1970-01-01 03:00:00.000000000 +0300
58975+++ linux-2.6.23/fs/reiser4/plugin/space/bitmap.c 2007-12-04 16:49:30.000000000 +0300
58976@@ -0,0 +1,1585 @@
58977+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58978+
58979+#include "../../debug.h"
58980+#include "../../dformat.h"
58981+#include "../../txnmgr.h"
58982+#include "../../jnode.h"
58983+#include "../../block_alloc.h"
58984+#include "../../tree.h"
58985+#include "../../super.h"
58986+#include "../plugin.h"
58987+#include "space_allocator.h"
58988+#include "bitmap.h"
58989+
58990+#include <linux/types.h>
58991+#include <linux/fs.h> /* for struct super_block */
58992+#include <linux/mutex.h>
58993+#include <asm/div64.h>
58994+
58995+/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
58996+ * blocks
58997+
58998+ A useful optimization of reiser4 bitmap handling would be dynamic bitmap
58999+ block loading/unloading, unlike v3.x where all bitmap
59000+ blocks are loaded at mount time.
59001+
59002+ To implement bitmap block unloading we need to count bitmap block usage
59003+ and detect currently unused blocks, allowing them to be unloaded. It is not
59004+ a simple task since we allow several threads to modify one bitmap block
59005+ simultaneously.
59006+
59007+ Briefly, the proposed scheme is: each bitmap block has an associated
59008+ counter of block alloc/dealloc operations performed on that bitmap
59009+ block. With the deferred block deallocation feature of reiser4, all
59010+ those operations will be represented in atom dirty/deleted lists as
59011+ jnodes for freshly allocated or deleted
59012+ nodes.
59013+
59014+ So, we increment the usage counter for each node allocated or deleted, and
59015+ decrement it once at atom commit for each node from the atom's
59016+ dirty/deleted lists. Of course, deletion of a freshly allocated node and
59017+ reuse of a node from the atom's deleted list (if we do so) should also
59018+ decrement the bitmap usage counter.
59019+
59020+ This scheme seems workable, but such reference counting is
59021+ not easy to debug. I think we should agree with Hans and not implement
59022+ it in v4.0. The current code implements "on-demand" bitmap block loading only.
59023+
59024+ For simplicity, all bitmap nodes (both commit and working bitmap blocks)
59025+ are either loaded into memory at fs mount time or each is loaded on
59026+ first access; the "dont_load_bitmap" mount option controls whether
59027+ bitmap nodes should be loaded at mount time. Dynamic unloading of bitmap
59028+ nodes is currently not supported. */
59029+
59030+#define CHECKSUM_SIZE 4
59031+
59032+#define BYTES_PER_LONG (sizeof(long))
59033+
59034+#if BITS_PER_LONG == 64
59035+# define LONG_INT_SHIFT (6)
59036+#else
59037+# define LONG_INT_SHIFT (5)
59038+#endif
59039+
59040+#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
59041+
59042+typedef unsigned long ulong_t;
59043+
59044+#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE)
59045+#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3)
59046+
59047+/* Block allocation/deallocation are done through special bitmap objects which
59048+ are allocated in an array at fs mount. */
59049+struct bitmap_node {
59050+ struct mutex mutex; /* long term lock object */
59051+
59052+ jnode *wjnode; /* j-nodes for WORKING ... */
59053+ jnode *cjnode; /* ... and COMMIT bitmap blocks */
59054+
59055+ bmap_off_t first_zero_bit; /* for skip_busy option implementation */
59056+
59057+ atomic_t loaded; /* a flag which shows that bnode is loaded
59058+ * already */
59059+};
59060+
59061+static inline char *bnode_working_data(struct bitmap_node *bnode)
59062+{
59063+ char *data;
59064+
59065+ data = jdata(bnode->wjnode);
59066+ assert("zam-429", data != NULL);
59067+
59068+ return data + CHECKSUM_SIZE;
59069+}
59070+
59071+static inline char *bnode_commit_data(const struct bitmap_node *bnode)
59072+{
59073+ char *data;
59074+
59075+ data = jdata(bnode->cjnode);
59076+ assert("zam-430", data != NULL);
59077+
59078+ return data + CHECKSUM_SIZE;
59079+}
59080+
59081+static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
59082+{
59083+ char *data;
59084+
59085+ data = jdata(bnode->cjnode);
59086+ assert("vpf-261", data != NULL);
59087+
59088+ return le32_to_cpu(get_unaligned((d32 *)data));
59089+}
59090+
59091+static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
59092+{
59093+ char *data;
59094+
59095+ data = jdata(bnode->cjnode);
59096+ assert("vpf-261", data != NULL);
59097+
59098+ put_unaligned(cpu_to_le32(crc), (d32 *)data);
59099+}
59100+
59101+/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
59102+ * written the code, does this added abstraction still have */
59103+/* ANSWER(Zam): No, the abstraction is at the level above (the exact place is
59104+ * the reiser4_space_allocator structure) */
59105+/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
59106+/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
59107+ * someday?". What they about? If there is a reason to have a union, it should
59108+ * be a union, if not, it should not be a union. "..might be someday" means no
59109+ * reason. */
59110+struct bitmap_allocator_data {
59111+ /* an array for bitmap blocks direct access */
59112+ struct bitmap_node *bitmap;
59113+};
59114+
59115+#define get_barray(super) \
59116+(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
59117+
59118+#define get_bnode(super, i) (get_barray(super) + i)
59119+
59120+/* allocate and initialize jnode with JNODE_BITMAP type */
59121+static jnode *bnew(void)
59122+{
59123+ jnode *jal = jalloc();
59124+
59125+ if (jal)
59126+ jnode_init(jal, current_tree, JNODE_BITMAP);
59127+
59128+ return jal;
59129+}
59130+
59131+/* this file contains:
59132+ - bitmap based implementation of space allocation plugin
59133+ - all the helper functions like set bit, find_first_zero_bit, etc */
59134+
59135+/* Audited by: green(2002.06.12) */
59136+static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
59137+{
59138+ ulong_t mask = 1UL << start_bit;
59139+ int i = start_bit;
59140+
59141+ while ((word & mask) != 0) {
59142+ mask <<= 1;
59143+ if (++i >= BITS_PER_LONG)
59144+ break;
59145+ }
59146+
59147+ return i;
59148+}
59149+
59150+#include <linux/bitops.h>
59151+
59152+#if BITS_PER_LONG == 64
59153+
59154+#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
59155+#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
59156+
59157+static inline void reiser4_set_bit(int nr, void *addr)
59158+{
59159+ ext2_set_bit(nr + OFF(addr), BASE(addr));
59160+}
59161+
59162+static inline void reiser4_clear_bit(int nr, void *addr)
59163+{
59164+ ext2_clear_bit(nr + OFF(addr), BASE(addr));
59165+}
59166+
59167+static inline int reiser4_test_bit(int nr, void *addr)
59168+{
59169+ return ext2_test_bit(nr + OFF(addr), BASE(addr));
59170+}
59171+static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
59172+ int offset)
59173+{
59174+ int off = OFF(addr);
59175+
59176+ return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
59177+ offset + off) - off;
59178+}
59179+
59180+#else
59181+
59182+#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr)
59183+#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
59184+#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr)
59185+
59186+#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
59187+ext2_find_next_zero_bit(addr, maxoffset, offset)
59188+#endif
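On 64-bit machines the ext2_*_bit helpers are only used on long-aligned addresses here, so OFF/BASE rebase an arbitrary byte address: BASE rounds down to the previous long boundary and OFF compensates in bits. A standalone sketch of the arithmetic (illustrative, not part of the patch):

static void off_base_demo(void)
{
	long words[2] = { 0, 0 };
	/* a byte address 3 bytes past a long boundary */
	char *addr = (char *)words + 3;
	int off = ((ulong_t)addr & (BYTES_PER_LONG - 1)) << 3;	/* 24 */
	ulong_t *base = (ulong_t *)((ulong_t)addr & ~(BYTES_PER_LONG - 1));

	/* bit nr of addr names the same memory bit as bit (nr + off) of
	 * base; here (char *)base + off / 8 == addr. */
}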
59189+
59190+/* Search for a set bit in the bit array [@start_offset, @max_offset); offsets
59191+ * are counted from @addr. Return the offset of the first set bit if one is
59192+ * found, @max_offset otherwise. */
59193+static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
59194+ bmap_off_t start_offset)
59195+{
59196+ ulong_t *base = addr;
59197+ /* start_offset is in bits, convert it to a word index within the bitmap. */
59198+ int word_nr = start_offset >> LONG_INT_SHIFT;
59199+ /* bit number within the word. */
59200+ int bit_nr = start_offset & LONG_INT_MASK;
59201+ int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
59202+
59203+ assert("zam-387", max_offset != 0);
59204+
59205+ /* Unaligned @start_offset case. */
59206+ if (bit_nr != 0) {
59207+ bmap_nr_t nr;
59208+
59209+ nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
59210+
59211+ if (nr < BITS_PER_LONG)
59212+ return (word_nr << LONG_INT_SHIFT) + nr;
59213+
59214+ ++word_nr;
59215+ }
59216+
59217+ /* Fast scan through aligned words. */
59218+ while (word_nr <= max_word_nr) {
59219+ if (base[word_nr] != 0) {
59220+ return (word_nr << LONG_INT_SHIFT)
59221+ + find_next_zero_bit_in_word(~(base[word_nr]), 0);
59222+ }
59223+
59224+ ++word_nr;
59225+ }
59226+
59227+ return max_offset;
59228+}
59229+
59230+#if BITS_PER_LONG == 64
59231+
59232+static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
59233+ bmap_off_t start_offset)
59234+{
59235+ bmap_off_t off = OFF(addr);
59236+
59237+ return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
59238+ start_offset + off) - off;
59239+}
59240+
59241+#else
59242+#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
59243+ __reiser4_find_next_set_bit(addr, max_offset, start_offset)
59244+#endif
59245+
59246+/* search a single word, downward from @start_bit, for the last (highest) set bit. */
59247+static int find_last_set_bit_in_word(ulong_t word, int start_bit)
59248+{
59249+ ulong_t bit_mask;
59250+ int nr = start_bit;
59251+
59252+ assert("zam-965", start_bit < BITS_PER_LONG);
59253+ assert("zam-966", start_bit >= 0);
59254+
59255+ bit_mask = (1UL << nr);
59256+
59257+ while (bit_mask != 0) {
59258+ if (bit_mask & word)
59259+ return nr;
59260+ bit_mask >>= 1;
59261+ nr--;
59262+ }
59263+ return BITS_PER_LONG;
59264+}
59265+
59266+/* Search bitmap for a set bit in backward direction from the end to the
59267+ * beginning of given region
59268+ *
59269+ * @result: result offset of the last set bit
59270+ * @addr: base memory address,
59271+ * @low_off: low end of the search region, edge bit included into the region,
59272+ * @high_off: high end of the search region, edge bit included into the region,
59273+ *
59274+ * @return: 0 - set bit was found, -1 otherwise.
59275+ */
59276+static int
59277+reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
59278+ bmap_off_t high_off)
59279+{
59280+ ulong_t *base = addr;
59281+ int last_word;
59282+ int first_word;
59283+ int last_bit;
59284+ int nr;
59285+
59286+ assert("zam-962", high_off >= low_off);
59287+
59288+ last_word = high_off >> LONG_INT_SHIFT;
59289+ last_bit = high_off & LONG_INT_MASK;
59290+ first_word = low_off >> LONG_INT_SHIFT;
59291+
59292+ if (last_bit < BITS_PER_LONG) {
59293+ nr = find_last_set_bit_in_word(base[last_word], last_bit);
59294+ if (nr < BITS_PER_LONG) {
59295+ *result = (last_word << LONG_INT_SHIFT) + nr;
59296+ return 0;
59297+ }
59298+ --last_word;
59299+ }
59300+ while (last_word >= first_word) {
59301+ if (base[last_word] != 0x0) {
59302+ last_bit =
59303+ find_last_set_bit_in_word(base[last_word],
59304+ BITS_PER_LONG - 1);
59305+ assert("zam-972", last_bit < BITS_PER_LONG);
59306+ *result = (last_word << LONG_INT_SHIFT) + last_bit;
59307+ return 0;
59308+ }
59309+ --last_word;
59310+ }
59311+
59312+ return -1; /* set bit not found */
59313+}
59314+
59315+/* Search bitmap for a clear bit in backward direction from the end to the
59316+ * beginning of given region */
59317+static int
59318+reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
59319+ bmap_off_t high_off)
59320+{
59321+ ulong_t *base = addr;
59322+ int last_word;
59323+ int first_word;
59324+ int last_bit;
59325+ int nr;
59326+
59327+ last_word = high_off >> LONG_INT_SHIFT;
59328+ last_bit = high_off & LONG_INT_MASK;
59329+ first_word = low_off >> LONG_INT_SHIFT;
59330+
59331+ if (last_bit < BITS_PER_LONG) {
59332+ nr = find_last_set_bit_in_word(~base[last_word], last_bit);
59333+ if (nr < BITS_PER_LONG) {
59334+ *result = (last_word << LONG_INT_SHIFT) + nr;
59335+ return 0;
59336+ }
59337+ --last_word;
59338+ }
59339+ while (last_word >= first_word) {
59340+ if (base[last_word] != (ulong_t) (-1)) {
59341+ *result = (last_word << LONG_INT_SHIFT) +
59342+ find_last_set_bit_in_word(~base[last_word],
59343+ BITS_PER_LONG - 1);
59344+ return 0;
59345+ }
59346+ --last_word;
59347+ }
59348+
59349+ return -1; /* zero bit not found */
59350+}
59351+
59352+/* Audited by: green(2002.06.12) */
59353+static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
59354+{
59355+ int first_byte;
59356+ int last_byte;
59357+
59358+ unsigned char first_byte_mask = 0xFF;
59359+ unsigned char last_byte_mask = 0xFF;
59360+
59361+ assert("zam-410", start < end);
59362+
59363+ first_byte = start >> 3;
59364+ last_byte = (end - 1) >> 3;
59365+
59366+ if (last_byte > first_byte + 1)
59367+ memset(addr + first_byte + 1, 0,
59368+ (size_t) (last_byte - first_byte - 1));
59369+
59370+ first_byte_mask >>= 8 - (start & 0x7);
59371+ last_byte_mask <<= ((end - 1) & 0x7) + 1;
59372+
59373+ if (first_byte == last_byte) {
59374+ addr[first_byte] &= (first_byte_mask | last_byte_mask);
59375+ } else {
59376+ addr[first_byte] &= first_byte_mask;
59377+ addr[last_byte] &= last_byte_mask;
59378+ }
59379+}
59380+
59381+/* Audited by: green(2002.06.12) */
59382+/* ZAM-FIXME-HANS: comment this */
59383+static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
59384+{
59385+ int first_byte;
59386+ int last_byte;
59387+
59388+ unsigned char first_byte_mask = 0xFF;
59389+ unsigned char last_byte_mask = 0xFF;
59390+
59391+ assert("zam-386", start < end);
59392+
59393+ first_byte = start >> 3;
59394+ last_byte = (end - 1) >> 3;
59395+
59396+ if (last_byte > first_byte + 1)
59397+ memset(addr + first_byte + 1, 0xFF,
59398+ (size_t) (last_byte - first_byte - 1));
59399+
59400+ first_byte_mask <<= start & 0x7;
59401+ last_byte_mask >>= 7 - ((end - 1) & 0x7);
59402+
59403+ if (first_byte == last_byte) {
59404+ addr[first_byte] |= (first_byte_mask & last_byte_mask);
59405+ } else {
59406+ addr[first_byte] |= first_byte_mask;
59407+ addr[last_byte] |= last_byte_mask;
59408+ }
59409+}
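A worked example of the masks computed above (illustrative, not part of the patch): reiser4_set_bits(addr, 3, 11) numbers bits LSB-first within each byte and treats @end as exclusive, so it sets bits 3..10.

static void set_bits_demo(void)
{
	unsigned char map[2] = { 0, 0 };

	/* first_byte = 3 >> 3 = 0, last_byte = (11 - 1) >> 3 = 1       */
	/* first_byte_mask = 0xFF << (3 & 7)        = 0xF8 (bits 3..7)  */
	/* last_byte_mask  = 0xFF >> (7 - (10 & 7)) = 0x07 (bits 8..10) */
	reiser4_set_bits((char *)map, 3, 11);
	/* now map[0] == 0xF8 and map[1] == 0x07 */
}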
59410+
59411+#define ADLER_BASE 65521
59412+#define ADLER_NMAX 5552
59413+
59414+/* Calculates the adler32 checksum for the data pointed to by `data` of the
59415+ length `len`. This function was originally taken from zlib, version 1.1.3,
59416+ July 9th, 1998.
59417+
59418+ Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
59419+
59420+ This software is provided 'as-is', without any express or implied
59421+ warranty. In no event will the authors be held liable for any damages
59422+ arising from the use of this software.
59423+
59424+ Permission is granted to anyone to use this software for any purpose,
59425+ including commercial applications, and to alter it and redistribute it
59426+ freely, subject to the following restrictions:
59427+
59428+ 1. The origin of this software must not be misrepresented; you must not
59429+ claim that you wrote the original software. If you use this software
59430+ in a product, an acknowledgment in the product documentation would be
59431+ appreciated but is not required.
59432+ 2. Altered source versions must be plainly marked as such, and must not be
59433+ misrepresented as being the original software.
59434+ 3. This notice may not be removed or altered from any source distribution.
59435+
59436+ Jean-loup Gailly Mark Adler
59437+ jloup@gzip.org madler@alumni.caltech.edu
59438+
59439+ The above comment applies only to the reiser4_adler32 function.
59440+*/
59441+
59442+__u32 reiser4_adler32(char *data, __u32 len)
59443+{
59444+ unsigned char *t = data;
59445+ __u32 s1 = 1;
59446+ __u32 s2 = 0;
59447+ int k;
59448+
59449+ while (len > 0) {
59450+ k = len < ADLER_NMAX ? len : ADLER_NMAX;
59451+ len -= k;
59452+
59453+ while (k--) {
59454+ s1 += *t++;
59455+ s2 += s1;
59456+ }
59457+
59458+ s1 %= ADLER_BASE;
59459+ s2 %= ADLER_BASE;
59460+ }
59461+ return (s2 << 16) | s1;
59462+}
59463+
59464+#define sb_by_bnode(bnode) \
59465+ ((struct super_block *)jnode_get_tree(bnode->wjnode)->super)
59466+
59467+static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
59468+{
59469+ return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
59470+}
59471+
59472+static int
59473+bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
59474+{
59475+ if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
59476+ bmap_nr_t bmap;
59477+
59478+ bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
59479+
59480+ warning("vpf-263",
59481+ "Checksum for the bitmap block %llu is incorrect",
59482+ bmap);
59483+
59484+ return RETERR(-EIO);
59485+ }
59486+
59487+ return 0;
59488+}
59489+
59490+#define REISER4_CHECK_BMAP_CRC (0)
59491+
59492+#if REISER4_CHECK_BMAP_CRC
59493+static int bnode_check_crc(const struct bitmap_node *bnode)
59494+{
59495+ return bnode_check_adler32(bnode,
59496+ bmap_size(sb_by_bnode(bnode)->s_blocksize));
59497+}
59498+
59499+/* REISER4_CHECK_BMAP_CRC */
59500+#else
59501+
59502+#define bnode_check_crc(bnode) (0)
59503+
59504+/* REISER4_CHECK_BMAP_CRC */
59505+#endif
59506+
59507+/* Recalculates the adler32 checksum after a single byte change.
59508+ adler - previous adler checksum
59509+ old_data, data - old and new byte values.
59510+ tail == (chunk - offset), where chunk is the length the checksum was
59511+ calculated for and offset is the offset of the changed byte within it.
59512+ This function can be used to optimise checksum recalculation.
59513+*/
59514+
59515+static __u32
59516+adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
59517+ __u32 tail)
59518+{
59519+ __u32 delta = data - old_data + 2 * ADLER_BASE;
59520+ __u32 s1 = adler & 0xffff;
59521+ __u32 s2 = (adler >> 16) & 0xffff;
59522+
59523+ s1 = (delta + s1) % ADLER_BASE;
59524+ s2 = (delta * tail + s2) % ADLER_BASE;
59525+
59526+ return (s2 << 16) | s1;
59527+}
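Why this works: the changed byte enters s1 once and enters s2 once for every one of the `tail` positions from its offset to the end of the chunk, so adding delta to s1 and delta * tail to s2 matches a full recompute. A sketch of the equivalence (illustrative, not part of the patch; relies on the two functions above):

static void adler_recalc_demo(char *data, __u32 len, __u32 off,
			      unsigned char newval)
{
	__u32 full_before = reiser4_adler32(data, len);
	unsigned char old = data[off];
	__u32 incremental;

	data[off] = newval;
	incremental = adler32_recalc(full_before, old, newval, len - off);
	/* incremental == reiser4_adler32(data, len) */
}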
59528+
59529+#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
59530+
59531+/**
59532+ * get_nr_bmap - calculate number of bitmap blocks
59533+ * @super: super block with initialized blocksize and block count
59534+ *
59535+ * Calculates the number of bitmap blocks of a filesystem which uses bitmaps
59536+ * to maintain free disk space. It assumes that each bitmap addresses the same
59537+ * number of blocks, which is calculated by the bmap_bit_count macro defined
59538+ * above. The number of blocks in the filesystem has to be initialized in the
59539+ * reiser4 private data of the super block already, so that it can be obtained
59540+ * via reiser4_block_count(). Unfortunately, the number of blocks addressed by
59541+ * a bitmap is not a power of 2, because 4 bytes are used for the checksum.
59542+ * Therefore, we have to use special functions to divide and modulo 64-bit
59543+ * filesystem block counters.
59544+ *
59545+ * Example: suppose a filesystem has 32768 blocks. Blocksize is 4096. Each bitmap
59546+ * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address
59547+ * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2.
59548+ */
59549+static bmap_nr_t get_nr_bmap(const struct super_block *super)
59550+{
59551+ u64 quotient;
59552+
59553+ assert("zam-393", reiser4_block_count(super) != 0);
59554+
59555+ quotient = reiser4_block_count(super) - 1;
59556+ do_div(quotient, bmap_bit_count(super->s_blocksize));
59557+ return quotient + 1;
59558+}
59559+
59560+/**
59561+ * parse_blocknr - calculate bitmap number and offset in it by block number
59562+ * @block: pointer to block number to calculate location in bitmap of
59563+ * @bmap: pointer where to store bitmap block number
59564+ * @offset: pointer where to store offset within bitmap block
59565+ *
59566+ * Calculates location of bit which is responsible for allocation/freeing of
59567+ * block @*block. That location is represented by bitmap block number and offset
59568+ * within that bitmap block.
59569+ */
59570+static void
59571+parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
59572+ bmap_off_t *offset)
59573+{
59574+ struct super_block *super = get_current_context()->super;
59575+ u64 quotient = *block;
59576+
59577+ *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
59578+ *bmap = quotient;
59579+
59580+ assert("zam-433", *bmap < get_nr_bmap(super));
59581+ assert("", *offset < bmap_bit_count(super->s_blocksize));
59582+}
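Worked example (illustrative, not part of the patch): with a 4096-byte blocksize each bitmap covers bmap_bit_count(4096) = (4096 - 4) * 8 = 32736 bits, so block 40000 parses to bitmap 1 at offset 40000 - 32736 = 7264.

static void parse_blocknr_demo(void)
{
	reiser4_block_nr block = 40000;
	bmap_nr_t bmap;
	bmap_off_t offset;

	parse_blocknr(&block, &bmap, &offset);	/* needs a reiser4 context */
	/* now bmap == 1 and offset == 7264 */
}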
59583+
59584+#if REISER4_DEBUG
59585+/* Audited by: green(2002.06.12) */
59586+static void
59587+check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
59588+{
59589+ struct super_block *sb = reiser4_get_current_sb();
59590+
59591+ assert("zam-436", sb != NULL);
59592+
59593+ assert("zam-455", start != NULL);
59594+ assert("zam-437", *start != 0);
59595+ assert("zam-541", !reiser4_blocknr_is_fake(start));
59596+ assert("zam-441", *start < reiser4_block_count(sb));
59597+
59598+ if (len != NULL) {
59599+ assert("zam-438", *len != 0);
59600+ assert("zam-442", *start + *len <= reiser4_block_count(sb));
59601+ }
59602+}
59603+
59604+static void check_bnode_loaded(const struct bitmap_node *bnode)
59605+{
59606+ assert("zam-485", bnode != NULL);
59607+ assert("zam-483", jnode_page(bnode->wjnode) != NULL);
59608+ assert("zam-484", jnode_page(bnode->cjnode) != NULL);
59609+ assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
59610+ assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
59611+}
59612+
59613+#else
59614+
59615+# define check_block_range(start, len) do { /* nothing */} while(0)
59616+# define check_bnode_loaded(bnode) do { /* nothing */} while(0)
59617+
59618+#endif
59619+
59620+/* modify bnode->first_zero_bit (if we free bits before); bnode should be
59621+ locked */
59622+static inline void
59623+adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
59624+{
59625+ if (offset < bnode->first_zero_bit)
59626+ bnode->first_zero_bit = offset;
59627+}
59628+
59629+/* return a physical disk address for logical bitmap number @bmap */
59630+/* FIXME-VS: this is somehow related to disk layout? */
59631+/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
59632+ * per block allocation so that performance is not affected. Probably this
59633+ * whole file should be considered part of the disk layout plugin, and other
59634+ * disk layouts can use other defines and efficiency will not be significantly
59635+ * affected. */
59636+
59637+#define REISER4_FIRST_BITMAP_BLOCK \
59638+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
59639+
59640+/* Audited by: green(2002.06.12) */
59641+static void
59642+get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
59643+ reiser4_block_nr * bnr)
59644+{
59645+
59646+ assert("zam-390", bmap < get_nr_bmap(super));
59647+
59648+#ifdef CONFIG_REISER4_BADBLOCKS
59649+#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
59650+ /* Check if the diskmap have this already, first. */
59651+ if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
59652+ return; /* Found it in diskmap */
59653+#endif
59654+ /* FIXME_ZAM: before discussing disk layouts and disk format
59655+ plugins, I implement a bitmap location scheme which is close to the
59656+ scheme used in reiser 3.6 */
59657+ if (bmap == 0) {
59658+ *bnr = REISER4_FIRST_BITMAP_BLOCK;
59659+ } else {
59660+ *bnr = bmap * bmap_bit_count(super->s_blocksize);
59661+ }
59662+}
59663+
59664+/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
59665+/* Audited by: green(2002.06.12) */
59666+static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
59667+{
59668+ *bnr =
59669+ (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
59670+ REISER4_BITMAP_BLOCKS_STATUS_VALUE);
59671+}
59672+
59673+/* bnode structure initialization */
59674+static void
59675+init_bnode(struct bitmap_node *bnode,
59676+ struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
59677+{
59678+ memset(bnode, 0, sizeof(struct bitmap_node));
59679+
59680+ mutex_init(&bnode->mutex);
59681+ atomic_set(&bnode->loaded, 0);
59682+}
59683+
59684+static void release(jnode * node)
59685+{
59686+ jrelse(node);
59687+ JF_SET(node, JNODE_HEARD_BANSHEE);
59688+ jput(node);
59689+}
59690+
59691+/* This function is for internal bitmap.c use because it assumes that the
59692+ jnode is under full control of this thread */
59693+static void done_bnode(struct bitmap_node *bnode)
59694+{
59695+ if (bnode) {
59696+ atomic_set(&bnode->loaded, 0);
59697+ if (bnode->wjnode != NULL)
59698+ release(bnode->wjnode);
59699+ if (bnode->cjnode != NULL)
59700+ release(bnode->cjnode);
59701+ bnode->wjnode = bnode->cjnode = NULL;
59702+ }
59703+}
59704+
59705+/* ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()*/
59706+static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
59707+ jnode **wjnode_ret)
59708+{
59709+ struct super_block *super;
59710+ jnode *cjnode;
59711+ jnode *wjnode;
59712+ bmap_nr_t bmap;
59713+ int ret;
59714+
59715+ super = reiser4_get_current_sb();
59716+
59717+ *wjnode_ret = wjnode = bnew();
59718+ if (wjnode == NULL) {
59719+ *cjnode_ret = NULL;
59720+ return RETERR(-ENOMEM);
59721+ }
59722+
59723+ *cjnode_ret = cjnode = bnew();
59724+ if (cjnode == NULL)
59725+ return RETERR(-ENOMEM);
59726+
59727+ bmap = bnode - get_bnode(super, 0);
59728+
59729+ get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
59730+ get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
59731+
59732+ jref(cjnode);
59733+ jref(wjnode);
59734+
59735+ /* load commit bitmap */
59736+ ret = jload_gfp(cjnode, GFP_NOFS, 1);
59737+
59738+ if (ret)
59739+ goto error;
59740+
59741+ /* allocate memory for working bitmap block. Note that for
59742+ * bitmaps jinit_new() doesn't actually modify node content,
59743+ * so parallel calls to this are ok. */
59744+ ret = jinit_new(wjnode, GFP_NOFS);
59745+
59746+ if (ret != 0) {
59747+ jrelse(cjnode);
59748+ goto error;
59749+ }
59750+
59751+ return 0;
59752+
59753+ error:
59754+ jput(cjnode);
59755+ jput(wjnode);
59756+ *wjnode_ret = *cjnode_ret = NULL;
59757+ return ret;
59758+
59759+}
59760+
59761+/* Check the bnode data on read. */
59762+static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
59763+{
59764+ void *data;
59765+ int ret;
59766+
59767+ /* Check CRC */
59768+ ret = bnode_check_adler32(bnode, blksize);
59769+
59770+ if (ret) {
59771+ return ret;
59772+ }
59773+
59774+ data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
59775+
59776+ /* Check the very first bit -- it must be busy. */
59777+ if (!reiser4_test_bit(0, data)) {
59778+ warning("vpf-1362", "The allocator block %llu is not marked "
59779+ "as used.", (unsigned long long)bnode->cjnode->blocknr);
59780+
59781+ return -EINVAL;
59782+ }
59783+
59784+ return 0;
59785+}
59786+
59787+/* load bitmap blocks "on-demand" */
59788+static int load_and_lock_bnode(struct bitmap_node *bnode)
59789+{
59790+ int ret;
59791+
59792+ jnode *cjnode;
59793+ jnode *wjnode;
59794+
59795+ assert("nikita-3040", reiser4_schedulable());
59796+
59797+/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
59798+ * need to be atomic, right? Just leave a comment that if bitmaps were
59799+ * unloadable, this would need to be atomic. */
59800+ if (atomic_read(&bnode->loaded)) {
59801+ /* bitmap is already loaded, nothing to do */
59802+ check_bnode_loaded(bnode);
59803+ mutex_lock(&bnode->mutex);
59804+ assert("nikita-2827", atomic_read(&bnode->loaded));
59805+ return 0;
59806+ }
59807+
59808+ ret = prepare_bnode(bnode, &cjnode, &wjnode);
59809+ if (ret == 0) {
59810+ mutex_lock(&bnode->mutex);
59811+
59812+ if (!atomic_read(&bnode->loaded)) {
59813+ assert("nikita-2822", cjnode != NULL);
59814+ assert("nikita-2823", wjnode != NULL);
59815+ assert("nikita-2824", jnode_is_loaded(cjnode));
59816+ assert("nikita-2825", jnode_is_loaded(wjnode));
59817+
59818+ bnode->wjnode = wjnode;
59819+ bnode->cjnode = cjnode;
59820+
59821+ ret = check_struct_bnode(bnode, current_blocksize);
59822+ if (!ret) {
59823+ cjnode = wjnode = NULL;
59824+ atomic_set(&bnode->loaded, 1);
59825+ /* working bitmap is initialized by on-disk
59826+ * commit bitmap. This should be performed
59827+ * under mutex. */
59828+ memcpy(bnode_working_data(bnode),
59829+ bnode_commit_data(bnode),
59830+ bmap_size(current_blocksize));
59831+ } else
59832+ mutex_unlock(&bnode->mutex);
59833+ } else
59834+ /* race: someone already loaded bitmap while we were
59835+ * busy initializing data. */
59836+ check_bnode_loaded(bnode);
59837+ }
59838+
59839+ if (wjnode != NULL) {
59840+ release(wjnode);
59841+ bnode->wjnode = NULL;
59842+ }
59843+ if (cjnode != NULL) {
59844+ release(cjnode);
59845+ bnode->cjnode = NULL;
59846+ }
59847+
59848+ return ret;
59849+}
59850+
59851+static void release_and_unlock_bnode(struct bitmap_node *bnode)
59852+{
59853+ check_bnode_loaded(bnode);
59854+ mutex_unlock(&bnode->mutex);
59855+}
59856+
59857+/* This function does all block allocation work but only for one bitmap
59858+ block. */
59859+/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
59860+ block responsibility zone boundaries. This made no sense in v3.6 but may
59861+ make sense in v4.x */
59862+/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
59863+static int
59864+search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
59865+ bmap_off_t max_offset, int min_len, int max_len)
59866+{
59867+ struct super_block *super = get_current_context()->super;
59868+ struct bitmap_node *bnode = get_bnode(super, bmap);
59869+
59870+ char *data;
59871+
59872+ bmap_off_t search_end;
59873+ bmap_off_t start;
59874+ bmap_off_t end;
59875+
59876+ int set_first_zero_bit = 0;
59877+
59878+ int ret;
59879+
59880+ assert("zam-364", min_len > 0);
59881+ assert("zam-365", max_len >= min_len);
59882+ assert("zam-366", *offset <= max_offset);
59883+
59884+ ret = load_and_lock_bnode(bnode);
59885+
59886+ if (ret)
59887+ return ret;
59888+
59889+ data = bnode_working_data(bnode);
59890+
59891+ start = *offset;
59892+
59893+ if (bnode->first_zero_bit >= start) {
59894+ start = bnode->first_zero_bit;
59895+ set_first_zero_bit = 1;
59896+ }
59897+
59898+ while (start + min_len < max_offset) {
59899+
59900+ start =
59901+ reiser4_find_next_zero_bit((long *)data, max_offset, start);
59902+ if (set_first_zero_bit) {
59903+ bnode->first_zero_bit = start;
59904+ set_first_zero_bit = 0;
59905+ }
59906+ if (start >= max_offset)
59907+ break;
59908+
59909+ search_end = LIMIT(start + max_len, max_offset);
59910+ end =
59911+ reiser4_find_next_set_bit((long *)data, search_end, start);
59912+ if (end >= start + min_len) {
59913+ /* we can't trust the find_next_set_bit result if no set
59914+ bit was found; the result may be bigger than
59915+ max_offset */
59916+ if (end > search_end)
59917+ end = search_end;
59918+
59919+ ret = end - start;
59920+ *offset = start;
59921+
59922+ reiser4_set_bits(data, start, end);
59923+
59924+ /* FIXME: we may advance first_zero_bit if [start,
59925+ end] region overlaps the first_zero_bit point */
59926+
59927+ break;
59928+ }
59929+
59930+ start = end + 1;
59931+ }
59932+
59933+ release_and_unlock_bnode(bnode);
59934+
59935+ return ret;
59936+}
59937+
59938+static int
59939+search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
59940+ bmap_off_t end_offset, int min_len, int max_len)
59941+{
59942+ struct super_block *super = get_current_context()->super;
59943+ struct bitmap_node *bnode = get_bnode(super, bmap);
59944+ char *data;
59945+ bmap_off_t start;
59946+ int ret;
59947+
59948+ assert("zam-958", min_len > 0);
59949+ assert("zam-959", max_len >= min_len);
59950+ assert("zam-960", *start_offset >= end_offset);
59951+
59952+ ret = load_and_lock_bnode(bnode);
59953+ if (ret)
59954+ return ret;
59955+
59956+ data = bnode_working_data(bnode);
59957+ start = *start_offset;
59958+
59959+ while (1) {
59960+ bmap_off_t end, search_end;
59961+
59962+ /* Find the beginning of the zero filled region */
59963+ if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
59964+ break;
59965+		/* Are there at least `min_len' bits from `start' to
59966+		 * `end_offset'? */
59967+ if (start < end_offset + min_len - 1)
59968+ break;
59969+
59970+ /* Do not search to `end_offset' if we need to find less than
59971+ * `max_len' zero bits. */
59972+ if (end_offset + max_len - 1 < start)
59973+ search_end = start - max_len + 1;
59974+ else
59975+ search_end = end_offset;
59976+
59977+ if (reiser4_find_last_set_bit(&end, data, search_end, start))
59978+ end = search_end;
59979+ else
59980+ end++;
59981+
59982+ if (end + min_len <= start + 1) {
59983+ if (end < search_end)
59984+ end = search_end;
59985+ ret = start - end + 1;
59986+ *start_offset = end; /* `end' is lowest offset */
59987+ assert("zam-987",
59988+ reiser4_find_next_set_bit(data, start + 1,
59989+ end) >= start + 1);
59990+ reiser4_set_bits(data, end, start + 1);
59991+ break;
59992+ }
59993+
59994+ if (end <= end_offset)
59995+ /* left search boundary reached. */
59996+ break;
59997+ start = end - 1;
59998+ }
59999+
60000+ release_and_unlock_bnode(bnode);
60001+ return ret;
60002+}
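+
+/* A worked example of the backward search above (illustrative only,
+   assuming the find_last_* helpers scan downward from their last argument):
+   with working data bits 11000111 (bit 0 leftmost), *start_offset == 5,
+   end_offset == 0, min_len == 2 and max_len == 8, the zero run found
+   walking left from bit 5 is bits 2..4. Those bits are marked used,
+   *start_offset becomes 2 (the lowest offset of the run) and the returned
+   length is 3. */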
60003+
60004+/* allocate contiguous range of blocks in bitmap */
60005+static int bitmap_alloc_forward(reiser4_block_nr * start,
60006+ const reiser4_block_nr * end, int min_len,
60007+ int max_len)
60008+{
60009+ bmap_nr_t bmap, end_bmap;
60010+ bmap_off_t offset, end_offset;
60011+ int len;
60012+
60013+ reiser4_block_nr tmp;
60014+
60015+ struct super_block *super = get_current_context()->super;
60016+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
60017+
60018+ parse_blocknr(start, &bmap, &offset);
60019+
60020+ tmp = *end - 1;
60021+ parse_blocknr(&tmp, &end_bmap, &end_offset);
60022+ ++end_offset;
60023+
60024+ assert("zam-358", end_bmap >= bmap);
60025+ assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
60026+
60027+ for (; bmap < end_bmap; bmap++, offset = 0) {
60028+ len =
60029+ search_one_bitmap_forward(bmap, &offset, max_offset,
60030+ min_len, max_len);
60031+ if (len != 0)
60032+ goto out;
60033+ }
60034+
60035+ len =
60036+ search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
60037+ max_len);
60038+ out:
60039+ *start = bmap * max_offset + offset;
60040+ return len;
60041+}
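+
+/* The mapping between block numbers and (bmap, offset) pairs used above is
+   linear: block == bmap * max_offset + offset, where max_offset is the
+   number of bits covered by one bitmap block. Illustrative arithmetic,
+   assuming 4096-byte blocks: max_offset == 8 * 4096 == 32768, so block
+   100000 parses to bmap == 3, offset == 1696 (3 * 32768 == 98304 and
+   100000 - 98304 == 1696). */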
60042+
60043+/* allocate contiguous range of blocks in bitmap (from @start to @end in
60044+ * backward direction) */
60045+static int bitmap_alloc_backward(reiser4_block_nr * start,
60046+ const reiser4_block_nr * end, int min_len,
60047+ int max_len)
60048+{
60049+ bmap_nr_t bmap, end_bmap;
60050+ bmap_off_t offset, end_offset;
60051+ int len;
60052+ struct super_block *super = get_current_context()->super;
60053+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
60054+
60055+ parse_blocknr(start, &bmap, &offset);
60056+ parse_blocknr(end, &end_bmap, &end_offset);
60057+
60058+ assert("zam-961", end_bmap <= bmap);
60059+ assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
60060+
60061+ for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
60062+ len =
60063+ search_one_bitmap_backward(bmap, &offset, 0, min_len,
60064+ max_len);
60065+ if (len != 0)
60066+ goto out;
60067+ }
60068+
60069+ len =
60070+ search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
60071+ max_len);
60072+ out:
60073+ *start = bmap * max_offset + offset;
60074+ return len;
60075+}
60076+
60077+/* plugin->u.space_allocator.alloc_blocks() */
60078+static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
60079+ reiser4_block_nr *start, reiser4_block_nr *len)
60080+{
60081+ struct super_block *super = get_current_context()->super;
60082+ int actual_len;
60083+
60084+ reiser4_block_nr search_start;
60085+ reiser4_block_nr search_end;
60086+
60087+ assert("zam-398", super != NULL);
60088+ assert("zam-412", hint != NULL);
60089+ assert("zam-397", hint->blk <= reiser4_block_count(super));
60090+
60091+ if (hint->max_dist == 0)
60092+ search_end = reiser4_block_count(super);
60093+ else
60094+ search_end =
60095+ LIMIT(hint->blk + hint->max_dist,
60096+ reiser4_block_count(super));
60097+
60098+	/* We use @hint->blk as the search start and search from it to the end
60099+	   of the disk, or within the given region if @hint->max_dist is not zero */
60100+ search_start = hint->blk;
60101+
60102+ actual_len =
60103+ bitmap_alloc_forward(&search_start, &search_end, 1, needed);
60104+
60105+	/* A second pass is not needed if max_dist was specified, or if the
60106+	   first pass already started from the beginning of the bitmap. The
60107+	   backward scan is likewise done in a single pass. */
60108+ if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
60109+ /* next step is a scanning from 0 to search_start */
60110+ search_end = search_start;
60111+ search_start = 0;
60112+ actual_len =
60113+ bitmap_alloc_forward(&search_start, &search_end, 1, needed);
60114+ }
60115+ if (actual_len == 0)
60116+ return RETERR(-ENOSPC);
60117+ if (actual_len < 0)
60118+ return RETERR(actual_len);
60119+ *len = actual_len;
60120+ *start = search_start;
60121+ return 0;
60122+}
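+
+/* Illustrative behaviour of the two-pass policy above: with hint->blk == 1000
+   and hint->max_dist == 0, the first pass searches [1000, block_count); only
+   if it finds nothing does the second pass search [0, 1000). When
+   hint->max_dist != 0, the search is confined to a single pass over
+   [hint->blk, hint->blk + hint->max_dist). */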
60123+
60124+static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
60125+ reiser4_block_nr * start,
60126+ reiser4_block_nr * len)
60127+{
60128+ reiser4_block_nr search_start;
60129+ reiser4_block_nr search_end;
60130+ int actual_len;
60131+
60132+ ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
60133+
60134+ assert("zam-969", super != NULL);
60135+ assert("zam-970", hint != NULL);
60136+ assert("zam-971", hint->blk <= reiser4_block_count(super));
60137+
60138+ search_start = hint->blk;
60139+ if (hint->max_dist == 0 || search_start <= hint->max_dist)
60140+ search_end = 0;
60141+ else
60142+ search_end = search_start - hint->max_dist;
60143+
60144+ actual_len =
60145+ bitmap_alloc_backward(&search_start, &search_end, 1, needed);
60146+ if (actual_len == 0)
60147+ return RETERR(-ENOSPC);
60148+ if (actual_len < 0)
60149+ return RETERR(actual_len);
60150+ *len = actual_len;
60151+ *start = search_start;
60152+ return 0;
60153+}
60154+
60155+/* plugin->u.space_allocator.alloc_blocks() */
60156+int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
60157+ reiser4_blocknr_hint * hint, int needed,
60158+ reiser4_block_nr * start, reiser4_block_nr * len)
60159+{
60160+ if (hint->backward)
60161+ return alloc_blocks_backward(hint, needed, start, len);
60162+ return alloc_blocks_forward(hint, needed, start, len);
60163+}
60164+
60165+/* plugin->u.space_allocator.dealloc_blocks(). */
60166+/* It just frees blocks in the WORKING BITMAP. Usually deletion of formatted
60167+   and unformatted nodes is deferred until transaction commit. However,
60168+   deallocation of temporary objects like wandered blocks and transaction
60169+   commit records requires immediate node deletion from the WORKING BITMAP.*/
60170+void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
60171+ reiser4_block_nr start, reiser4_block_nr len)
60172+{
60173+ struct super_block *super = reiser4_get_current_sb();
60174+
60175+ bmap_nr_t bmap;
60176+ bmap_off_t offset;
60177+
60178+ struct bitmap_node *bnode;
60179+ int ret;
60180+
60181+ assert("zam-468", len != 0);
60182+ check_block_range(&start, &len);
60183+
60184+ parse_blocknr(&start, &bmap, &offset);
60185+
60186+ assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
60187+
60188+ bnode = get_bnode(super, bmap);
60189+
60190+ assert("zam-470", bnode != NULL);
60191+
60192+ ret = load_and_lock_bnode(bnode);
60193+ assert("zam-481", ret == 0);
60194+
60195+ reiser4_clear_bits(bnode_working_data(bnode), offset,
60196+ (bmap_off_t) (offset + len));
60197+
60198+ adjust_first_zero_bit(bnode, offset);
60199+
60200+ release_and_unlock_bnode(bnode);
60201+}
60202+
60203+/* plugin->u.space_allocator.check_blocks(). */
60204+void reiser4_check_blocks_bitmap(const reiser4_block_nr * start,
60205+ const reiser4_block_nr * len, int desired)
60206+{
60207+#if REISER4_DEBUG
60208+ struct super_block *super = reiser4_get_current_sb();
60209+
60210+ bmap_nr_t bmap;
60211+ bmap_off_t start_offset;
60212+ bmap_off_t end_offset;
60213+
60214+ struct bitmap_node *bnode;
60215+ int ret;
60216+
60217+ assert("zam-622", len != NULL);
60218+ check_block_range(start, len);
60219+ parse_blocknr(start, &bmap, &start_offset);
60220+
60221+ end_offset = start_offset + *len;
60222+ assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
60223+
60224+ bnode = get_bnode(super, bmap);
60225+
60226+ assert("nikita-2215", bnode != NULL);
60227+
60228+ ret = load_and_lock_bnode(bnode);
60229+ assert("zam-626", ret == 0);
60230+
60231+ assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
60232+
60233+ if (desired) {
60234+ assert("zam-623",
60235+ reiser4_find_next_zero_bit(bnode_working_data(bnode),
60236+ end_offset, start_offset)
60237+ >= end_offset);
60238+ } else {
60239+ assert("zam-624",
60240+ reiser4_find_next_set_bit(bnode_working_data(bnode),
60241+ end_offset, start_offset)
60242+ >= end_offset);
60243+ }
60244+
60245+ release_and_unlock_bnode(bnode);
60246+#endif
60247+}
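+
+/* In other words: under REISER4_DEBUG this asserts that every bit of the
+   range [*start, *start + *len) in the working bitmap matches @desired --
+   all set when @desired is non-zero, all clear otherwise. An illustrative
+   call (with hypothetical variables blk and one == 1):
+
+	reiser4_check_blocks_bitmap(&blk, &one, 1);
+*/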
60248+
60249+/* conditional insertion of @node into atom's overwrite set if it was not there */
60250+static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
60251+{
60252+ assert("zam-546", atom != NULL);
60253+ assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
60254+ assert("zam-548", node != NULL);
60255+
60256+ spin_lock_atom(atom);
60257+ spin_lock_jnode(node);
60258+
60259+ if (node->atom == NULL) {
60260+ JF_SET(node, JNODE_OVRWR);
60261+ insert_into_atom_ovrwr_list(atom, node);
60262+ } else {
60263+ assert("zam-549", node->atom == atom);
60264+ }
60265+
60266+ spin_unlock_jnode(node);
60267+ spin_unlock_atom(atom);
60268+}
60269+
60270+/* an actor which applies the delete set to COMMIT bitmap pages and links
60271+   modified pages into a singly-linked list */
60272+static int
60273+apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
60274+ const reiser4_block_nr * len, void *data)
60275+{
60276+
60277+ bmap_nr_t bmap;
60278+ bmap_off_t offset;
60279+ int ret;
60280+
60281+ long long *blocks_freed_p = data;
60282+
60283+ struct bitmap_node *bnode;
60284+
60285+ struct super_block *sb = reiser4_get_current_sb();
60286+
60287+ check_block_range(start, len);
60288+
60289+ parse_blocknr(start, &bmap, &offset);
60290+
60291+	/* FIXME-ZAM: we assume that all block ranges are allocated by this
60292+	   bitmap-based allocator and that no block range crosses the
60293+	   responsibility zone of one bitmap block; the same assumption is
60294+	   used in other journal hooks in the bitmap code. */
60295+ bnode = get_bnode(sb, bmap);
60296+ assert("zam-448", bnode != NULL);
60297+
60298+	/* it is safe to unlock an atom which is in ASTAGE_PRE_COMMIT */
60299+ assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
60300+ ret = load_and_lock_bnode(bnode);
60301+ if (ret)
60302+ return ret;
60303+
60304+ /* put bnode into atom's overwrite set */
60305+ cond_add_to_overwrite_set(atom, bnode->cjnode);
60306+
60307+ data = bnode_commit_data(bnode);
60308+
60309+ ret = bnode_check_crc(bnode);
60310+ if (ret != 0)
60311+ return ret;
60312+
60313+ if (len != NULL) {
60314+ /* FIXME-ZAM: a check that all bits are set should be there */
60315+ assert("zam-443",
60316+ offset + *len <= bmap_bit_count(sb->s_blocksize));
60317+ reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
60318+
60319+ (*blocks_freed_p) += *len;
60320+ } else {
60321+ reiser4_clear_bit(offset, data);
60322+ (*blocks_freed_p)++;
60323+ }
60324+
60325+ bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
60326+
60327+ release_and_unlock_bnode(bnode);
60328+
60329+ return 0;
60330+}
60331+
60332+/* plugin->u.space_allocator.pre_commit_hook(). */
60333+/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
60334+ rest is done by transaction manager (allocate wandered locations for COMMIT
60335+ BITMAP blocks, copy COMMIT BITMAP blocks data). */
60336+/* Only one instance of this function can be running at any given time, because
60337+   only one transaction can be committed at a time; therefore it is safe to
60338+   access some global variables without any locking */
60339+
60340+int reiser4_pre_commit_hook_bitmap(void)
60341+{
60342+ struct super_block *super = reiser4_get_current_sb();
60343+ txn_atom *atom;
60344+
60345+ long long blocks_freed = 0;
60346+
60347+ atom = get_current_atom_locked();
60348+ assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
60349+ spin_unlock_atom(atom);
60350+
60351+	{ /* scan atom's captured list and find all freshly allocated nodes,
60352+	   * mark the corresponding bits in the COMMIT BITMAP as used */
60353+ struct list_head *head = ATOM_CLEAN_LIST(atom);
60354+ jnode *node = list_entry(head->next, jnode, capture_link);
60355+
60356+ while (head != &node->capture_link) {
60357+ /* we detect freshly allocated jnodes */
60358+ if (JF_ISSET(node, JNODE_RELOC)) {
60359+ int ret;
60360+ bmap_nr_t bmap;
60361+
60362+ bmap_off_t offset;
60363+ bmap_off_t index;
60364+ struct bitmap_node *bn;
60365+ __u32 size = bmap_size(super->s_blocksize);
60366+ __u32 crc;
60367+ char byte;
60368+
60369+ assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
60370+ assert("zam-460",
60371+ !reiser4_blocknr_is_fake(&node->blocknr));
60372+
60373+ parse_blocknr(&node->blocknr, &bmap, &offset);
60374+ bn = get_bnode(super, bmap);
60375+
60376+ index = offset >> 3;
60377+ assert("vpf-276", index < size);
60378+
60379+				ret = bnode_check_crc(bn);
60380+ if (ret != 0)
60381+ return ret;
60382+
60383+ check_bnode_loaded(bn);
60384+ load_and_lock_bnode(bn);
60385+
60386+ byte = *(bnode_commit_data(bn) + index);
60387+ reiser4_set_bit(offset, bnode_commit_data(bn));
60388+
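+				/* recompute the commit bitmap checksum
+				 * incrementally: adler32_recalc() is given the
+				 * old crc, the old and new values of the byte
+				 * at @index, and the number of bytes from
+				 * @index to the end of the bitmap */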
60389+				crc = adler32_recalc(bnode_commit_crc(bn), byte,
60390+						     *(bnode_commit_data(bn) +
60391+						       index),
60392+						     size - index);
60393+				bnode_set_commit_crc(bn, crc);
60394+
60395+ release_and_unlock_bnode(bn);
60396+
60397+ ret = bnode_check_crc(bn);
60398+ if (ret != 0)
60399+ return ret;
60400+
60401+				/* correctness of this depends on how the new
60402+				   jnode is inserted into the clean list,
60403+				   because we are scanning that same list now.
60404+				   It is OK if insertion is done at the list front */
60405+ cond_add_to_overwrite_set(atom, bn->cjnode);
60406+ }
60407+
60408+ node = list_entry(node->capture_link.next, jnode, capture_link);
60409+ }
60410+ }
60411+
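+	/* apply the atom's delete set to the COMMIT BITMAP; each range in the
+	 * set clears the corresponding bits and advances blocks_freed via
+	 * apply_dset_to_commit_bmap() */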
60412+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
60413+ &blocks_freed, 0);
60414+
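+	/* the net change in free space is what the delete set freed minus
+	 * what this atom allocated */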
60415+ blocks_freed -= atom->nr_blocks_allocated;
60416+
60417+ {
60418+ reiser4_super_info_data *sbinfo;
60419+
60420+ sbinfo = get_super_private(super);
60421+
60422+ spin_lock_reiser4_super(sbinfo);
60423+ sbinfo->blocks_free_committed += blocks_freed;
60424+ spin_unlock_reiser4_super(sbinfo);
60425+ }
60426+
60427+ return 0;
60428+}
60429+
60430+/* plugin->u.space_allocator.init_allocator
60431+ constructor of reiser4_space_allocator object. It is called on fs mount */
60432+int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator,
60433+ struct super_block *super, void *arg)
60434+{
60435+ struct bitmap_allocator_data *data = NULL;
60436+ bmap_nr_t bitmap_blocks_nr;
60437+ bmap_nr_t i;
60438+
60439+ assert("nikita-3039", reiser4_schedulable());
60440+
60441+ /* getting memory for bitmap allocator private data holder */
60442+ data =
60443+ kmalloc(sizeof(struct bitmap_allocator_data),
60444+ reiser4_ctx_gfp_mask_get());
60445+
60446+ if (data == NULL)
60447+ return RETERR(-ENOMEM);
60448+
60449+ /* allocation and initialization for the array of bnodes */
60450+ bitmap_blocks_nr = get_nr_bmap(super);
60451+
60452+ /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps
60453+ which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
60454+ may I never meet someone who still uses the ia32 architecture when
60455+ storage devices of that size enter the market, and wants to use ia32
60456+ with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and,
60457+ probably, another dynamic data structure should replace a static
60458+ array of bnodes. */
60459+ /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */
60460+ data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
60461+ if (data->bitmap == NULL) {
60462+ kfree(data);
60463+ return RETERR(-ENOMEM);
60464+ }
60465+
60466+ for (i = 0; i < bitmap_blocks_nr; i++)
60467+ init_bnode(data->bitmap + i, super, i);
60468+
60469+ allocator->u.generic = data;
60470+
60471+#if REISER4_DEBUG
60472+ get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
60473+#endif
60474+
60475+ /* Load all bitmap blocks at mount time. */
60476+ if (!test_bit
60477+ (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
60478+ __u64 start_time, elapsed_time;
60479+ struct bitmap_node *bnode;
60480+ int ret;
60481+
60482+ if (REISER4_DEBUG)
60483+ printk(KERN_INFO "loading reiser4 bitmap...");
60484+ start_time = jiffies;
60485+
60486+ for (i = 0; i < bitmap_blocks_nr; i++) {
60487+ bnode = data->bitmap + i;
60488+ ret = load_and_lock_bnode(bnode);
60489+ if (ret) {
60490+ reiser4_destroy_allocator_bitmap(allocator,
60491+ super);
60492+ return ret;
60493+ }
60494+ release_and_unlock_bnode(bnode);
60495+ }
60496+
60497+ elapsed_time = jiffies - start_time;
60498+ if (REISER4_DEBUG)
60499+ printk("...done (%llu jiffies)\n",
60500+ (unsigned long long)elapsed_time);
60501+ }
60502+
60503+ return 0;
60504+}
60505+
60506+/* plugin->u.space_allocator.destroy_allocator
60507+ destructor. It is called on fs unmount */
60508+int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
60509+ struct super_block *super)
60510+{
60511+ bmap_nr_t bitmap_blocks_nr;
60512+ bmap_nr_t i;
60513+
60514+ struct bitmap_allocator_data *data = allocator->u.generic;
60515+
60516+ assert("zam-414", data != NULL);
60517+ assert("zam-376", data->bitmap != NULL);
60518+
60519+ bitmap_blocks_nr = get_nr_bmap(super);
60520+
60521+ for (i = 0; i < bitmap_blocks_nr; i++) {
60522+ struct bitmap_node *bnode = data->bitmap + i;
60523+
60524+ mutex_lock(&bnode->mutex);
60525+
60526+#if REISER4_DEBUG
60527+ if (atomic_read(&bnode->loaded)) {
60528+ jnode *wj = bnode->wjnode;
60529+ jnode *cj = bnode->cjnode;
60530+
60531+ assert("zam-480", jnode_page(cj) != NULL);
60532+ assert("zam-633", jnode_page(wj) != NULL);
60533+
60534+ assert("zam-634",
60535+			       memcmp(jdata(wj), jdata(cj),
60536+ bmap_size(super->s_blocksize)) == 0);
60537+
60538+ }
60539+#endif
60540+ done_bnode(bnode);
60541+ mutex_unlock(&bnode->mutex);
60542+ }
60543+
60544+ vfree(data->bitmap);
60545+ kfree(data);
60546+
60547+ allocator->u.generic = NULL;
60548+
60549+ return 0;
60550+}
60551+
60552+/*
60553+ * Local variables:
60554+ * c-indentation-style: "K&R"
60555+ * mode-name: "LC"
60556+ * c-basic-offset: 8
60557+ * tab-width: 8
60558+ * fill-column: 79
60559+ * scroll-step: 1
60560+ * End:
60561+ */
60562diff -urN linux-2.6.23.orig/fs/reiser4/plugin/space/bitmap.h linux-2.6.23/fs/reiser4/plugin/space/bitmap.h
60563--- linux-2.6.23.orig/fs/reiser4/plugin/space/bitmap.h 1970-01-01 03:00:00.000000000 +0300
60564+++ linux-2.6.23/fs/reiser4/plugin/space/bitmap.h 2007-12-04 16:49:30.000000000 +0300
60565@@ -0,0 +1,47 @@
60566+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60567+
60568+#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
60569+#define __REISER4_PLUGIN_SPACE_BITMAP_H__
60570+
60571+#include "../../dformat.h"
60572+#include "../../block_alloc.h"
60573+
60574+#include <linux/types.h> /* for __u?? */
60575+#include <linux/fs.h> /* for struct super_block */
60576+/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
60577+/* declarations of functions implementing methods of space allocator plugin for
60578+ bitmap based allocator. The functions themselves are in bitmap.c */
60579+extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
60580+ struct super_block *, void *);
60581+extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
60582+ struct super_block *);
60583+extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
60584+ reiser4_blocknr_hint *, int needed,
60585+ reiser4_block_nr * start,
60586+ reiser4_block_nr * len);
60587+extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
60588+ const reiser4_block_nr *, int);
60589+extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
60590+ reiser4_block_nr,
60591+ reiser4_block_nr);
60592+extern int reiser4_pre_commit_hook_bitmap(void);
60593+
60594+#define reiser4_post_commit_hook_bitmap() do{}while(0)
60595+#define reiser4_post_write_back_hook_bitmap() do{}while(0)
60596+#define reiser4_print_info_bitmap(pref, al) do{}while(0)
60597+
60598+typedef __u64 bmap_nr_t;
60599+typedef __u32 bmap_off_t;
60600+
60601+#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
60602+
60603+/* Make Linus happy.
60604+ Local variables:
60605+ c-indentation-style: "K&R"
60606+ mode-name: "LC"
60607+ c-basic-offset: 8
60608+ tab-width: 8
60609+ fill-column: 120
60610+ scroll-step: 1
60611+ End:
60612+*/
60613diff -urN linux-2.6.23.orig/fs/reiser4/plugin/space/Makefile linux-2.6.23/fs/reiser4/plugin/space/Makefile
60614--- linux-2.6.23.orig/fs/reiser4/plugin/space/Makefile 1970-01-01 03:00:00.000000000 +0300
60615+++ linux-2.6.23/fs/reiser4/plugin/space/Makefile 2007-12-04 16:49:30.000000000 +0300
60616@@ -0,0 +1,4 @@
60617+obj-$(CONFIG_REISER4_FS) += space_plugins.o
60618+
60619+space_plugins-objs := \
60620+ bitmap.o
60621diff -urN linux-2.6.23.orig/fs/reiser4/plugin/space/space_allocator.h linux-2.6.23/fs/reiser4/plugin/space/space_allocator.h
60622--- linux-2.6.23.orig/fs/reiser4/plugin/space/space_allocator.h 1970-01-01 03:00:00.000000000 +0300
60623+++ linux-2.6.23/fs/reiser4/plugin/space/space_allocator.h 2007-12-04 16:49:30.000000000 +0300
60624@@ -0,0 +1,80 @@
60625+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60626+
60627+#ifndef __SPACE_ALLOCATOR_H__
60628+#define __SPACE_ALLOCATOR_H__
60629+
60630+#include "../../forward.h"
60631+#include "bitmap.h"
60632+/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
60633+ * but... */
60634+#define DEF_SPACE_ALLOCATOR(allocator) \
60635+ \
60636+static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \
60637+{ \
60638+ return reiser4_init_allocator_##allocator (al, s, opaque); \
60639+} \
60640+ \
60641+static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \
60642+{ \
60643+ reiser4_destroy_allocator_##allocator (al, s); \
60644+} \
60645+ \
60646+static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \
60647+ int needed, reiser4_block_nr * start, reiser4_block_nr * len) \
60648+{ \
60649+ return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len); \
60650+} \
60651+static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \
60652+{ \
60653+ reiser4_dealloc_blocks_##allocator (al, start, len); \
60654+} \
60655+ \
60656+static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \
60657+{ \
60658+ reiser4_check_blocks_##allocator (start, end, desired); \
60659+} \
60660+ \
60661+static inline void sa_pre_commit_hook (void) \
60662+{ \
60663+ reiser4_pre_commit_hook_##allocator (); \
60664+} \
60665+ \
60666+static inline void sa_post_commit_hook (void) \
60667+{ \
60668+ reiser4_post_commit_hook_##allocator (); \
60669+} \
60670+ \
60671+static inline void sa_post_write_back_hook (void) \
60672+{ \
60673+ reiser4_post_write_back_hook_##allocator(); \
60674+} \
60675+ \
60676+static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \
60677+{ \
60678+ reiser4_print_info_##allocator (prefix, al); \
60679+}
60680+
60681+DEF_SPACE_ALLOCATOR(bitmap)
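+
+/* For instance, the instantiation above expands sa_alloc_blocks() into a
+   thin wrapper around the bitmap allocator (sketch of the generated code,
+   shown for illustration only):
+
+	static inline int sa_alloc_blocks(reiser4_space_allocator *al,
+					  reiser4_blocknr_hint *hint,
+					  int needed, reiser4_block_nr *start,
+					  reiser4_block_nr *len)
+	{
+		return reiser4_alloc_blocks_bitmap(al, hint, needed, start,
+						   len);
+	}
+*/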
60682+
60683+/* this object is part of reiser4 private in-core super block */
60684+struct reiser4_space_allocator {
60685+ union {
60686+ /* space allocators might use this pointer to reference their
60687+ * data. */
60688+ void *generic;
60689+ } u;
60690+};
60691+
60692+/* __SPACE_ALLOCATOR_H__ */
60693+#endif
60694+
60695+/* Make Linus happy.
60696+ Local variables:
60697+ c-indentation-style: "K&R"
60698+ mode-name: "LC"
60699+ c-basic-offset: 8
60700+ tab-width: 8
60701+ fill-column: 120
60702+ scroll-step: 1
60703+ End:
60704+*/
60705diff -urN linux-2.6.23.orig/fs/reiser4/plugin/tail_policy.c linux-2.6.23/fs/reiser4/plugin/tail_policy.c
60706--- linux-2.6.23.orig/fs/reiser4/plugin/tail_policy.c 1970-01-01 03:00:00.000000000 +0300
60707+++ linux-2.6.23/fs/reiser4/plugin/tail_policy.c 2007-12-04 16:49:30.000000000 +0300
60708@@ -0,0 +1,113 @@
60709+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60710+ * reiser4/README */
60711+
60712+/* Formatting policy plugins */
60713+
60714+/*
60715+ * A formatting policy plugin is used by the object plugin (of a regular
60716+ * file) to decide when to convert a file between its two representations.
60717+ *
60718+ * Currently the following policies are implemented:
60719+ * never store file in formatted nodes
60720+ * always store file in formatted nodes
60721+ * store file in formatted nodes if file is smaller than 4 blocks (default)
60722+ */
60723+
60724+#include "../tree.h"
60725+#include "../inode.h"
60726+#include "../super.h"
60727+#include "object.h"
60728+#include "plugin.h"
60729+#include "node/node.h"
60730+#include "plugin_header.h"
60731+
60732+#include <linux/pagemap.h>
60733+#include <linux/fs.h> /* For struct inode */
60734+
60735+/**
60736+ * have_formatting_never - formatting policy that never formats tails
60737+ * @inode: inode to operate on
60738+ * @size: new object size
60739+ *
60740+ * Always returns 0: the file's body is never stored in tail (direct) items.
60741+ */
60742+/* Never store file's tail as direct item */
60743+/* Audited by: green(2002.06.12) */
60744+static int have_formatting_never(const struct inode *inode UNUSED_ARG
60745+ /* inode to operate on */ ,
60746+ loff_t size UNUSED_ARG /* new object size */ )
60747+{
60748+ return 0;
60749+}
60750+
60751+/* Always store file's tail as direct item */
60752+/* Audited by: green(2002.06.12) */
60753+static int
60754+have_formatting_always(const struct inode *inode UNUSED_ARG
60755+ /* inode to operate on */ ,
60756+ loff_t size UNUSED_ARG /* new object size */ )
60757+{
60758+ return 1;
60759+}
60760+
60761+/* This function tests whether the file denoted by @inode should be stored
60762+   in tail items only or in extents only. */
60763+static int
60764+have_formatting_default(const struct inode *inode UNUSED_ARG
60765+ /* inode to operate on */ ,
60766+ loff_t size /* new object size */ )
60767+{
60768+ assert("umka-1253", inode != NULL);
60769+
60770+ if (size > inode->i_sb->s_blocksize * 4)
60771+ return 0;
60772+
60773+ return 1;
60774+}
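+
+/* For example, on a file system with 4096-byte blocks the default policy
+   keeps files of up to 4 * 4096 == 16384 bytes in tail items and switches
+   larger files to extents (illustrative arithmetic only). */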
60775+
60776+/* tail plugins */
60777+formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
60778+ [NEVER_TAILS_FORMATTING_ID] = {
60779+ .h = {
60780+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60781+ .id = NEVER_TAILS_FORMATTING_ID,
60782+ .pops = NULL,
60783+ .label = "never",
60784+ .desc = "Never store file's tail",
60785+ .linkage = {NULL, NULL}
60786+ },
60787+ .have_tail = have_formatting_never
60788+ },
60789+ [ALWAYS_TAILS_FORMATTING_ID] = {
60790+ .h = {
60791+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60792+ .id = ALWAYS_TAILS_FORMATTING_ID,
60793+ .pops = NULL,
60794+ .label = "always",
60795+ .desc = "Always store file's tail",
60796+ .linkage = {NULL, NULL}
60797+ },
60798+ .have_tail = have_formatting_always
60799+ },
60800+ [SMALL_FILE_FORMATTING_ID] = {
60801+ .h = {
60802+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60803+ .id = SMALL_FILE_FORMATTING_ID,
60804+ .pops = NULL,
60805+ .label = "4blocks",
60806+ .desc = "store files shorter than 4 blocks in tail items",
60807+ .linkage = {NULL, NULL}
60808+ },
60809+ .have_tail = have_formatting_default
60810+ }
60811+};
60812+
60813+/*
60814+ * Local variables:
60815+ * c-indentation-style: "K&R"
60816+ * mode-name: "LC"
60817+ * c-basic-offset: 8
60818+ * tab-width: 8
60819+ * fill-column: 79
60820+ * End:
60821+ */
60822diff -urN linux-2.6.23.orig/fs/reiser4/pool.c linux-2.6.23/fs/reiser4/pool.c
60823--- linux-2.6.23.orig/fs/reiser4/pool.c 1970-01-01 03:00:00.000000000 +0300
60824+++ linux-2.6.23/fs/reiser4/pool.c 2007-12-04 16:49:30.000000000 +0300
60825@@ -0,0 +1,231 @@
60826+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60827+ * reiser4/README */
60828+
60829+/* Fast pool allocation.
60830+
60831+   There are situations when some sub-system normally asks the memory
60832+   allocator for only a few objects, but under some circumstances could
60833+   require many more. The typical and actually motivating example is tree
60834+   balancing. It needs to keep track of the nodes involved, and it is
60835+   well-known that in a reasonably packed balanced tree most (92.938121%)
60836+   balancings end up touching only a few nodes (3.141592 on average). But
60837+   in rare cases balancing can involve many more nodes (3*tree_height+1 in
60838+   the extremal situation).
60839+
60840+ On the one hand, we don't want to resort to dynamic allocation (slab,
60841+ malloc(), etc.) to allocate data structures required to keep track of
60842+ nodes during balancing. On the other hand, we cannot statically allocate
60843+   the required amount of space on the stack: first, it is a useless waste
60844+   of a precious resource, and second, this amount is unknown in advance
60845+   (tree height can change).
60846+
60847+ Pools, implemented in this file are solution for this problem:
60848+
60849+ - some configurable amount of objects is statically preallocated on the
60850+ stack
60851+
60852+   - if this preallocated pool is exhausted and more objects are requested,
60853+   they are allocated dynamically.
60854+
60855+ Pools encapsulate distinction between statically and dynamically allocated
60856+ objects. Both allocation and recycling look exactly the same.
60857+
60858+ To keep track of dynamically allocated objects, pool adds its own linkage
60859+ to each object.
60860+
60861+ NOTE-NIKITA This linkage also contains some balancing-specific data. This
60862+ is not perfect. On the other hand, balancing is currently the only client
60863+ of pool code.
60864+
60865+ NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
60866+ functions in the style of tslist/tshash, i.e., make them unreadable, but
60867+ type-safe.
60868+
60869+*/
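+
+/* A minimal usage sketch (hypothetical client code, not part of this file;
+   struct my_obj and MY_POOL_SIZE are made-up names). A client embeds
+   struct reiser4_pool_header as the first member of its object type,
+   preallocates an array of objects on the stack and hands it to the pool:
+
+	struct my_obj {
+		struct reiser4_pool_header header;	// must come first
+		int payload;
+	};
+
+	struct reiser4_pool pool;
+	struct my_obj prealloc[MY_POOL_SIZE];		// on-stack storage
+
+	reiser4_init_pool(&pool, sizeof(struct my_obj), MY_POOL_SIZE,
+			  (char *)prealloc);
+
+   Objects are then obtained through reiser4_add_obj() and returned with
+   reiser4_pool_free(&pool, &obj->header). */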
60870+
60871+#include "debug.h"
60872+#include "pool.h"
60873+#include "super.h"
60874+
60875+#include <linux/types.h>
60876+#include <linux/err.h>
60877+
60878+/* initialize new pool object @h */
60879+static void reiser4_init_pool_obj(struct reiser4_pool_header * h)
60880+{
60881+ INIT_LIST_HEAD(&h->usage_linkage);
60882+ INIT_LIST_HEAD(&h->level_linkage);
60883+ INIT_LIST_HEAD(&h->extra_linkage);
60884+}
60885+
60886+/* initialize new pool */
60887+void reiser4_init_pool(struct reiser4_pool * pool /* pool to initialize */ ,
60888+ size_t obj_size /* size of objects in @pool */ ,
60889+ int num_of_objs /* number of preallocated objects */ ,
60890+ char *data /* area for preallocated objects */ )
60891+{
60892+ struct reiser4_pool_header *h;
60893+ int i;
60894+
60895+ assert("nikita-955", pool != NULL);
60896+ assert("nikita-1044", obj_size > 0);
60897+ assert("nikita-956", num_of_objs >= 0);
60898+ assert("nikita-957", data != NULL);
60899+
60900+ memset(pool, 0, sizeof *pool);
60901+ pool->obj_size = obj_size;
60902+ pool->data = data;
60903+ INIT_LIST_HEAD(&pool->free);
60904+ INIT_LIST_HEAD(&pool->used);
60905+ INIT_LIST_HEAD(&pool->extra);
60906+ memset(data, 0, obj_size * num_of_objs);
60907+ for (i = 0; i < num_of_objs; ++i) {
60908+ h = (struct reiser4_pool_header *) (data + i * obj_size);
60909+ reiser4_init_pool_obj(h);
60910+ /* add pool header to the end of pool's free list */
60911+ list_add_tail(&h->usage_linkage, &pool->free);
60912+ }
60913+}
60914+
60915+/* release pool resources
60916+
60917+ Release all resources acquired by this pool, specifically, dynamically
60918+ allocated objects.
60919+
60920+*/
60921+void reiser4_done_pool(struct reiser4_pool * pool UNUSED_ARG)
60922+{
60923+}
60924+
60925+/* allocate carry object from @pool
60926+
60927+ First, try to get preallocated object. If this fails, resort to dynamic
60928+ allocation.
60929+
60930+*/
60931+static void *reiser4_pool_alloc(struct reiser4_pool * pool)
60932+{
60933+ struct reiser4_pool_header *result;
60934+
60935+ assert("nikita-959", pool != NULL);
60936+
60937+ if (!list_empty(&pool->free)) {
60938+ struct list_head *linkage;
60939+
60940+ linkage = pool->free.next;
60941+ list_del(linkage);
60942+ INIT_LIST_HEAD(linkage);
60943+ result = list_entry(linkage, struct reiser4_pool_header,
60944+ usage_linkage);
60945+ BUG_ON(!list_empty(&result->level_linkage) ||
60946+ !list_empty(&result->extra_linkage));
60947+ } else {
60948+ /* pool is empty. Extra allocations don't deserve dedicated
60949+ slab to be served from, as they are expected to be rare. */
60950+ result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get());
60951+		if (result != NULL) {
60952+ reiser4_init_pool_obj(result);
60953+ list_add(&result->extra_linkage, &pool->extra);
60954+ } else
60955+ return ERR_PTR(RETERR(-ENOMEM));
60956+ BUG_ON(!list_empty(&result->usage_linkage) ||
60957+ !list_empty(&result->level_linkage));
60958+ }
60959+ ++pool->objs;
60960+ list_add(&result->usage_linkage, &pool->used);
60961+ memset(result + 1, 0, pool->obj_size - sizeof *result);
60962+ return result;
60963+}
60964+
60965+/* return object back to the pool */
60966+void reiser4_pool_free(struct reiser4_pool * pool,
60967+ struct reiser4_pool_header * h)
60968+{
60969+ assert("nikita-961", h != NULL);
60970+ assert("nikita-962", pool != NULL);
60971+
60972+ --pool->objs;
60973+ assert("nikita-963", pool->objs >= 0);
60974+
60975+ list_del_init(&h->usage_linkage);
60976+ list_del_init(&h->level_linkage);
60977+
60978+ if (list_empty(&h->extra_linkage))
60979+ /*
60980+ * pool header is not an extra one. Push it onto free list
60981+ * using usage_linkage
60982+ */
60983+ list_add(&h->usage_linkage, &pool->free);
60984+ else {
60985+ /* remove pool header from pool's extra list and kfree it */
60986+ list_del(&h->extra_linkage);
60987+ kfree(h);
60988+ }
60989+}
60990+
60991+/* add new object to the carry level list
60992+
60993+   The carry level is FIFO most of the time, but not always. Complications
60994+   arise when the make_space() function tries to go to the left neighbor and
60995+   thus adds a carry node before existing nodes, and also when updating
60996+   delimiting keys after moving data between two nodes, where we want the
60997+   left node to be locked before the right node.
60998+
60999+   The latter case is confusing at first glance. The problem is that the
61000+   COP_UPDATE operation that updates delimiting keys is sometimes called
61001+   with two nodes (when data are moved between two nodes) and sometimes
61002+   with only one node (when the leftmost item is deleted in a node). In any
61003+   case the operation is supplied with at least the node whose left
61004+   delimiting key is to be updated (that is, the "right" node).
61005+
61006+ @pool - from which to allocate new object;
61007+ @list - where to add object;
61008+ @reference - after (or before) which existing object to add
61009+*/
61010+struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool * pool,
61011+ struct list_head *list,
61012+ pool_ordering order,
61013+ struct reiser4_pool_header * reference)
61014+{
61015+ struct reiser4_pool_header *result;
61016+
61017+ assert("nikita-972", pool != NULL);
61018+
61019+ result = reiser4_pool_alloc(pool);
61020+ if (IS_ERR(result))
61021+ return result;
61022+
61023+ assert("nikita-973", result != NULL);
61024+
61025+ switch (order) {
61026+ case POOLO_BEFORE:
61027+ __list_add(&result->level_linkage,
61028+ reference->level_linkage.prev,
61029+ &reference->level_linkage);
61030+ break;
61031+ case POOLO_AFTER:
61032+ __list_add(&result->level_linkage,
61033+ &reference->level_linkage,
61034+ reference->level_linkage.next);
61035+ break;
61036+ case POOLO_LAST:
61037+ list_add_tail(&result->level_linkage, list);
61038+ break;
61039+ case POOLO_FIRST:
61040+ list_add(&result->level_linkage, list);
61041+ break;
61042+ default:
61043+ wrong_return_value("nikita-927", "order");
61044+ }
61045+ return result;
61046+}
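+
+/* An illustrative call (hypothetical variables): allocate a new object and
+   queue it immediately before an existing @ref on the @level_list:
+
+	struct reiser4_pool_header *h;
+
+	h = reiser4_add_obj(&pool, &level_list, POOLO_BEFORE, &ref->header);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+*/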
61047+
61048+/* Make Linus happy.
61049+ Local variables:
61050+ c-indentation-style: "K&R"
61051+ mode-name: "LC"
61052+ c-basic-offset: 8
61053+ tab-width: 8
61054+ fill-column: 120
61055+ End:
61056+*/
61057diff -urN linux-2.6.23.orig/fs/reiser4/pool.h linux-2.6.23/fs/reiser4/pool.h
61058--- linux-2.6.23.orig/fs/reiser4/pool.h 1970-01-01 03:00:00.000000000 +0300
61059+++ linux-2.6.23/fs/reiser4/pool.h 2007-12-04 16:49:30.000000000 +0300
61060@@ -0,0 +1,56 @@
61061+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61062+
61063+/* Fast pool allocation */
61064+
61065+#ifndef __REISER4_POOL_H__
61066+#define __REISER4_POOL_H__
61067+
61068+#include <linux/types.h>
61069+
61070+struct reiser4_pool {
61071+ size_t obj_size;
61072+ int objs;
61073+ char *data;
61074+ struct list_head free;
61075+ struct list_head used;
61076+ struct list_head extra;
61077+};
61078+
61079+struct reiser4_pool_header {
61080+ /* object is either on free or "used" lists */
61081+ struct list_head usage_linkage;
61082+ struct list_head level_linkage;
61083+ struct list_head extra_linkage;
61084+};
61085+
61086+typedef enum {
61087+ POOLO_BEFORE,
61088+ POOLO_AFTER,
61089+ POOLO_LAST,
61090+ POOLO_FIRST
61091+} pool_ordering;
61092+
61093+/* pool manipulation functions */
61094+
61095+extern void reiser4_init_pool(struct reiser4_pool * pool, size_t obj_size,
61096+ int num_of_objs, char *data);
61097+extern void reiser4_done_pool(struct reiser4_pool * pool);
61098+extern void reiser4_pool_free(struct reiser4_pool * pool,
61099+ struct reiser4_pool_header * h);
61100+struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool * pool,
61101+ struct list_head * list,
61102+ pool_ordering order,
61103+ struct reiser4_pool_header *reference);
61104+
61105+/* __REISER4_POOL_H__ */
61106+#endif
61107+
61108+/* Make Linus happy.
61109+ Local variables:
61110+ c-indentation-style: "K&R"
61111+ mode-name: "LC"
61112+ c-basic-offset: 8
61113+ tab-width: 8
61114+ fill-column: 120
61115+ End:
61116+*/
61117diff -urN linux-2.6.23.orig/fs/reiser4/readahead.c linux-2.6.23/fs/reiser4/readahead.c
61118--- linux-2.6.23.orig/fs/reiser4/readahead.c 1970-01-01 03:00:00.000000000 +0300
61119+++ linux-2.6.23/fs/reiser4/readahead.c 2007-12-04 16:49:30.000000000 +0300
61120@@ -0,0 +1,138 @@
61121+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61122+ * reiser4/README */
61123+
61124+#include "forward.h"
61125+#include "tree.h"
61126+#include "tree_walk.h"
61127+#include "super.h"
61128+#include "inode.h"
61129+#include "key.h"
61130+#include "znode.h"
61131+
61132+#include <linux/swap.h> /* for totalram_pages */
61133+
61134+void reiser4_init_ra_info(ra_info_t * rai)
61135+{
61136+ rai->key_to_stop = *reiser4_min_key();
61137+}
61138+
61139+/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */
61140+static inline int ra_adjacent_only(int flags)
61141+{
61142+ return flags & RA_ADJACENT_ONLY;
61143+}
61144+
61145+/* this is used by formatted_readahead to decide whether a read is to be issued for the right neighbor of a node.
61146+   It returns 1 if the right neighbor's first key is less than or equal to the readahead's stop key */
61147+static int should_readahead_neighbor(znode * node, ra_info_t * info)
61148+{
61149+ int result;
61150+
61151+ read_lock_dk(znode_get_tree(node));
61152+ result = keyle(znode_get_rd_key(node), &info->key_to_stop);
61153+ read_unlock_dk(znode_get_tree(node));
61154+ return result;
61155+}
61156+
61157+#define LOW_MEM_PERCENTAGE (5)
61158+
61159+static int low_on_memory(void)
61160+{
61161+ unsigned int freepages;
61162+
61163+ freepages = nr_free_pages();
61164+ return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
61165+}
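+
+/* For example, on a machine with totalram_pages == 262144 (1 GiB of 4 KiB
+   pages), readahead is suppressed once fewer than 262144 * 5 / 100 == 13107
+   pages (about 51 MiB) are free (illustrative arithmetic only). */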
61166+
61167+/* start read for @node and for a few of its right neighbors */
61168+void formatted_readahead(znode * node, ra_info_t * info)
61169+{
61170+ struct formatted_ra_params *ra_params;
61171+ znode *cur;
61172+ int i;
61173+ int grn_flags;
61174+ lock_handle next_lh;
61175+
61176+	/* do nothing if a block number has not been assigned to the node yet (which means it still exists only in cache). */
61177+ if (reiser4_blocknr_is_fake(znode_get_block(node)))
61178+ return;
61179+
61180+ ra_params = get_current_super_ra_params();
61181+
61182+ if (znode_page(node) == NULL)
61183+ jstartio(ZJNODE(node));
61184+
61185+ if (znode_get_level(node) != LEAF_LEVEL)
61186+ return;
61187+
61188+ /* don't waste memory for read-ahead when low on memory */
61189+ if (low_on_memory())
61190+ return;
61191+
61192+	/* We can have locked nodes on upper tree levels. In this situation lock
61193+	   priorities do not help to resolve deadlocks, so we have to use TRY_LOCK
61194+	   here. */
61195+ grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
61196+
61197+ i = 0;
61198+ cur = zref(node);
61199+ init_lh(&next_lh);
61200+ while (i < ra_params->max) {
61201+ const reiser4_block_nr *nextblk;
61202+
61203+ if (!should_readahead_neighbor(cur, info))
61204+ break;
61205+
61206+ if (reiser4_get_right_neighbor
61207+ (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
61208+ break;
61209+
61210+ nextblk = znode_get_block(next_lh.node);
61211+ if (reiser4_blocknr_is_fake(nextblk) ||
61212+ (ra_adjacent_only(ra_params->flags)
61213+ && *nextblk != *znode_get_block(cur) + 1)) {
61214+ break;
61215+ }
61216+
61217+ zput(cur);
61218+ cur = zref(next_lh.node);
61219+ done_lh(&next_lh);
61220+ if (znode_page(cur) == NULL)
61221+ jstartio(ZJNODE(cur));
61222+ else
61223+ /* Do not scan read-ahead window if pages already
61224+ * allocated (and i/o already started). */
61225+ break;
61226+
61227+ i++;
61228+ }
61229+ zput(cur);
61230+ done_lh(&next_lh);
61231+}
61232+
61233+void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap)
61234+{
61235+ reiser4_key *stop_key;
61236+
61237+ assert("nikita-3542", dir != NULL);
61238+ assert("nikita-3543", tap != NULL);
61239+
61240+ stop_key = &tap->ra_info.key_to_stop;
61241+	/* initialize readdir readahead information: include the stat-data
61242+	 * of all files of the directory into the readahead window */
61243+ set_key_locality(stop_key, get_inode_oid(dir));
61244+ set_key_type(stop_key, KEY_SD_MINOR);
61245+ set_key_ordering(stop_key, get_key_ordering(reiser4_max_key()));
61246+ set_key_objectid(stop_key, get_key_objectid(reiser4_max_key()));
61247+ set_key_offset(stop_key, get_key_offset(reiser4_max_key()));
61248+}
61249+
61250+/*
61251+ Local variables:
61252+ c-indentation-style: "K&R"
61253+ mode-name: "LC"
61254+ c-basic-offset: 8
61255+ tab-width: 8
61256+ fill-column: 80
61257+ End:
61258+*/
61259diff -urN linux-2.6.23.orig/fs/reiser4/readahead.h linux-2.6.23/fs/reiser4/readahead.h
61260--- linux-2.6.23.orig/fs/reiser4/readahead.h 1970-01-01 03:00:00.000000000 +0300
61261+++ linux-2.6.23/fs/reiser4/readahead.h 2007-12-04 16:49:30.000000000 +0300
61262@@ -0,0 +1,51 @@
61263+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61264+
61265+#ifndef __READAHEAD_H__
61266+#define __READAHEAD_H__
61267+
61268+#include "key.h"
61269+
61270+typedef enum {
61271+ RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent.
61272+ Default is NO (not only adjacent) */
61273+} ra_global_flags;
61274+
61275+/* reiser4 super block has a field of this type.
61276+ It controls readahead during tree traversals */
61277+struct formatted_ra_params {
61278+ unsigned long max; /* request not more than this amount of nodes.
61279+ Default is totalram_pages / 4 */
61280+ int flags;
61281+};
61282+
61283+typedef struct {
61284+ reiser4_key key_to_stop;
61285+} ra_info_t;
61286+
61287+void formatted_readahead(znode *, ra_info_t *);
61288+void reiser4_init_ra_info(ra_info_t * rai);
61289+
61290+struct reiser4_file_ra_state {
61291+ loff_t start; /* Current window */
61292+ loff_t size;
61293+ loff_t next_size; /* Next window size */
61294+ loff_t ahead_start; /* Ahead window */
61295+ loff_t ahead_size;
61296+ loff_t max_window_size; /* Maximum readahead window */
61297+ loff_t slow_start; /* enlarging r/a size algorithm. */
61298+};
61299+
61300+extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap);
61301+
61302+/* __READAHEAD_H__ */
61303+#endif
61304+
61305+/*
61306+ Local variables:
61307+ c-indentation-style: "K&R"
61308+ mode-name: "LC"
61309+ c-basic-offset: 8
61310+ tab-width: 8
61311+ fill-column: 120
61312+ End:
61313+*/
61314diff -urN linux-2.6.23.orig/fs/reiser4/README linux-2.6.23/fs/reiser4/README
61315--- linux-2.6.23.orig/fs/reiser4/README 1970-01-01 03:00:00.000000000 +0300
61316+++ linux-2.6.23/fs/reiser4/README 2007-12-04 16:49:30.000000000 +0300
61317@@ -0,0 +1,128 @@
61318+[LICENSING]
61319+
61320+Reiser4 is hereby licensed under the GNU General
61321+Public License version 2.
61322+
61323+Source code files that contain the phrase "licensing governed by
61324+reiser4/README" are "governed files" throughout this file. Governed
61325+files are licensed under the GPL. The portions of them owned by Hans
61326+Reiser, or authorized to be licensed by him, have been in the past,
61327+and likely will be in the future, licensed to other parties under
61328+other licenses. If you add your code to governed files, and don't
61329+want it to be owned by Hans Reiser, put your copyright label on that
61330+code so the poor blight and his customers can keep things straight.
61331+All portions of governed files not labeled otherwise are owned by Hans
61332+Reiser, and by adding your code to it, widely distributing it to
61333+others or sending us a patch, and leaving the sentence in stating that
61334+licensing is governed by the statement in this file, you accept this.
61335+It will be a kindness if you identify whether Hans Reiser is allowed
61336+to license code labeled as owned by you on your behalf other than
61337+under the GPL, because he wants to know if it is okay to do so and put
61338+a check in the mail to you (for non-trivial improvements) when he
61339+makes his next sale. He makes no guarantees as to the amount if any,
61340+though he feels motivated to motivate contributors, and you can surely
61341+discuss this with him before or after contributing. You have the
61342+right to decline to allow him to license your code contribution other
61343+than under the GPL.
61344+
61345+Further licensing options are available for commercial and/or other
61346+interests directly from Hans Reiser: reiser@namesys.com. If you interpret
61347+the GPL as not allowing those additional licensing options, you read
61348+it wrongly, and Richard Stallman agrees with me, when carefully read
61349+you can see that those restrictions on additional terms do not apply
61350+to the owner of the copyright, and my interpretation of this shall
61351+govern for this license.
61352+
61353+[END LICENSING]
61354+
61355+Reiser4 is a file system based on dancing tree algorithms, and is
61356+described at http://www.namesys.com
61357+
61358+mkfs.reiser4 and other utilities are on our webpage or wherever your
61359+Linux provider put them. You really want to be running the latest
61360+version off the website if you use fsck.
61361+
61362+Yes, if you update your reiser4 kernel module you do have to
61363+recompile your kernel, most of the time. The errors you get will be
61364+quite cryptic if you forget to do so.
61365+
61366+Hideous Commercial Pitch: Spread your development costs across other OS
61367+vendors. Select from the best in the world, not the best in your
61368+building, by buying from third party OS component suppliers. Leverage
61369+the software component development power of the internet. Be the most
61370+aggressive in taking advantage of the commercial possibilities of
61371+decentralized internet development, and add value through your branded
61372+integration that you sell as an operating system. Let your competitors
61373+be the ones to compete against the entire internet by themselves. Be
61374+hip, get with the new economic trend, before your competitors do. Send
61375+email to reiser@namesys.com
61376+
61377+Hans Reiser was the primary architect of Reiser4, but a whole team
61378+chipped their ideas in. He invested everything he had into Namesys
61379+for 5.5 dark years of no money before Reiser3 finally started to work well
61380+enough to bring in money. He owns the copyright.
61381+
61382+DARPA was the primary sponsor of Reiser4. DARPA does not endorse
61383+Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal
61384+opinion, unique in its willingness to invest into things more
61385+theoretical than the VC community can readily understand, and more
61386+longterm than allows them to be sure that they will be the ones to
61387+extract the economic benefits from. DARPA also integrated us into a
61388+security community that transformed our security worldview.
61389+
61390+Vladimir Saveliev is our lead programmer, with us from the beginning,
61391+and he worked long hours writing the cleanest code. This is why he is
61392+now the lead programmer after years of commitment to our work. He
61393+always made the effort to be the best he could be, and to make his
61394+code the best that it could be. What resulted was quite remarkable. I
61395+don't think that money can ever motivate someone to work the way he
61396+did, he is one of the most selfless men I know.
61397+
61398+Alexander Lyamin was our sysadmin, and helped to educate us in
61399+security issues. Moscow State University and IMT were very generous
61400+in the internet access they provided us, and in lots of other little
61401+ways that a generous institution can be.
61402+
61403+Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
61404+locking code, the block allocator, and finished the flushing code.
61405+His code is always crystal clean and well structured.
61406+
61407+Nikita Danilov wrote the core of the balancing code, the core of the
61408+plugins code, and the directory code. He worked a steady pace of long
61409+hours that produced a whole lot of well abstracted code. He is our
61410+senior computer scientist.
61411+
61412+Vladimir Demidov wrote the parser. Writing an in kernel parser is
61413+something very few persons have the skills for, and it is thanks to
61414+him that we can say that the parser is really not so big compared to
61415+various bits of our other code, and making a parser work in the kernel
61416+was not so complicated as everyone would imagine mainly because it was
61417+him doing it...
61418+
61419+Joshua McDonald wrote the transaction manager, and the flush code.
61420+The flush code unexpectedly turned out to be extremely hairy for reasons
61421+you can read about on our web page, and he did a great job on an
61422+extremely difficult task.
61423+
61424+Nina Reiser handled our accounting, government relations, and much
61425+more.
61426+
61427+Ramon Reiser developed our website.
61428+
61429+Beverly Palmer drew our graphics.
61430+
61431+Vitaly Fertman developed librepair, userspace plugins repair code, fsck
61432+and worked with Umka on developing libreiser4 and userspace plugins.
61433+
61434+Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
61435+userspace tools (reiser4progs).
61436+
61437+Oleg Drokin (aka Green) is the release manager who fixes everything.
61438+It is so nice to have someone like that on the team. He (plus Chris
61439+and Jeff) make it possible for the entire rest of the Namesys team to
61440+focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
61441+is just amazing to watch his talent for spotting bugs in action.
61442+
61443+Edward Shishkin wrote the cryptcompress file plugin (which manages files
61444+built of encrypted and/or compressed bodies) and other plugins related
61445+to transparent encryption and compression support.
61446diff -urN linux-2.6.23.orig/fs/reiser4/reiser4.h linux-2.6.23/fs/reiser4/reiser4.h
61447--- linux-2.6.23.orig/fs/reiser4/reiser4.h 1970-01-01 03:00:00.000000000 +0300
61448+++ linux-2.6.23/fs/reiser4/reiser4.h 2007-12-04 16:49:30.000000000 +0300
61449@@ -0,0 +1,269 @@
61450+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61451+ * reiser4/README */
61452+
61453+/* definitions of common constants used by reiser4 */
61454+
61455+#if !defined( __REISER4_H__ )
61456+#define __REISER4_H__
61457+
61458+#include <asm/param.h> /* for HZ */
61459+#include <linux/errno.h>
61460+#include <linux/types.h>
61461+#include <linux/fs.h>
61462+#include <linux/hardirq.h>
61463+#include <linux/sched.h>
61464+
61465+/*
61466+ * reiser4 compilation options.
61467+ */
61468+
61469+#if defined(CONFIG_REISER4_DEBUG)
61470+/* turn on assertion checks */
61471+#define REISER4_DEBUG (1)
61472+#else
61473+#define REISER4_DEBUG (0)
61474+#endif
61475+
61476+#if defined(CONFIG_ZLIB_INFLATE)
61477+/* turn on zlib */
61478+#define REISER4_ZLIB (1)
61479+#else
61480+#define REISER4_ZLIB (0)
61481+#endif
61482+
61483+#if defined(CONFIG_CRYPTO_SHA256)
61484+#define REISER4_SHA256 (1)
61485+#else
61486+#define REISER4_SHA256 (0)
61487+#endif
61488+
61489+/*
61490+ * Turn on large keys mode. In this mode (which is the default), a reiser4
61491+ * key has 4 8-byte components. In the old "small key" mode, it has 3 8-byte
61492+ * components. The additional component, referred to as "ordering", is used
61493+ * to order the items of which a given object is composed. As such, ordering
61494+ * is placed between locality and objectid. For a directory item, ordering
61495+ * contains an initial prefix of the file name this item is for. This sorts
61496+ * all directory items within a given directory lexicographically (but see
61497+ * fibration.[ch]). For file bodies and stat-data, ordering contains an
61498+ * initial prefix of the name the file was initially created with. In the
61499+ * common case (files with a single name) this orders file bodies and
61500+ * stat-data in the same order as their respective directory entries, thus
61501+ * speeding up readdir.
61502+ *
61503+ * Note, that kernel can only mount file system with the same key size as one
61504+ * it is compiled for, so flipping this option may render your data
61505+ * inaccessible.
61506+ */
61507+#define REISER4_LARGE_KEY (1)
61508+/*#define REISER4_LARGE_KEY (0)*/
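+
+/* The two layouts described above, as sequences of 8-byte components
+   (illustration; "offset" is the final component used for file bodies):
+
+	large key:  ( locality | ordering | objectid | offset )
+	small key:  ( locality | objectid | offset )
+*/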
61509+
61510+/*#define GUESS_EXISTS 1*/
61511+
61512+/*
61513+ * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
61514+ * option
61515+ */
61516+
61517+extern const char *REISER4_SUPER_MAGIC_STRING;
61518+extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
61519+ * beginning of device */
61520+
61521+/* here go tunable parameters that are not worth a special entry in the
61522+   kernel configuration */
61523+
61524+/* default number of slots in coord-by-key caches */
61525+#define CBK_CACHE_SLOTS (16)
61526+/* how many elementary tree operations to carry on the next level */
61527+#define CARRIES_POOL_SIZE (5)
61528+/* size of pool of preallocated nodes for carry process. */
61529+#define NODES_LOCKED_POOL_SIZE (5)
61530+
61531+#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
61532+#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
61533+#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
61534+#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
61535+
61536+/* we support reservation of disk space on a per-uid basis */
61537+#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
61538+/* we support reservation of disk space for groups */
61539+#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
61540+/* we support reservation of disk space for root */
61541+#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
61542+/* we use rapid flush mode, see flush.c for comments. */
61543+#define REISER4_USE_RAPID_FLUSH (1)
61544+
61545+/*
61546+ * set this to 0 if you don't want to use wait-for-flush in ->writepage().
61547+ */
61548+#define REISER4_USE_ENTD (1)
61549+
61550+/* key allocation is Plan-A */
61551+#define REISER4_PLANA_KEY_ALLOCATION (1)
61552+/* key allocation follows good old 3.x scheme */
61553+#define REISER4_3_5_KEY_ALLOCATION (0)
61554+
61555+/* size of hash-table for znodes */
61556+#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
61557+
61558+/* number of buckets in lnode hash-table */
61559+#define LNODE_HTABLE_BUCKETS (1024)
61560+
61561+/* a ridiculously high maximal limit on the height of the znode tree. This
61562+   is used in the declaration of various per-level arrays and to allocate
61563+   the statistics gathering array for per-level stats. */
61564+#define REISER4_MAX_ZTREE_HEIGHT (8)
61565+
61566+#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
61567+
61568+/* If an array contains fewer than REISER4_SEQ_SEARCH_BREAK elements,
61569+   sequential search is on average faster than binary search, because of
61570+   better compiler optimization and because sequential search is more
61571+   CPU-cache friendly. The original number (25) was found by experiments
61572+   on a dual AMD Athlon(tm), 1400MHz.
61573+
61574+   NOTE: testing in the kernel has shown that binary search is more
61575+   effective than implied by the user-level benchmarking, probably
61576+   because in a node the keys are separated by other data. So the value
61577+   was adjusted after a few tests. More thorough tuning is needed.
61578+*/
61579+#define REISER4_SEQ_SEARCH_BREAK (3)
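To illustrate how such a break-even constant is typically applied, here is a hypothetical helper (the real intra-node search lives in the node plugins under plugin/node/):

/* Hypothetical: return the first index i with keys[i] >= goal,
 * switching between linear and binary search at the break-even point. */
static int find_pos(const int *keys, int nr, int goal)
{
	if (nr < REISER4_SEQ_SEARCH_BREAK) {
		int i;

		for (i = 0; i < nr; ++i)	/* CPU-cache friendly scan */
			if (keys[i] >= goal)
				break;
		return i;
	} else {
		int lo = 0, hi = nr;

		while (lo < hi) {		/* classic binary search */
			int mid = lo + (hi - lo) / 2;

			if (keys[mid] < goal)
				lo = mid + 1;
			else
				hi = mid;
		}
		return lo;
	}
}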
61580+
61581+/* don't allow the tree to be shorter than this */
61582+#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL)
61583+
61584+/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
61585+ * available memory. */
61586+/* Default value of maximal atom size. Can be overwritten by the
61587+   tmgr.atom_max_size mount option. By default infinity. */
61588+#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0))
61589+
61590+/* Default value of maximal atom age (in jiffies). After reaching this age
61591+ atom will be forced to commit, either synchronously or asynchronously. Can
61592+ be overwritten by tmgr.atom_max_age mount option. */
61593+#define REISER4_ATOM_MAX_AGE (600 * HZ)
61594+
61595+/* sleeping period for ktxnmgrd */
61596+#define REISER4_TXNMGR_TIMEOUT (5 * HZ)
61597+
61598+/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */
61599+#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
61600+
61601+/* start complaining after this many restarts in coord_by_key().
61602+
61603+ This either means incredibly heavy contention for this part of the tree,
61604+ or some corruption or a bug.
61605+*/
61606+#define REISER4_CBK_ITERATIONS_LIMIT (100)
61607+
61608+/* return -EIO after this many iterations in coord_by_key().
61609+
61610+ I have witnessed more than 800 iterations (in a 30-thread test) before
61611+ cbk finished. --nikita
61612+*/
61613+#define REISER4_MAX_CBK_ITERATIONS 500000
61614+
61615+/* put a per-inode limit on the maximal number of directory entries with
61616+ identical keys in a hashed directory.
61617+
61618+ Disable this until the inheritance interfaces stabilize: we need some
61619+ way to set a per-directory limit.
61620+*/
61621+#define REISER4_USE_COLLISION_LIMIT (0)
61622+
61623+/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it
61624+ will force them to be relocated. */
61625+#define FLUSH_RELOCATE_THRESHOLD 64
61626+/* If flush can find a block allocation closer than FLUSH_RELOCATE_DISTANCE
61627+   from the preceder, it will relocate to that position. */
61628+#define FLUSH_RELOCATE_DISTANCE 64
61629+
61630+/* If we have written this many blocks or more before encountering a busy
61631+   jnode in the flush list, abort flushing in the hope that next time we
61632+   are called this jnode will already be clean, saving some seeks. */
61633+#define FLUSH_WRITTEN_THRESHOLD 50
61634+
61635+/* The maximum number of nodes to scan left on a level during flush. */
61636+#define FLUSH_SCAN_MAXNODES 10000
61637+
61638+/* per-atom limit of flushers */
61639+#define ATOM_MAX_FLUSHERS (1)
61640+
61641+/* default tracing buffer size */
61642+#define REISER4_TRACE_BUF_SIZE (1 << 15)
61643+
61644+/* what size units of IO we would like cp, etc., to use when writing to
61645+   reiser4, in bytes.
61646+
61647+ Can be overwritten by optimal_io_size mount option.
61648+*/
61649+#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
61650+
61651+/* see comments in inode.c:oid_to_uino() */
61652+#define REISER4_UINO_SHIFT (1 << 30)
61653+
61654+/* Mark function argument as unused to avoid compiler warnings. */
61655+#define UNUSED_ARG __attribute__((unused))
61656+
61657+#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
61658+#define NONNULL __attribute__((nonnull))
61659+#else
61660+#define NONNULL
61661+#endif
61662+
61663+/* master super block offset in bytes.*/
61664+#define REISER4_MASTER_OFFSET 65536
61665+
61666+/* size of VFS block */
61667+#define VFS_BLKSIZE 512
61668+/* number of bits in size of VFS block (512==2^9) */
61669+#define VFS_BLKSIZE_BITS 9
61670+
61671+#define REISER4_I reiser4_inode_data
61672+
61673+/* implication */
61674+#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) )
61675+/* logical equivalence */
61676+#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) )
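These are typically used inside assertions; two hypothetical examples in the assert("label", condition) style used throughout this patch:

/* hypothetical sanity checks built from ergo() and equi() */
assert("doc-1", ergo(coord != NULL, coord->node != NULL));
assert("doc-2", equi(node_is_empty(node), node_num_items(node) == 0));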
61677+
61678+#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
61679+
61680+#define NOT_YET (0)
61681+
61682+/** Reiser4 specific error codes **/
61683+
61684+#define REISER4_ERROR_CODE_BASE 10000
61685+
61686+/* Neighbor is not available (side neighbor or parent) */
61687+#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE)
61688+
61689+/* Node was not found in cache */
61690+#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
61691+
61692+/* node does not have enough free space to complete a balancing operation */
61693+#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2)
61694+
61695+/* repeat operation */
61696+#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3)
61697+
61698+/* a deadlock happened */
61699+#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4)
61700+
61701+/* operation cannot be performed, because it would block and non-blocking mode
61702+ * was requested. */
61703+#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5)
61704+
61705+/* wait for some event (depends on context), then repeat */
61706+#define E_WAIT (REISER4_ERROR_CODE_BASE + 6)
61707+
61708+#endif /* __REISER4_H__ */
61709+
61710+/* Make Linus happy.
61711+ Local variables:
61712+ c-indentation-style: "K&R"
61713+ mode-name: "LC"
61714+ c-basic-offset: 8
61715+ tab-width: 8
61716+ fill-column: 120
61717+ End:
61718+*/
61719diff -urN linux-2.6.23.orig/fs/reiser4/safe_link.c linux-2.6.23/fs/reiser4/safe_link.c
61720--- linux-2.6.23.orig/fs/reiser4/safe_link.c 1970-01-01 03:00:00.000000000 +0300
61721+++ linux-2.6.23/fs/reiser4/safe_link.c 2007-12-04 16:49:30.000000000 +0300
61722@@ -0,0 +1,352 @@
61723+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
61724+ * reiser4/README */
61725+
61726+/* Safe-links. */
61727+
61728+/*
61729+ * Safe-links are used to maintain file system consistency during
61730+ * operations that spawn multiple transactions. For example:
61731+ *
61732+ * 1. Unlink. UNIX supports "open-but-unlinked" files, that is, files
61733+ * without user-visible names in the file system that are still open
61734+ * in some active process. What happens here is that the unlink proper
61735+ * (i.e., removal of the last file name) and the file deletion
61736+ * (truncation of the file body to zero and deletion of the stat-data,
61737+ * which happens when the last file descriptor is closed) may belong
61738+ * to different transactions T1 and T2. If a crash happens after T1
61739+ * commits but before T2 commits, the on-disk file system has a file
61740+ * without a name, that is, a disk space leak.
61741+ *
61742+ * 2. Truncate. Truncate of a large file may spawn multiple
61743+ * transactions. If the system crashes while the truncate is in
61744+ * progress, the file is left partially truncated, which violates the
61745+ * "atomicity guarantees" of reiser4, viz. that every system call is
61746+ * atomic.
61747+ *
61748+ * Safe-links address both cases above. Basically, a safe-link is a
61749+ * way to post some operation to be executed during commit of some
61750+ * transaction other than the current one. (Another way to look at a
61751+ * safe-link is to interpret it as logical logging.)
61752+ *
61753+ * Specifically, at the beginning of unlink a safe-link is inserted
61754+ * into the tree; it is normally removed by the file deletion code
61755+ * (during transaction T2 in the above terms). Truncate also inserts
61756+ * a safe-link that is normally removed when the truncate finishes.
61757+ *
61758+ * Thus, after a "clean umount" there are no safe-links in the tree.
61759+ * If safe-links are observed during mount, it means that (a) the
61760+ * system was terminated abnormally and (b) the safe-links correspond
61761+ * to "pending" (i.e., not finished) operations that were in progress
61762+ * during system termination. Each safe-link records enough
61763+ * information to complete the corresponding operation, and mount
61764+ * simply "replays" them (hence the analogy with logical logging).
61765+ *
61766+ * Safe-links are implemented as blackbox items (see
61767+ * plugin/item/blackbox.[ch]). For reference: ext3 has a similar
61768+ * mechanism, called the "orphan list".
61769+ */
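A condensed sketch of how the unlink case described above would drive this API. This is a hypothetical caller with error handling trimmed; a SAFE_UNLINK link type is assumed by analogy with the SAFE_TRUNCATE type handled below.

/* Sketch only: protect unlink with a safe-link (assumes SAFE_UNLINK). */
static int unlink_with_safe_link(struct inode *inode)
{
	reiser4_tree *tree = reiser4_tree_by_inode(inode);
	int result;

	result = safe_link_grab(tree, BA_CAN_COMMIT);	/* reserve space */
	if (result == 0)
		result = safe_link_add(inode, SAFE_UNLINK);	/* in T1 */
	safe_link_release(tree);
	if (result != 0)
		return result;
	/* ... unlink proper runs here; when the last file descriptor is
	 * closed, the deletion code removes the link (in T2): */
	result = safe_link_grab(tree, BA_CAN_COMMIT);
	if (result == 0)
		result = safe_link_del(tree, get_inode_oid(inode),
				       SAFE_UNLINK);
	safe_link_release(tree);
	return result;
}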
61770+
61771+#include "safe_link.h"
61772+#include "debug.h"
61773+#include "inode.h"
61774+
61775+#include "plugin/item/blackbox.h"
61776+
61777+#include <linux/fs.h>
61778+
61779+/*
61780+ * On-disk format of safe-link.
61781+ */
61782+typedef struct safelink {
61783+ reiser4_key sdkey; /* key of stat-data for the file safe-link is
61784+ * for */
61785+ d64 size; /* size to which file should be truncated */
61786+} safelink_t;
61787+
61788+/*
61789+ * locality where safe-link items are stored: next to the objectid of the
61790+ * root directory.
61791+ */
61792+static oid_t safe_link_locality(reiser4_tree * tree)
61793+{
61794+ return get_key_objectid(get_super_private(tree->super)->df_plug->
61795+ root_dir_key(tree->super)) + 1;
61796+}
61797+
61798+/*
61799+ Construct a key for the safe-link. Key has the following format:
61800+
61801+|       60      | 4 |        64        | 4 |       60      |        64        |
61802++---------------+---+------------------+---+---------------+------------------+
61803+|   locality    | 0 |        0         | 0 |   objectid    |     link type    |
61804++---------------+---+------------------+---+---------------+------------------+
61805+|                   |                  |                   |                  |
61806+|      8 bytes      |     8 bytes      |      8 bytes      |     8 bytes      |
61807+
61808+ This is the large-key format; in the small-key format the second 8-byte
61809+ chunk is absent. Locality is a constant returned by
61810+ safe_link_locality(). objectid is the oid of the file on which the
61811+ operation protected by this safe-link is performed. link-type is used to
61812+ distinguish safe-links for different operations.
61813+
61814+ */
61815+static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
61816+ reiser4_safe_link_t link, reiser4_key * key)
61817+{
61818+ reiser4_key_init(key);
61819+ set_key_locality(key, safe_link_locality(tree));
61820+ set_key_objectid(key, oid);
61821+ set_key_offset(key, link);
61822+ return key;
61823+}
61824+
61825+/*
61826+ * how much disk space is necessary to insert and remove (in the
61827+ * error-handling path) safe-link.
61828+ */
61829+static __u64 safe_link_tograb(reiser4_tree * tree)
61830+{
61831+ return
61832+ /* insert safe link */
61833+ estimate_one_insert_item(tree) +
61834+ /* remove safe link */
61835+ estimate_one_item_removal(tree) +
61836+ /* drill to the leaf level during insertion */
61837+ 1 + estimate_one_insert_item(tree) +
61838+ /*
61839+ * possible update of existing safe-link. Actually, if
61840+ * safe-link existed already (we failed to remove it), then no
61841+ * insertion is necessary, so this term is already "covered",
61842+ * but for simplicity let's leave it.
61843+ */
61844+ 1;
61845+}
61846+
61847+/*
61848+ * grab enough disk space to insert and remove (in the error-handling path)
61849+ * safe-link.
61850+ */
61851+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
61852+{
61853+ int result;
61854+
61855+ grab_space_enable();
61856+ /* The sbinfo->delete_mutex can be taken here.
61857+ * safe_link_release() should be called before leaving reiser4
61858+ * context. */
61859+ result =
61860+ reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
61861+ grab_space_enable();
61862+ return result;
61863+}
61864+
61865+/*
61866+ * release unused disk space reserved by safe_link_grab().
61867+ */
61868+void safe_link_release(reiser4_tree * tree)
61869+{
61870+ reiser4_release_reserved(tree->super);
61871+}
61872+
61873+/*
61874+ * insert into the tree a safe-link for operation @link on inode @inode.
61875+ */
61876+int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
61877+{
61878+ reiser4_key key;
61879+ safelink_t sl;
61880+ int length;
61881+ int result;
61882+ reiser4_tree *tree;
61883+
61884+ build_sd_key(inode, &sl.sdkey);
61885+ length = sizeof sl.sdkey;
61886+
61887+ if (link == SAFE_TRUNCATE) {
61888+ /*
61889+ * for truncate we also have to store the final file
61890+ * length; expand the item.
61891+ */
61892+ length += sizeof(sl.size);
61893+ put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
61894+ }
61895+ tree = reiser4_tree_by_inode(inode);
61896+ build_link_key(tree, get_inode_oid(inode), link, &key);
61897+
61898+ result = store_black_box(tree, &key, &sl, length);
61899+ if (result == -EEXIST)
61900+ result = update_black_box(tree, &key, &sl, length);
61901+ return result;
61902+}
61903+
61904+/*
61905+ * remove the safe-link corresponding to operation @link on the object
61906+ * with oid @oid from the tree.
61907+ */
61908+int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
61909+{
61910+ reiser4_key key;
61911+
61912+ return kill_black_box(tree, build_link_key(tree, oid, link, &key));
61913+}
61914+
61915+/*
61916+ * in-memory structure to keep information extracted from safe-link. This is
61917+ * used to iterate over all safe-links.
61918+ */
61919+struct safe_link_context {
61920+ reiser4_tree *tree; /* internal tree */
61921+ reiser4_key key; /* safe-link key */
61922+ reiser4_key sdkey; /* key of object stat-data */
61923+ reiser4_safe_link_t link; /* safe-link type */
61924+ oid_t oid; /* object oid */
61925+ __u64 size; /* final size for truncate */
61926+};
61927+
61928+/*
61929+ * start iterating over all safe-links.
61930+ */
61931+static void safe_link_iter_begin(reiser4_tree * tree,
61932+ struct safe_link_context * ctx)
61933+{
61934+ ctx->tree = tree;
61935+ reiser4_key_init(&ctx->key);
61936+ set_key_locality(&ctx->key, safe_link_locality(tree));
61937+ set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));
61938+ set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));
61939+}
61940+
61941+/*
61942+ * return next safe-link.
61943+ */
61944+static int safe_link_iter_next(struct safe_link_context * ctx)
61945+{
61946+ int result;
61947+ safelink_t sl;
61948+
61949+ result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
61950+ if (result == 0) {
61951+ ctx->oid = get_key_objectid(&ctx->key);
61952+ ctx->link = get_key_offset(&ctx->key);
61953+ ctx->sdkey = sl.sdkey;
61954+ if (ctx->link == SAFE_TRUNCATE)
61955+ ctx->size = le64_to_cpu(get_unaligned(&sl.size));
61956+ }
61957+ return result;
61958+}
61959+
61960+/*
61961+ * check whether there are any more safe-links left in the tree.
61962+ */
61963+static int safe_link_iter_finished(struct safe_link_context * ctx)
61964+{
61965+ return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
61966+}
61967+
61968+/*
61969+ * finish safe-link iteration.
61970+ */
61971+static void safe_link_iter_end(struct safe_link_context * ctx)
61972+{
61973+ /* nothing special */
61974+}
61975+
61976+/*
61977+ * process single safe-link.
61978+ */
61979+static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
61980+ reiser4_key * sdkey, oid_t oid, __u64 size)
61981+{
61982+ struct inode *inode;
61983+ int result;
61984+
61985+ /*
61986+ * obtain object inode by reiser4_iget(), then call object plugin
61987+ * ->safelink() method to do actual work, then delete safe-link on
61988+ * success.
61989+ */
61990+ inode = reiser4_iget(super, sdkey, 1);
61991+ if (!IS_ERR(inode)) {
61992+ file_plugin *fplug;
61993+
61994+ fplug = inode_file_plugin(inode);
61995+ assert("nikita-3428", fplug != NULL);
61996+ assert("", oid == get_inode_oid(inode));
61997+ if (fplug->safelink != NULL) {
61998+ /* reiser4_txn_restart_current is not necessary because
61999+ * mounting is single-threaded. However, without it the
62000+ * deadlock detection code will complain (see
62001+ * nikita-3361). */
62002+ reiser4_txn_restart_current();
62003+ result = fplug->safelink(inode, link, size);
62004+ } else {
62005+ warning("nikita-3430",
62006+ "Cannot handle safelink for %lli",
62007+ (unsigned long long)oid);
62008+ reiser4_print_key("key", sdkey);
62009+ result = 0;
62010+ }
62011+ if (result != 0) {
62012+ warning("nikita-3431",
62013+ "Error processing safelink for %lli: %i",
62014+ (unsigned long long)oid, result);
62015+ }
62016+ reiser4_iget_complete(inode);
62017+ iput(inode);
62018+ if (result == 0) {
62019+ result = safe_link_grab(reiser4_get_tree(super), BA_CAN_COMMIT);
62020+ if (result == 0)
62021+ result =
62022+ safe_link_del(reiser4_get_tree(super), oid, link);
62023+ safe_link_release(reiser4_get_tree(super));
62024+ /*
62025+ * restart transaction: if there was a large number of
62026+ * safe-links, their processing may fail to fit into a
62027+ * single transaction.
62028+ */
62029+ if (result == 0)
62030+ reiser4_txn_restart_current();
62031+ }
62032+ } else
62033+ result = PTR_ERR(inode);
62034+ return result;
62035+}
62036+
62037+/*
62038+ * iterate over all safe-links in the file system, processing them one by one.
62039+ */
62040+int process_safelinks(struct super_block *super)
62041+{
62042+ struct safe_link_context ctx;
62043+ int result;
62044+
62045+ if (rofs_super(super))
62046+ /* do nothing on the read-only file system */
62047+ return 0;
62048+ safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
62049+ result = 0;
62050+ do {
62051+ result = safe_link_iter_next(&ctx);
62052+ if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
62053+ result = 0;
62054+ break;
62055+ }
62056+ if (result == 0)
62057+ result = process_safelink(super, ctx.link,
62058+ &ctx.sdkey, ctx.oid,
62059+ ctx.size);
62060+ } while (result == 0);
62061+ safe_link_iter_end(&ctx);
62062+ return result;
62063+}
62064+
62065+/* Make Linus happy.
62066+ Local variables:
62067+ c-indentation-style: "K&R"
62068+ mode-name: "LC"
62069+ c-basic-offset: 8
62070+ tab-width: 8
62071+ fill-column: 120
62072+ scroll-step: 1
62073+ End:
62074+*/
62075diff -urN linux-2.6.23.orig/fs/reiser4/safe_link.h linux-2.6.23/fs/reiser4/safe_link.h
62076--- linux-2.6.23.orig/fs/reiser4/safe_link.h 1970-01-01 03:00:00.000000000 +0300
62077+++ linux-2.6.23/fs/reiser4/safe_link.h 2007-12-04 16:49:30.000000000 +0300
62078@@ -0,0 +1,29 @@
62079+/* Copyright 2003 by Hans Reiser, licensing governed by
62080+ * reiser4/README */
62081+
62082+/* Safe-links. See safe_link.c for details. */
62083+
62084+#if !defined( __FS_SAFE_LINK_H__ )
62085+#define __FS_SAFE_LINK_H__
62086+
62087+#include "tree.h"
62088+
62089+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
62090+void safe_link_release(reiser4_tree * tree);
62091+int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
62092+int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
62093+
62094+int process_safelinks(struct super_block *super);
62095+
62096+/* __FS_SAFE_LINK_H__ */
62097+#endif
62098+
62099+/* Make Linus happy.
62100+ Local variables:
62101+ c-indentation-style: "K&R"
62102+ mode-name: "LC"
62103+ c-basic-offset: 8
62104+ tab-width: 8
62105+ fill-column: 120
62106+ End:
62107+*/
62108diff -urN linux-2.6.23.orig/fs/reiser4/seal.c linux-2.6.23/fs/reiser4/seal.c
62109--- linux-2.6.23.orig/fs/reiser4/seal.c 1970-01-01 03:00:00.000000000 +0300
62110+++ linux-2.6.23/fs/reiser4/seal.c 2007-12-04 16:49:30.000000000 +0300
62111@@ -0,0 +1,218 @@
62112+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62113+/* Seals implementation. */
62114+/* Seals are "weak" tree pointers. They are analogous to tree coords in
62115+   that they allow bypassing tree traversal, but normal usage of coords
62116+   implies that the node pointed to by the coord is locked, whereas
62117+   seals keep no lock (or even reference) to the znode. Instead, each
62118+   znode contains a version number, increased on each modification and
62119+   copied into a seal when the seal is created. Later, one can
62120+   "validate" the seal by calling reiser4_seal_validate(). If the znode
62121+   is in cache and its version number is still the same, the seal is
62122+   "pristine" and the coord associated with it can be re-used at once.
62123+
62124+   If, on the other hand, the znode is out of cache, or is obviously a
62125+   different one from the znode the seal was initially attached to (for
62126+   example, it is on a different level, or is being removed from the
62127+   tree), the seal is irreparably invalid ("burned") and tree traversal
62128+   has to be repeated.
62129+
62130+   Otherwise, there is some hope that while the znode was modified (and
62131+   the seal was "broken" as a result), the key attached to the seal is
62132+   still in the node. This is checked by comparing this key with the
62133+   node's delimiting keys and, if it is ok, doing an intra-node lookup.
62134+
62135+   Znode versions are maintained as follows: there is a
62136+   reiser4_tree.znode_epoch counter. Whenever a new znode is created,
62137+   znode_epoch is incremented and its new value stored in the ->version
62138+   field of the new znode. Whenever a znode is dirtied (which means it
62139+   was probably modified), znode_epoch is again incremented and its new
62140+   value stored in znode->version. Just bumping znode->version on each
62141+   update would not be enough: a znode could be deleted, and a new
62142+   znode allocated for the same disk block would get the same version
62143+   counter, tricking the seal code into a false positive.
62144+*/
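A condensed sketch of the intended calling pattern; a hypothetical caller (real users are the tree and plugin code elsewhere in this patch):

/* Sketch only: remember a lookup result in a seal, revalidate it later. */
static int lookup_with_seal(reiser4_tree *tree, const reiser4_key *key,
			    coord_t *coord, lock_handle *lh, seal_t *seal)
{
	int result;

	/* cheap path: revalidate a previously created seal */
	if (reiser4_seal_is_set(seal)) {
		result = reiser4_seal_validate(seal, coord, key, lh,
					       ZNODE_READ_LOCK,
					       ZNODE_LOCK_LOPRI);
		if (result == 0)
			return 0;	/* seal pristine: coord re-usable */
	}
	/* seal burned or broken: fall back to a full tree traversal */
	result = coord_by_key(tree, key, coord, lh, ZNODE_READ_LOCK,
			      FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
			      CBK_UNIQUE, NULL);
	if (result == CBK_COORD_FOUND)
		reiser4_seal_init(seal, coord, key);	/* re-arm the seal */
	return result;
}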
62145+
62146+#include "forward.h"
62147+#include "debug.h"
62148+#include "key.h"
62149+#include "coord.h"
62150+#include "seal.h"
62151+#include "plugin/item/item.h"
62152+#include "plugin/node/node.h"
62153+#include "jnode.h"
62154+#include "znode.h"
62155+#include "super.h"
62156+
62157+static znode *seal_node(const seal_t * seal);
62158+static int seal_matches(const seal_t * seal, znode * node);
62159+
62160+/* initialise seal. This can be called several times on the same seal. @coord
62161+ and @key can be NULL. */
62162+void reiser4_seal_init(seal_t * seal /* seal to initialise */ ,
62163+ const coord_t * coord /* coord @seal will be
62164+ * attached to */ ,
62165+ const reiser4_key * key UNUSED_ARG /* key @seal will be
62166+ * attached to */ )
62167+{
62168+ assert("nikita-1886", seal != NULL);
62169+ memset(seal, 0, sizeof *seal);
62170+ if (coord != NULL) {
62171+ znode *node;
62172+
62173+ node = coord->node;
62174+ assert("nikita-1987", node != NULL);
62175+ spin_lock_znode(node);
62176+ seal->version = node->version;
62177+ assert("nikita-1988", seal->version != 0);
62178+ seal->block = *znode_get_block(node);
62179+#if REISER4_DEBUG
62180+ seal->coord1 = *coord;
62181+ if (key != NULL)
62182+ seal->key = *key;
62183+#endif
62184+ spin_unlock_znode(node);
62185+ }
62186+}
62187+
62188+/* finish with seal */
62189+void reiser4_seal_done(seal_t * seal /* seal to clear */ )
62190+{
62191+ assert("nikita-1887", seal != NULL);
62192+ seal->version = 0;
62193+}
62194+
62195+/* true if seal was initialised */
62196+int reiser4_seal_is_set(const seal_t * seal /* seal to query */ )
62197+{
62198+ assert("nikita-1890", seal != NULL);
62199+ return seal->version != 0;
62200+}
62201+
62202+#if REISER4_DEBUG
62203+/* helper function for reiser4_seal_validate(). It checks that the item at
62204+ * @coord has the expected key. This is to detect cases where a node was
62205+ * modified but wasn't marked dirty. */
62206+static inline int check_seal_match(const coord_t * coord /* coord to check */ ,
62207+ const reiser4_key * k /* expected key */ )
62208+{
62209+ reiser4_key ukey;
62210+
62211+ return (coord->between != AT_UNIT) ||
62212+ /* FIXME-VS: we only can compare keys for items whose units
62213+ represent exactly one key */
62214+ ((coord_is_existing_unit(coord))
62215+ && (item_is_extent(coord)
62216+ || keyeq(k, unit_key_by_coord(coord, &ukey))))
62217+ || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
62218+ && keyge(k, unit_key_by_coord(coord, &ukey)));
62219+}
62220+#endif
62221+
62222+/* this is used by reiser4_seal_validate. It accepts the return value of
62223+ * longterm_lock_znode and returns 1 if it can be interpreted as a seal
62224+ * validation failure. For instance, when longterm_lock_znode returns
62225+ * -EINVAL, reiser4_seal_validate returns -E_REPEAT and the caller will
62226+ * redo the tree search. We cannot do this in longterm_lock_znode(),
62227+ * because sometimes we want to distinguish between -EINVAL and -E_REPEAT. */
62228+static int should_repeat(int return_code)
62229+{
62230+ return return_code == -EINVAL;
62231+}
62232+
62233+/* (re-)validate seal.
62234+
62235+ Checks whether the seal is pristine, and tries to revalidate it if possible.
62236+
62237+ If the seal was burned or broken irreparably, return -E_REPEAT.
62238+
62239+ NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if the key
62240+ we are looking for is in the range of keys covered by the sealed node, but
62241+ the item wasn't found by the node's ->lookup() method. An alternative is to
62242+ return -ENOENT in this case, but this would complicate the callers' logic.
62243+
62244+*/
62245+int reiser4_seal_validate(seal_t * seal /* seal to validate */,
62246+ coord_t * coord /* coord to validate against */,
62247+ const reiser4_key * key /* key to validate against */,
62248+ lock_handle * lh /* resulting lock handle */,
62249+ znode_lock_mode mode /* lock node */,
62250+ znode_lock_request request /* locking priority */)
62251+{
62252+ znode *node;
62253+ int result;
62254+
62255+ assert("nikita-1889", seal != NULL);
62256+ assert("nikita-1881", reiser4_seal_is_set(seal));
62257+ assert("nikita-1882", key != NULL);
62258+ assert("nikita-1883", coord != NULL);
62259+ assert("nikita-1884", lh != NULL);
62260+ assert("nikita-1885", keyeq(&seal->key, key));
62261+ assert("nikita-1989", coords_equal(&seal->coord1, coord));
62262+
62263+ /* obtain znode by block number */
62264+ node = seal_node(seal);
62265+ if (node != NULL) {
62266+ /* znode was in cache, lock it */
62267+ result = longterm_lock_znode(lh, node, mode, request);
62268+ zput(node);
62269+ if (result == 0) {
62270+ if (seal_matches(seal, node)) {
62271+ /* if seal version and znode version
62272+ coincide */
62273+ ON_DEBUG(coord_update_v(coord));
62274+ assert("nikita-1990",
62275+ node == seal->coord1.node);
62276+ assert("nikita-1898",
62277+ WITH_DATA_RET(coord->node, 1,
62278+ check_seal_match(coord,
62279+ key)));
62280+ } else
62281+ result = RETERR(-E_REPEAT);
62282+ }
62283+ if (result != 0) {
62284+ if (should_repeat(result))
62285+ result = RETERR(-E_REPEAT);
62286+ /* unlock node on failure */
62287+ done_lh(lh);
62288+ }
62289+ } else {
62290+ /* znode wasn't in cache */
62291+ result = RETERR(-E_REPEAT);
62292+ }
62293+ return result;
62294+}
62295+
62296+/* helper functions */
62297+
62298+/* obtain reference to znode seal points to, if in cache */
62299+static znode *seal_node(const seal_t * seal /* seal to query */ )
62300+{
62301+ assert("nikita-1891", seal != NULL);
62302+ return zlook(current_tree, &seal->block);
62303+}
62304+
62305+/* true if @seal version and @node version coincide */
62306+static int seal_matches(const seal_t * seal /* seal to check */ ,
62307+ znode * node /* node to check */ )
62308+{
62309+ int result;
62310+
62311+ assert("nikita-1991", seal != NULL);
62312+ assert("nikita-1993", node != NULL);
62313+
62314+ spin_lock_znode(node);
62315+ result = (seal->version == node->version);
62316+ spin_unlock_znode(node);
62317+ return result;
62318+}
62319+
62320+/* Make Linus happy.
62321+ Local variables:
62322+ c-indentation-style: "K&R"
62323+ mode-name: "LC"
62324+ c-basic-offset: 8
62325+ tab-width: 8
62326+ fill-column: 120
62327+ scroll-step: 1
62328+ End:
62329+*/
62330diff -urN linux-2.6.23.orig/fs/reiser4/seal.h linux-2.6.23/fs/reiser4/seal.h
62331--- linux-2.6.23.orig/fs/reiser4/seal.h 1970-01-01 03:00:00.000000000 +0300
62332+++ linux-2.6.23/fs/reiser4/seal.h 2007-12-04 16:49:30.000000000 +0300
62333@@ -0,0 +1,49 @@
62334+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62335+
62336+/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
62337+
62338+#ifndef __SEAL_H__
62339+#define __SEAL_H__
62340+
62341+#include "forward.h"
62342+#include "debug.h"
62343+#include "dformat.h"
62344+#include "key.h"
62345+#include "coord.h"
62346+
62347+/* for __u?? types */
62348+/*#include <linux/types.h>*/
62349+
62350+/* seal. See comment at the top of seal.c */
62351+typedef struct seal_s {
62352+	/* version of znode recorded at the time of seal creation */
62353+ __u64 version;
62354+ /* block number of znode attached to this seal */
62355+ reiser4_block_nr block;
62356+#if REISER4_DEBUG
62357+ /* coord this seal is attached to. For debugging. */
62358+ coord_t coord1;
62359+ /* key this seal is attached to. For debugging. */
62360+ reiser4_key key;
62361+#endif
62362+} seal_t;
62363+
62364+extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *);
62365+extern void reiser4_seal_done(seal_t *);
62366+extern int reiser4_seal_is_set(const seal_t *);
62367+extern int reiser4_seal_validate(seal_t *, coord_t *,
62368+ const reiser4_key *, lock_handle *,
62369+ znode_lock_mode mode, znode_lock_request request);
62370+
62371+/* __SEAL_H__ */
62372+#endif
62373+
62374+/* Make Linus happy.
62375+ Local variables:
62376+ c-indentation-style: "K&R"
62377+ mode-name: "LC"
62378+ c-basic-offset: 8
62379+ tab-width: 8
62380+ fill-column: 120
62381+ End:
62382+*/
62383diff -urN linux-2.6.23.orig/fs/reiser4/search.c linux-2.6.23/fs/reiser4/search.c
62384--- linux-2.6.23.orig/fs/reiser4/search.c 1970-01-01 03:00:00.000000000 +0300
62385+++ linux-2.6.23/fs/reiser4/search.c 2007-12-04 16:49:30.000000000 +0300
62386@@ -0,0 +1,1611 @@
62387+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
62388+ * reiser4/README */
62389+
62390+#include "forward.h"
62391+#include "debug.h"
62392+#include "dformat.h"
62393+#include "key.h"
62394+#include "coord.h"
62395+#include "seal.h"
62396+#include "plugin/item/item.h"
62397+#include "plugin/node/node.h"
62398+#include "plugin/plugin.h"
62399+#include "jnode.h"
62400+#include "znode.h"
62401+#include "block_alloc.h"
62402+#include "tree_walk.h"
62403+#include "tree.h"
62404+#include "reiser4.h"
62405+#include "super.h"
62406+#include "inode.h"
62407+
62408+#include <linux/slab.h>
62409+
62410+static const char *bias_name(lookup_bias bias);
62411+
62412+/* tree searching algorithm; intra-node searching algorithms are in
62413+   plugin/node/ */
62414+
62415+/* tree lookup cache
62416+ *
62417+ * The coord-by-key cache consists of a small list of recently accessed
62418+ * nodes maintained according to the LRU discipline. Before doing a real
62419+ * top-down tree traversal this cache is scanned for nodes that can
62420+ * contain the key requested.
62421+ *
62422+ * The efficiency of coord cache depends heavily on locality of reference for
62423+ * tree accesses. Our user level simulations show reasonably good hit ratios
62424+ * for coord cache under most loads so far.
62425+ */
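A hypothetical sketch of the fast path this cache enables (the real scan is cbk_cache_search(), declared below; this sketch ignores the dk-locking needed to read delimiting keys and borrows znode_contains_key_strict() from later in this file):

/* Sketch only: probe the LRU list for a cached node that could
 * contain @key before falling back to a full top-down traversal. */
static znode *cbk_cache_probe(cbk_cache *cache, const reiser4_key *key)
{
	cbk_cache_slot *slot;
	znode *result = NULL;

	read_lock(&cache->guard);
	list_for_each_entry(slot, &cache->lru, lru) {
		znode *node = slot->node;

		if (node != NULL &&
		    znode_contains_key_strict(node, key, 1)) {
			result = node;	/* candidate; caller must lock it */
			break;
		}
	}
	read_unlock(&cache->guard);
	return result;
}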
62426+
62427+/* Initialise coord cache slot */
62428+static void cbk_cache_init_slot(cbk_cache_slot *slot)
62429+{
62430+ assert("nikita-345", slot != NULL);
62431+
62432+ INIT_LIST_HEAD(&slot->lru);
62433+ slot->node = NULL;
62434+}
62435+
62436+/* Initialize coord cache */
62437+int cbk_cache_init(cbk_cache *cache /* cache to init */ )
62438+{
62439+ int i;
62440+
62441+ assert("nikita-346", cache != NULL);
62442+
62443+ cache->slot =
62444+ kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots,
62445+ reiser4_ctx_gfp_mask_get());
62446+ if (cache->slot == NULL)
62447+ return RETERR(-ENOMEM);
62448+
62449+ INIT_LIST_HEAD(&cache->lru);
62450+ for (i = 0; i < cache->nr_slots; ++i) {
62451+ cbk_cache_init_slot(cache->slot + i);
62452+ list_add_tail(&((cache->slot + i)->lru), &cache->lru);
62453+ }
62454+ rwlock_init(&cache->guard);
62455+ return 0;
62456+}
62457+
62458+/* free cbk cache data */
62459+void cbk_cache_done(cbk_cache * cache /* cache to release */ )
62460+{
62461+ assert("nikita-2493", cache != NULL);
62462+ if (cache->slot != NULL) {
62463+ kfree(cache->slot);
62464+ cache->slot = NULL;
62465+ }
62466+}
62467+
62468+/* macro to iterate over all cbk cache slots */
62469+#define for_all_slots(cache, slot) \
62470+ for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
62471+ &(cache)->lru != &(slot)->lru; \
62472+ (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
62473+
62474+#if REISER4_DEBUG
62475+/* this function assures that [cbk-cache-invariant] invariant holds */
62476+static int cbk_cache_invariant(const cbk_cache *cache)
62477+{
62478+ cbk_cache_slot *slot;
62479+ int result;
62480+ int unused;
62481+
62482+ if (cache->nr_slots == 0)
62483+ return 1;
62484+
62485+ assert("nikita-2469", cache != NULL);
62486+ unused = 0;
62487+ result = 1;
62488+ read_lock(&((cbk_cache *)cache)->guard);
62489+ for_all_slots(cache, slot) {
62490+ /* in LRU first go all `used' slots followed by `unused' */
62491+ if (unused && (slot->node != NULL))
62492+ result = 0;
62493+ if (slot->node == NULL)
62494+ unused = 1;
62495+ else {
62496+ cbk_cache_slot *scan;
62497+
62498+ /* all cached nodes are different */
62499+ scan = slot;
62500+ while (result) {
62501+ scan = list_entry(scan->lru.next, cbk_cache_slot, lru);
62502+ if (&cache->lru == &scan->lru)
62503+ break;
62504+ if (slot->node == scan->node)
62505+ result = 0;
62506+ }
62507+ }
62508+ if (!result)
62509+ break;
62510+ }
62511+ read_unlock(&((cbk_cache *)cache)->guard);
62512+ return result;
62513+}
62514+
62515+#endif
62516+
62517+/* Remove references, if any, to @node from coord cache */
62518+void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
62519+ reiser4_tree * tree /* tree to remove node from */ )
62520+{
62521+ cbk_cache_slot *slot;
62522+ cbk_cache *cache;
62523+ int i;
62524+
62525+ assert("nikita-350", node != NULL);
62526+ assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
62527+
62528+ cache = &tree->cbk_cache;
62529+ assert("nikita-2470", cbk_cache_invariant(cache));
62530+
62531+ write_lock(&(cache->guard));
62532+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
62533+ if (slot->node == node) {
62534+ list_move_tail(&slot->lru, &cache->lru);
62535+ slot->node = NULL;
62536+ break;
62537+ }
62538+ }
62539+ write_unlock(&(cache->guard));
62540+ assert("nikita-2471", cbk_cache_invariant(cache));
62541+}
62542+
62543+/* add information about "node" to the cbk-cache of "tree". This
62544+   can actually be an update of an existing slot in the cache. */
62545+static void cbk_cache_add(const znode *node /* node to add to the cache */ )
62546+{
62547+ cbk_cache *cache;
62548+ cbk_cache_slot *slot;
62549+ int i;
62550+
62551+ assert("nikita-352", node != NULL);
62552+
62553+ cache = &znode_get_tree(node)->cbk_cache;
62554+ assert("nikita-2472", cbk_cache_invariant(cache));
62555+
62556+ if (cache->nr_slots == 0)
62557+ return;
62558+
62559+ write_lock(&(cache->guard));
62560+ /* find slot to update/add */
62561+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
62562+ /* oops, this node is already in a cache */
62563+ if (slot->node == node)
62564+ break;
62565+ }
62566+ /* if all slots are used, reuse least recently used one */
62567+ if (i == cache->nr_slots) {
62568+ slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
62569+ slot->node = (znode *) node;
62570+ }
62571+ list_move(&slot->lru, &cache->lru);
62572+ write_unlock(&(cache->guard));
62573+ assert("nikita-2473", cbk_cache_invariant(cache));
62574+}
62575+
62576+static int setup_delimiting_keys(cbk_handle * h);
62577+static lookup_result coord_by_handle(cbk_handle * handle);
62578+static lookup_result traverse_tree(cbk_handle * h);
62579+static int cbk_cache_search(cbk_handle * h);
62580+
62581+static level_lookup_result cbk_level_lookup(cbk_handle * h);
62582+static level_lookup_result cbk_node_lookup(cbk_handle * h);
62583+
62584+/* helper functions */
62585+
62586+static void update_stale_dk(reiser4_tree * tree, znode * node);
62587+
62588+/* release parent node during traversal */
62589+static void put_parent(cbk_handle * h);
62590+/* check consistency of fields */
62591+static int sanity_check(cbk_handle * h);
62592+/* release resources in handle */
62593+static void hput(cbk_handle * h);
62594+
62595+static level_lookup_result search_to_left(cbk_handle * h);
62596+
62597+/* pack numerous (numberous I should say) arguments of coord_by_key() into
62598+ * cbk_handle */
62599+static cbk_handle *cbk_pack(cbk_handle * handle,
62600+ reiser4_tree * tree,
62601+ const reiser4_key * key,
62602+ coord_t * coord,
62603+ lock_handle * active_lh,
62604+ lock_handle * parent_lh,
62605+ znode_lock_mode lock_mode,
62606+ lookup_bias bias,
62607+ tree_level lock_level,
62608+ tree_level stop_level,
62609+ __u32 flags, ra_info_t * info)
62610+{
62611+ memset(handle, 0, sizeof *handle);
62612+
62613+ handle->tree = tree;
62614+ handle->key = key;
62615+ handle->lock_mode = lock_mode;
62616+ handle->bias = bias;
62617+ handle->lock_level = lock_level;
62618+ handle->stop_level = stop_level;
62619+ handle->coord = coord;
62620+ /* set flags. See comment in tree.h:cbk_flags */
62621+ handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
62622+
62623+ handle->active_lh = active_lh;
62624+ handle->parent_lh = parent_lh;
62625+ handle->ra_info = info;
62626+ return handle;
62627+}
62628+
62629+/* main tree lookup procedure
62630+
62631+ Check the coord cache. If the key we are looking for is not found there,
62632+ do a real tree traversal.
62633+
62634+ As we have extents on the twig level, @lock_level and @stop_level can
62635+ be different from LEAF_LEVEL and each other.
62636+
62637+ A thread cannot hold any reiser4 locks (tree, znode, dk spin-locks, or
62638+ znode long-term locks) while calling this.
62639+*/
62640+lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
62641+ * in. Usually this tree is
62642+ * part of file-system
62643+ * super-block */ ,
62644+ const reiser4_key * key /* key to look for */ ,
62645+ coord_t * coord /* where to store found
62646+ * position in a tree. Fields
62647+ * in "coord" are only valid if
62648+ * coord_by_key() returned
62649+ * "CBK_COORD_FOUND" */ ,
62650+ lock_handle * lh, /* resulting lock handle */
62651+ znode_lock_mode lock_mode /* type of lookup we
62652+ * want on node. Pass
62653+ * ZNODE_READ_LOCK here
62654+ * if you only want to
62655+ * read item found and
62656+ * ZNODE_WRITE_LOCK if
62657+ * you want to modify
62658+ * it */ ,
62659+ lookup_bias bias /* what to return if coord
62660+ * with exactly the @key is
62661+ * not in the tree */ ,
62662+ tree_level lock_level /* tree level where to start
62663+ * taking @lock type of
62664+ * locks */ ,
62665+ tree_level stop_level /* tree level to stop. Pass
62666+ * LEAF_LEVEL or TWIG_LEVEL
62667+ * here. Item being looked
62668+ * for has to be between
62669+ * @lock_level and
62670+ * @stop_level, inclusive */ ,
62671+ __u32 flags /* search flags */ ,
62672+ ra_info_t *
62673+ info
62674+ /* information about desired tree traversal readahead */
62675+ )
62676+{
62677+ cbk_handle handle;
62678+ lock_handle parent_lh;
62679+ lookup_result result;
62680+
62681+ init_lh(lh);
62682+ init_lh(&parent_lh);
62683+
62684+ assert("nikita-3023", reiser4_schedulable());
62685+
62686+ assert("nikita-353", tree != NULL);
62687+ assert("nikita-354", key != NULL);
62688+ assert("nikita-355", coord != NULL);
62689+ assert("nikita-356", (bias == FIND_EXACT)
62690+ || (bias == FIND_MAX_NOT_MORE_THAN));
62691+ assert("nikita-357", stop_level >= LEAF_LEVEL);
62692+ /* no locks can be held during tree traversal */
62693+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
62694+
62695+ cbk_pack(&handle,
62696+ tree,
62697+ key,
62698+ coord,
62699+ lh,
62700+ &parent_lh,
62701+ lock_mode, bias, lock_level, stop_level, flags, info);
62702+
62703+ result = coord_by_handle(&handle);
62704+ assert("nikita-3247",
62705+ ergo(!IS_CBKERR(result), coord->node == lh->node));
62706+ return result;
62707+}
62708+
62709+/* like coord_by_key(), but starts traversal from vroot of @object rather than
62710+ * from tree root. */
62711+lookup_result reiser4_object_lookup(struct inode * object,
62712+ const reiser4_key * key,
62713+ coord_t * coord,
62714+ lock_handle * lh,
62715+ znode_lock_mode lock_mode,
62716+ lookup_bias bias,
62717+ tree_level lock_level,
62718+ tree_level stop_level, __u32 flags,
62719+ ra_info_t * info)
62720+{
62721+ cbk_handle handle;
62722+ lock_handle parent_lh;
62723+ lookup_result result;
62724+
62725+ init_lh(lh);
62726+ init_lh(&parent_lh);
62727+
62728+ assert("nikita-3023", reiser4_schedulable());
62729+
62730+ assert("nikita-354", key != NULL);
62731+ assert("nikita-355", coord != NULL);
62732+ assert("nikita-356", (bias == FIND_EXACT)
62733+ || (bias == FIND_MAX_NOT_MORE_THAN));
62734+ assert("nikita-357", stop_level >= LEAF_LEVEL);
62735+ /* no locks can be held during tree search by key */
62736+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
62737+
62738+ cbk_pack(&handle,
62739+ object != NULL ? reiser4_tree_by_inode(object) : current_tree,
62740+ key,
62741+ coord,
62742+ lh,
62743+ &parent_lh,
62744+ lock_mode, bias, lock_level, stop_level, flags, info);
62745+ handle.object = object;
62746+
62747+ result = coord_by_handle(&handle);
62748+ assert("nikita-3247",
62749+ ergo(!IS_CBKERR(result), coord->node == lh->node));
62750+ return result;
62751+}
62752+
62753+/* lookup by cbk_handle. Common part of coord_by_key() and
62754+ reiser4_object_lookup(). */
62755+static lookup_result coord_by_handle(cbk_handle * handle)
62756+{
62757+ /*
62758+ * first check the cbk_cache (which is a look-aside cache for our tree)
62759+ * and, if this fails, start a traversal.
62760+ */
62761+ /* first check whether "key" is in cache of recent lookups. */
62762+ if (cbk_cache_search(handle) == 0)
62763+ return handle->result;
62764+ else
62765+ return traverse_tree(handle);
62766+}
62767+
62768+/* Execute actor for each item (or unit, depending on @through_units_p),
62769+ starting from @coord, rightward, until either:
62770+
62771+ - end of the tree is reached
62772+ - unformatted node is met
62773+ - error occurred
62774+ - @actor returns 0 or less
62775+
62776+ The error code, or the last actor's return value, is returned.
62777+
62778+ This is used by plugin/dir/hashed_dir.c:reiser4_find_entry() to move
62779+ through a sequence of entries with identical keys and the like.
62780+*/
62781+int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
62782+ coord_t * coord /* coord to start from */ ,
62783+ lock_handle * lh /* lock handle to start with and to
62784+ * update along the way */ ,
62785+ tree_iterate_actor_t actor /* function to call on each
62786+ * item/unit */ ,
62787+ void *arg /* argument to pass to @actor */ ,
62788+ znode_lock_mode mode /* lock mode on scanned nodes */ ,
62789+ int through_units_p /* call @actor on each item or on
62790+ * each unit */ )
62791+{
62792+ int result;
62793+
62794+ assert("nikita-1143", tree != NULL);
62795+ assert("nikita-1145", coord != NULL);
62796+ assert("nikita-1146", lh != NULL);
62797+ assert("nikita-1147", actor != NULL);
62798+
62799+ result = zload(coord->node);
62800+ coord_clear_iplug(coord);
62801+ if (result != 0)
62802+ return result;
62803+ if (!coord_is_existing_unit(coord)) {
62804+ zrelse(coord->node);
62805+ return -ENOENT;
62806+ }
62807+ while ((result = actor(tree, coord, lh, arg)) > 0) {
62808+ /* move further */
62809+ if ((through_units_p && coord_next_unit(coord)) ||
62810+ (!through_units_p && coord_next_item(coord))) {
62811+ do {
62812+ lock_handle couple;
62813+
62814+ /* move to the next node */
62815+ init_lh(&couple);
62816+ result =
62817+ reiser4_get_right_neighbor(&couple,
62818+ coord->node,
62819+ (int)mode,
62820+ GN_CAN_USE_UPPER_LEVELS);
62821+ zrelse(coord->node);
62822+ if (result == 0) {
62823+
62824+ result = zload(couple.node);
62825+ if (result != 0) {
62826+ done_lh(&couple);
62827+ return result;
62828+ }
62829+
62830+ coord_init_first_unit(coord,
62831+ couple.node);
62832+ done_lh(lh);
62833+ move_lh(lh, &couple);
62834+ } else
62835+ return result;
62836+ } while (node_is_empty(coord->node));
62837+ }
62838+
62839+ assert("nikita-1149", coord_is_existing_unit(coord));
62840+ }
62841+ zrelse(coord->node);
62842+ return result;
62843+}
62844+
62845+/* return locked uber znode for @tree */
62846+int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
62847+ znode_lock_request pri, lock_handle * lh)
62848+{
62849+ int result;
62850+
62851+ result = longterm_lock_znode(lh, tree->uber, mode, pri);
62852+ return result;
62853+}
62854+
62855+/* true if @key is strictly within @node
62856+
62857+ we are looking for a possibly non-unique key, and the item is at the
62858+ edge of @node. Maybe it is in the neighbor.
62859+*/
62860+static int znode_contains_key_strict(znode * node /* node to check key
62861+ * against */ ,
62862+ const reiser4_key *
62863+ key /* key to check */ ,
62864+ int isunique)
62865+{
62866+ int answer;
62867+
62868+ assert("nikita-1760", node != NULL);
62869+ assert("nikita-1722", key != NULL);
62870+
62871+ if (keyge(key, &node->rd_key))
62872+ return 0;
62873+
62874+ answer = keycmp(&node->ld_key, key);
62875+
62876+ if (isunique)
62877+ return answer != GREATER_THAN;
62878+ else
62879+ return answer == LESS_THAN;
62880+}
62881+
62882+/*
62883+ * Virtual Root (vroot) code.
62884+ *
62885+ * For a given file system object (e.g., a regular file or directory)
62886+ * let's define its "virtual root" as the lowest node in the tree (that
62887+ * is, the one furthest from the tree root) such that all body items of
62888+ * the object are located in the tree rooted at this node.
62889+ *
62890+ * Once the vroot of an object is found, all tree lookups for items
62891+ * within the body of this object ("object lookups") can be started
62892+ * from its vroot rather than from the real root. This has the
62893+ * following advantages:
62894+ *
62895+ * 1. the number of nodes traversed during lookup (and, hence, the
62896+ * number of key comparisons made) decreases, and
62897+ *
62898+ * 2. contention on the tree root is decreased. The latter was the
62899+ * actual motivation behind vroot: the spin lock of the root node,
62900+ * taken when acquiring a long-term lock on it, is the hottest
62901+ * lock in reiser4.
62902+ *
62903+ * How to find the vroot.
62904+ *
62905+ * When the vroot of object F is not yet determined, all object
62906+ * lookups start from the tree root. At each level during traversal
62907+ * we have a node N such that the key we are looking for (a key
62908+ * inside the object's body) lies within N. In handle_vroot(),
62909+ * called from cbk_level_lookup(), we check whether N is a possible
62910+ * vroot for F. The check is trivial---if neither the leftmost nor
62911+ * the rightmost item of N belongs to F (we already have the helpful
62912+ * ->owns_item() method of the object plugin for this), N is a
62913+ * possible vroot of F. This relies on the assumption that each
62914+ * object occupies a contiguous range of keys in the tree.
62915+ *
62916+ * Thus, traversing the tree downward and checking each node as we go,
62917+ * we find the lowest such node, which by definition is the vroot.
62918+ *
62919+ * How to track the vroot.
62920+ *
62921+ * Nohow. If the actual vroot changes, the next object lookup just
62922+ * restarts from the actual tree root, refreshing the object's vroot.
62923+ */
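In practice this means body lookups go through reiser4_object_lookup() (defined earlier in this file) rather than coord_by_key(); a hypothetical caller:

/* Sketch only: look up a body item of @inode starting from its vroot
 * when one is known, falling back to the real tree root otherwise. */
static int body_lookup(struct inode *inode, const reiser4_key *key,
		       coord_t *coord, lock_handle *lh)
{
	return reiser4_object_lookup(inode, key, coord, lh,
				     ZNODE_READ_LOCK, FIND_EXACT,
				     LEAF_LEVEL, LEAF_LEVEL,
				     CBK_UNIQUE, NULL);
}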
62924+
62925+/*
62926+ * Check whether @node is possible vroot of @object.
62927+ */
62928+static void handle_vroot(struct inode *object, znode * node)
62929+{
62930+ file_plugin *fplug;
62931+ coord_t coord;
62932+
62933+ fplug = inode_file_plugin(object);
62934+ assert("nikita-3353", fplug != NULL);
62935+ assert("nikita-3354", fplug->owns_item != NULL);
62936+
62937+ if (unlikely(node_is_empty(node)))
62938+ return;
62939+
62940+ coord_init_first_unit(&coord, node);
62941+ /*
62942+ * if leftmost item of @node belongs to @object, we cannot be sure
62943+ * that @node is vroot of @object, because, some items of @object are
62944+ * probably in the sub-tree rooted at the left neighbor of @node.
62945+ */
62946+ if (fplug->owns_item(object, &coord))
62947+ return;
62948+ coord_init_last_unit(&coord, node);
62949+ /* mutatis mutandis for the rightmost item */
62950+ if (fplug->owns_item(object, &coord))
62951+ return;
62952+ /* otherwise, @node is possible vroot of @object */
62953+ inode_set_vroot(object, node);
62954+}
62955+
62956+/*
62957+ * helper function used by traverse_tree() to start tree traversal not
62958+ * from the tree root, but from @h->object's vroot, if possible.
62959+ */
62960+static int prepare_object_lookup(cbk_handle * h)
62961+{
62962+ znode *vroot;
62963+ int result;
62964+
62965+ vroot = inode_get_vroot(h->object);
62966+ if (vroot == NULL) {
62967+ /*
62968+ * the object doesn't have a known vroot; start from the real tree root.
62969+ */
62970+ return LOOKUP_CONT;
62971+ }
62972+
62973+ h->level = znode_get_level(vroot);
62974+ /* take a long-term lock on vroot */
62975+ h->result = longterm_lock_znode(h->active_lh, vroot,
62976+ cbk_lock_mode(h->level, h),
62977+ ZNODE_LOCK_LOPRI);
62978+ result = LOOKUP_REST;
62979+ if (h->result == 0) {
62980+ int isunique;
62981+ int inside;
62982+
62983+ isunique = h->flags & CBK_UNIQUE;
62984+ /* check that key is inside vroot */
62985+ read_lock_dk(h->tree);
62986+ inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
62987+ !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
62988+ read_unlock_dk(h->tree);
62989+ if (inside) {
62990+ h->result = zload(vroot);
62991+ if (h->result == 0) {
62992+ /* search for key in vroot. */
62993+ result = cbk_node_lookup(h);
62994+ zrelse(vroot); /*h->active_lh->node); */
62995+ if (h->active_lh->node != vroot) {
62996+ result = LOOKUP_REST;
62997+ } else if (result == LOOKUP_CONT) {
62998+ move_lh(h->parent_lh, h->active_lh);
62999+ h->flags &= ~CBK_DKSET;
63000+ }
63001+ }
63002+ }
63003+ }
63004+
63005+ zput(vroot);
63006+
63007+ if (IS_CBKERR(h->result) || result == LOOKUP_REST)
63008+ hput(h);
63009+ return result;
63010+}
63011+
63012+/* main function that handles common parts of tree traversal: starting
63013+ (fake znode handling), restarts, error handling, completion */
63014+static lookup_result traverse_tree(cbk_handle * h /* search handle */ )
63015+{
63016+ int done;
63017+ int iterations;
63018+ int vroot_used;
63019+
63020+ assert("nikita-365", h != NULL);
63021+ assert("nikita-366", h->tree != NULL);
63022+ assert("nikita-367", h->key != NULL);
63023+ assert("nikita-368", h->coord != NULL);
63024+ assert("nikita-369", (h->bias == FIND_EXACT)
63025+ || (h->bias == FIND_MAX_NOT_MORE_THAN));
63026+ assert("nikita-370", h->stop_level >= LEAF_LEVEL);
63027+ assert("nikita-2949", !(h->flags & CBK_DKSET));
63028+ assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
63029+
63030+ done = 0;
63031+ iterations = 0;
63032+ vroot_used = 0;
63033+
63034+ /* loop for restarts */
63035+ restart:
63036+
63037+ assert("nikita-3024", reiser4_schedulable());
63038+
63039+ h->result = CBK_COORD_FOUND;
63040+ /* connect_znode() needs it */
63041+ h->ld_key = *reiser4_min_key();
63042+ h->rd_key = *reiser4_max_key();
63043+ h->flags |= CBK_DKSET;
63044+ h->error = NULL;
63045+
63046+ if (!vroot_used && h->object != NULL) {
63047+ vroot_used = 1;
63048+ done = prepare_object_lookup(h);
63049+ if (done == LOOKUP_REST) {
63050+ goto restart;
63051+ } else if (done == LOOKUP_DONE)
63052+ return h->result;
63053+ }
63054+ if (h->parent_lh->node == NULL) {
63055+ done =
63056+ get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
63057+ h->parent_lh);
63058+
63059+ assert("nikita-1637", done != -E_DEADLOCK);
63060+
63061+ h->block = h->tree->root_block;
63062+ h->level = h->tree->height;
63063+ h->coord->node = h->parent_lh->node;
63064+
63065+ if (done != 0)
63066+ return done;
63067+ }
63068+
63069+ /* loop descending a tree */
63070+ while (!done) {
63071+
63072+ if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
63073+ IS_POW(iterations))) {
63074+ warning("nikita-1481", "Too many iterations: %i",
63075+ iterations);
63076+ reiser4_print_key("key", h->key);
63077+ ++iterations;
63078+ } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
63079+ h->error =
63080+ "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
63081+ h->result = RETERR(-EIO);
63082+ break;
63083+ }
63084+ switch (cbk_level_lookup(h)) {
63085+ case LOOKUP_CONT:
63086+ move_lh(h->parent_lh, h->active_lh);
63087+ continue;
63088+ default:
63089+ wrong_return_value("nikita-372", "cbk_level");
63090+ case LOOKUP_DONE:
63091+ done = 1;
63092+ break;
63093+ case LOOKUP_REST:
63094+ hput(h);
63095+ /* deadlock avoidance is a normal case. */
63096+ if (h->result != -E_DEADLOCK)
63097+ ++iterations;
63098+ reiser4_preempt_point();
63099+ goto restart;
63100+ }
63101+ }
63102+ /* that's all. The rest is error handling */
63103+ if (unlikely(h->error != NULL)) {
63104+ warning("nikita-373", "%s: level: %i, "
63105+ "lock_level: %i, stop_level: %i "
63106+ "lock_mode: %s, bias: %s",
63107+ h->error, h->level, h->lock_level, h->stop_level,
63108+ lock_mode_name(h->lock_mode), bias_name(h->bias));
63109+ reiser4_print_address("block", &h->block);
63110+ reiser4_print_key("key", h->key);
63111+ print_coord_content("coord", h->coord);
63112+ }
63113+ /* `unlikely' error case */
63114+ if (unlikely(IS_CBKERR(h->result))) {
63115+ /* failure. do cleanup */
63116+ hput(h);
63117+ } else {
63118+ assert("nikita-1605", WITH_DATA_RET
63119+ (h->coord->node, 1,
63120+ ergo((h->result == CBK_COORD_FOUND) &&
63121+ (h->bias == FIND_EXACT) &&
63122+ (!node_is_empty(h->coord->node)),
63123+ coord_is_existing_item(h->coord))));
63124+ }
63125+ return h->result;
63126+}
63127+
63128+/* find delimiting keys of child
63129+
63130+ Determine left and right delimiting keys for child pointed to by
63131+ @parent_coord.
63132+
63133+*/
63134+static void find_child_delimiting_keys(znode * parent /* parent znode, passed
63135+ * locked */ ,
63136+ const coord_t * parent_coord /* coord where
63137+ * pointer to
63138+ * child is
63139+ * stored */ ,
63140+ reiser4_key * ld /* where to store left
63141+ * delimiting key */ ,
63142+ reiser4_key * rd /* where to store right
63143+ * delimiting key */ )
63144+{
63145+ coord_t neighbor;
63146+
63147+ assert("nikita-1484", parent != NULL);
63148+ assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
63149+
63150+ coord_dup(&neighbor, parent_coord);
63151+
63152+ if (neighbor.between == AT_UNIT)
63153+ /* imitate item ->lookup() behavior. */
63154+ neighbor.between = AFTER_UNIT;
63155+
63156+ if (coord_set_to_left(&neighbor) == 0)
63157+ unit_key_by_coord(&neighbor, ld);
63158+ else {
63159+ assert("nikita-14851", 0);
63160+ *ld = *znode_get_ld_key(parent);
63161+ }
63162+
63163+ coord_dup(&neighbor, parent_coord);
63164+ if (neighbor.between == AT_UNIT)
63165+ neighbor.between = AFTER_UNIT;
63166+ if (coord_set_to_right(&neighbor) == 0)
63167+ unit_key_by_coord(&neighbor, rd);
63168+ else
63169+ *rd = *znode_get_rd_key(parent);
63170+}
63171+
63172+/*
63173+ * setup delimiting keys for a child
63174+ *
63175+ * @parent parent node
63176+ *
63177+ * @coord location in @parent where pointer to @child is
63178+ *
63179+ * @child child node
63180+ */
63181+int
63182+set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child)
63183+{
63184+ reiser4_tree *tree;
63185+
63186+ assert("nikita-2952",
63187+ znode_get_level(parent) == znode_get_level(coord->node));
63188+
63189+ /* fast check without taking dk lock. This is safe, because
63190+ * JNODE_DKSET is never cleared once set. */
63191+ if (!ZF_ISSET(child, JNODE_DKSET)) {
63192+ tree = znode_get_tree(parent);
63193+ write_lock_dk(tree);
63194+ if (likely(!ZF_ISSET(child, JNODE_DKSET))) {
63195+ find_child_delimiting_keys(parent, coord,
63196+ &child->ld_key,
63197+ &child->rd_key);
63198+ ON_DEBUG(child->ld_key_version =
63199+ atomic_inc_return(&delim_key_version);
63200+ child->rd_key_version =
63201+ atomic_inc_return(&delim_key_version););
63202+ ZF_SET(child, JNODE_DKSET);
63203+ }
63204+ write_unlock_dk(tree);
63205+ return 1;
63206+ }
63207+ return 0;
63208+}
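+/* Illustrative sketch (editorial note, not part of the original patch):
+ * the lockless test / lock / re-test sequence above is safe only because
+ * JNODE_DKSET transitions 0 -> 1 exactly once and is never cleared.
+ * Generically, with hypothetical helpers:
+ *
+ *	if (!flag_is_set(obj)) {            // cheap lockless check
+ *		lock(&guard);
+ *		if (!flag_is_set(obj)) {    // re-check under the lock
+ *			initialize(obj);    // fill in the protected fields
+ *			set_flag(obj);      // publish while still locked
+ *		}
+ *		unlock(&guard);
+ *	}
+ *
+ * A false negative on the lockless check costs one extra lock round-trip;
+ * a false positive cannot happen because the flag is monotonic.
+ */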
63209+
63210+/* Perform tree lookup at one level. This is called from the cbk_traverse()
63211+   function that drives the lookup through the tree and calls cbk_node_lookup()
63212+   to perform the lookup within one node.
63213+
63214+   See comments in the code.
63215+*/
63216+static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ )
63217+{
63218+ int ret;
63219+ int setdk;
63220+ int ldkeyset = 0;
63221+ reiser4_key ldkey;
63222+ reiser4_key key;
63223+ znode *active;
63224+
63225+ assert("nikita-3025", reiser4_schedulable());
63226+
63227+ /* acquire reference to @active node */
63228+ active =
63229+ zget(h->tree, &h->block, h->parent_lh->node, h->level,
63230+ reiser4_ctx_gfp_mask_get());
63231+
63232+ if (IS_ERR(active)) {
63233+ h->result = PTR_ERR(active);
63234+ return LOOKUP_DONE;
63235+ }
63236+
63237+ /* lock @active */
63238+ h->result = longterm_lock_znode(h->active_lh,
63239+ active,
63240+ cbk_lock_mode(h->level, h),
63241+ ZNODE_LOCK_LOPRI);
63242+ /* longterm_lock_znode() acquires additional reference to znode (which
63243+ will be later released by longterm_unlock_znode()). Release
63244+ reference acquired by zget().
63245+ */
63246+ zput(active);
63247+ if (unlikely(h->result != 0))
63248+ goto fail_or_restart;
63249+
63250+ setdk = 0;
63251+ /* if @active is accessed for the first time, setup delimiting keys on
63252+ it. Delimiting keys are taken from the parent node. See
63253+ setup_delimiting_keys() for details.
63254+ */
63255+ if (h->flags & CBK_DKSET) {
63256+ setdk = setup_delimiting_keys(h);
63257+ h->flags &= ~CBK_DKSET;
63258+ } else {
63259+ znode *parent;
63260+
63261+ parent = h->parent_lh->node;
63262+ h->result = zload(parent);
63263+ if (unlikely(h->result != 0))
63264+ goto fail_or_restart;
63265+
63266+ if (!ZF_ISSET(active, JNODE_DKSET))
63267+ setdk = set_child_delimiting_keys(parent,
63268+ h->coord, active);
63269+ else {
63270+ read_lock_dk(h->tree);
63271+ find_child_delimiting_keys(parent, h->coord, &ldkey,
63272+ &key);
63273+ read_unlock_dk(h->tree);
63274+ ldkeyset = 1;
63275+ }
63276+ zrelse(parent);
63277+ }
63278+
63279+	/* this is an ugly kludge. Reminder: this is necessary, because
63280+ ->lookup() method returns coord with ->between field probably set
63281+ to something different from AT_UNIT.
63282+ */
63283+ h->coord->between = AT_UNIT;
63284+
63285+ if (znode_just_created(active) && (h->coord->node != NULL)) {
63286+ write_lock_tree(h->tree);
63287+ /* if we are going to load znode right now, setup
63288+ ->in_parent: coord where pointer to this node is stored in
63289+ parent.
63290+ */
63291+ coord_to_parent_coord(h->coord, &active->in_parent);
63292+ write_unlock_tree(h->tree);
63293+ }
63294+
63295+ /* check connectedness without holding tree lock---false negatives
63296+ * will be re-checked by connect_znode(), and false positives are
63297+ * impossible---@active cannot suddenly turn into unconnected
63298+ * state. */
63299+ if (!znode_is_connected(active)) {
63300+ h->result = connect_znode(h->coord, active);
63301+ if (unlikely(h->result != 0)) {
63302+ put_parent(h);
63303+ goto fail_or_restart;
63304+ }
63305+ }
63306+
63307+ jload_prefetch(ZJNODE(active));
63308+
63309+ if (setdk)
63310+ update_stale_dk(h->tree, active);
63311+
63312+ /* put_parent() cannot be called earlier, because connect_znode()
63313+ assumes parent node is referenced; */
63314+ put_parent(h);
63315+
63316+ if ((!znode_contains_key_lock(active, h->key) &&
63317+ (h->flags & CBK_TRUST_DK))
63318+ || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
63319+ /* 1. key was moved out of this node while this thread was
63320+		   waiting for the lock. Restart. A more elaborate solution
63321+		   would be to determine where the key moved (to the left, or to the right)
63322+ and try to follow it through sibling pointers.
63323+
63324+ 2. or, node itself is going to be removed from the
63325+ tree. Release lock and restart.
63326+ */
63327+ h->result = -E_REPEAT;
63328+ }
63329+ if (h->result == -E_REPEAT)
63330+ return LOOKUP_REST;
63331+
63332+ h->result = zload_ra(active, h->ra_info);
63333+ if (h->result) {
63334+ return LOOKUP_DONE;
63335+ }
63336+
63337+ /* sanity checks */
63338+ if (sanity_check(h)) {
63339+ zrelse(active);
63340+ return LOOKUP_DONE;
63341+ }
63342+
63343+ /* check that key of leftmost item in the @active is the same as in
63344+ * its parent */
63345+ if (ldkeyset && !node_is_empty(active) &&
63346+ !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
63347+ warning("vs-3533", "Keys are inconsistent. Fsck?");
63348+ reiser4_print_key("inparent", &ldkey);
63349+ reiser4_print_key("inchild", &key);
63350+ h->result = RETERR(-EIO);
63351+ zrelse(active);
63352+ return LOOKUP_DONE;
63353+ }
63354+
63355+ if (h->object != NULL)
63356+ handle_vroot(h->object, active);
63357+
63358+ ret = cbk_node_lookup(h);
63359+
63360+ /* h->active_lh->node might change, but active is yet to be zrelsed */
63361+ zrelse(active);
63362+
63363+ return ret;
63364+
63365+ fail_or_restart:
63366+ if (h->result == -E_DEADLOCK)
63367+ return LOOKUP_REST;
63368+ return LOOKUP_DONE;
63369+}
63370+
63371+#if REISER4_DEBUG
63372+/* check left and right delimiting keys of a znode */
63373+void check_dkeys(znode * node)
63374+{
63375+ znode *left;
63376+ znode *right;
63377+
63378+ read_lock_tree(current_tree);
63379+ read_lock_dk(current_tree);
63380+
63381+ assert("vs-1710", znode_is_any_locked(node));
63382+ assert("vs-1197",
63383+ !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
63384+
63385+ left = node->left;
63386+ right = node->right;
63387+
63388+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
63389+ && left != NULL && ZF_ISSET(left, JNODE_DKSET))
63390+		/* check left neighbor. Note that the left neighbor is not locked,
63391+		   so its delimiting keys might therefore be stale */
63392+ assert("vs-1198",
63393+ (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
63394+ || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
63395+
63396+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
63397+ && right != NULL && ZF_ISSET(right, JNODE_DKSET))
63398+		/* check right neighbor. Note that the right neighbor is not
63399+		   locked, so its delimiting keys might therefore be stale */
63400+ assert("vs-1199",
63401+ (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
63402+ || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
63403+
63404+ read_unlock_dk(current_tree);
63405+ read_unlock_tree(current_tree);
63406+}
63407+#endif
63408+
63409+/* true if @key is left delimiting key of @node */
63410+static int key_is_ld(znode * node, const reiser4_key * key)
63411+{
63412+ int ld;
63413+
63414+ assert("nikita-1716", node != NULL);
63415+ assert("nikita-1758", key != NULL);
63416+
63417+ read_lock_dk(znode_get_tree(node));
63418+ assert("nikita-1759", znode_contains_key(node, key));
63419+ ld = keyeq(znode_get_ld_key(node), key);
63420+ read_unlock_dk(znode_get_tree(node));
63421+ return ld;
63422+}
63423+
63424+/* Process one node during tree traversal.
63425+
63426+ This is called by cbk_level_lookup(). */
63427+static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ )
63428+{
63429+ /* node plugin of @active */
63430+ node_plugin *nplug;
63431+ /* item plugin of item that was found */
63432+ item_plugin *iplug;
63433+ /* search bias */
63434+ lookup_bias node_bias;
63435+ /* node we are operating upon */
63436+ znode *active;
63437+ /* tree we are searching in */
63438+ reiser4_tree *tree;
63439+ /* result */
63440+ int result;
63441+
63442+ assert("nikita-379", h != NULL);
63443+
63444+ active = h->active_lh->node;
63445+ tree = h->tree;
63446+
63447+ nplug = active->nplug;
63448+ assert("nikita-380", nplug != NULL);
63449+
63450+ ON_DEBUG(check_dkeys(active));
63451+
63452+ /* return item from "active" node with maximal key not greater than
63453+ "key" */
63454+ node_bias = h->bias;
63455+ result = nplug->lookup(active, h->key, node_bias, h->coord);
63456+ if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
63457+ /* error occurred */
63458+ h->result = result;
63459+ return LOOKUP_DONE;
63460+ }
63461+ if (h->level == h->stop_level) {
63462+ /* welcome to the stop level */
63463+ assert("nikita-381", h->coord->node == active);
63464+ if (result == NS_FOUND) {
63465+ /* success of tree lookup */
63466+ if (!(h->flags & CBK_UNIQUE)
63467+ && key_is_ld(active, h->key)) {
63468+ return search_to_left(h);
63469+ } else
63470+ h->result = CBK_COORD_FOUND;
63471+ } else {
63472+ h->result = CBK_COORD_NOTFOUND;
63473+ }
63474+ if (!(h->flags & CBK_IN_CACHE))
63475+ cbk_cache_add(active);
63476+ return LOOKUP_DONE;
63477+ }
63478+
63479+ if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
63480+ h->error = "not found on internal node";
63481+ h->result = result;
63482+ return LOOKUP_DONE;
63483+ }
63484+
63485+ assert("vs-361", h->level > h->stop_level);
63486+
63487+ if (handle_eottl(h, &result)) {
63488+ assert("vs-1674", (result == LOOKUP_DONE ||
63489+ result == LOOKUP_REST));
63490+ return result;
63491+ }
63492+
63493+ /* go down to next level */
63494+ check_me("vs-12", zload(h->coord->node) == 0);
63495+ assert("nikita-2116", item_is_internal(h->coord));
63496+ iplug = item_plugin_by_coord(h->coord);
63497+ iplug->s.internal.down_link(h->coord, h->key, &h->block);
63498+ zrelse(h->coord->node);
63499+ --h->level;
63500+ return LOOKUP_CONT; /* continue */
63501+}
63502+
63503+/* scan cbk_cache slots looking for a match for @h */
63504+static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ )
63505+{
63506+ level_lookup_result llr;
63507+ znode *node;
63508+ reiser4_tree *tree;
63509+ cbk_cache_slot *slot;
63510+ cbk_cache *cache;
63511+ tree_level level;
63512+ int isunique;
63513+ const reiser4_key *key;
63514+ int result;
63515+
63516+ assert("nikita-1317", h != NULL);
63517+ assert("nikita-1315", h->tree != NULL);
63518+ assert("nikita-1316", h->key != NULL);
63519+
63520+ tree = h->tree;
63521+ cache = &tree->cbk_cache;
63522+ if (cache->nr_slots == 0)
63523+ /* size of cbk cache was set to 0 by mount time option. */
63524+ return RETERR(-ENOENT);
63525+
63526+ assert("nikita-2474", cbk_cache_invariant(cache));
63527+ node = NULL; /* to keep gcc happy */
63528+ level = h->level;
63529+ key = h->key;
63530+ isunique = h->flags & CBK_UNIQUE;
63531+ result = RETERR(-ENOENT);
63532+
63533+ /*
63534+	 * this is a time-critical function and dragons have, hence, been
63535+	 * settled here.
63536+ *
63537+ * Loop below scans cbk cache slots trying to find matching node with
63538+ * suitable range of delimiting keys and located at the h->level.
63539+ *
63540+ * Scan is done under cbk cache spin lock that protects slot->node
63541+ * pointers. If suitable node is found we want to pin it in
63542+ * memory. But slot->node can point to the node with x_count 0
63543+ * (unreferenced). Such node can be recycled at any moment, or can
63544+ * already be in the process of being recycled (within jput()).
63545+ *
63546+ * As we found node in the cbk cache, it means that jput() hasn't yet
63547+ * called cbk_cache_invalidate().
63548+ *
63549+ * We acquire reference to the node without holding tree lock, and
63550+ * later, check node's RIP bit. This avoids races with jput().
63551+ */
63552+
63553+ rcu_read_lock();
63554+ read_lock(&((cbk_cache *)cache)->guard);
63555+
63556+ slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
63557+ slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
63558+ BUG_ON(&slot->lru != &cache->lru);/*????*/
63559+ while (1) {
63560+
63561+ slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
63562+
63563+ if (&cache->lru != &slot->lru)
63564+ node = slot->node;
63565+ else
63566+ node = NULL;
63567+
63568+ if (unlikely(node == NULL))
63569+ break;
63570+
63571+ /*
63572+ * this is (hopefully) the only place in the code where we are
63573+ * working with delimiting keys without holding dk lock. This
63574+	 * is fine here, because this is only a "guess" anyway---keys
63575+ * are rechecked under dk lock below.
63576+ */
63577+ if (znode_get_level(node) == level &&
63578+ /* reiser4_min_key < key < reiser4_max_key */
63579+ znode_contains_key_strict(node, key, isunique)) {
63580+ zref(node);
63581+ result = 0;
63582+ spin_lock_prefetch(&tree->tree_lock);
63583+ break;
63584+ }
63585+ }
63586+ read_unlock(&((cbk_cache *)cache)->guard);
63587+
63588+ assert("nikita-2475", cbk_cache_invariant(cache));
63589+
63590+ if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
63591+ result = -ENOENT;
63592+
63593+ rcu_read_unlock();
63594+
63595+ if (result != 0) {
63596+ h->result = CBK_COORD_NOTFOUND;
63597+ return RETERR(-ENOENT);
63598+ }
63599+
63600+ result =
63601+ longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
63602+ ZNODE_LOCK_LOPRI);
63603+ zput(node);
63604+ if (result != 0)
63605+ return result;
63606+ result = zload(node);
63607+ if (result != 0)
63608+ return result;
63609+
63610+ /* recheck keys */
63611+ read_lock_dk(tree);
63612+ result = (znode_contains_key_strict(node, key, isunique) &&
63613+ !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
63614+ read_unlock_dk(tree);
63615+ if (result) {
63616+ /* do lookup inside node */
63617+ llr = cbk_node_lookup(h);
63618+ /* if cbk_node_lookup() wandered to another node (due to eottl
63619+ or non-unique keys), adjust @node */
63620+ /*node = h->active_lh->node; */
63621+
63622+ if (llr != LOOKUP_DONE) {
63623+ /* restart or continue on the next level */
63624+ result = RETERR(-ENOENT);
63625+ } else if (IS_CBKERR(h->result))
63626+ /* io or oom */
63627+ result = RETERR(-ENOENT);
63628+ else {
63629+ /* good. Either item found or definitely not found. */
63630+ result = 0;
63631+
63632+ write_lock(&(cache->guard));
63633+ if (slot->node == h->active_lh->node /*node */ ) {
63634+ /* if this node is still in cbk cache---move
63635+ its slot to the head of the LRU list. */
63636+ list_move(&slot->lru, &cache->lru);
63637+ }
63638+ write_unlock(&(cache->guard));
63639+ }
63640+ } else {
63641+ /* race. While this thread was waiting for the lock, node was
63642+ rebalanced and item we are looking for, shifted out of it
63643+ (if it ever was here).
63644+
63645+		   Continuing the scan is almost hopeless: the node the key
63646+		   range was moved to is almost certainly at the beginning of
63647+		   the LRU list by now, because it is hot, but restarting the
63648+		   scan from the very beginning is complex. Just return, so
63649+		   that a full cbk() will be performed. This is not that
63650+		   important, because such races should be rare. Are they?
63651+ */
63652+ result = RETERR(-ENOENT); /* -ERAUGHT */
63653+ }
63654+ zrelse(node);
63655+ assert("nikita-2476", cbk_cache_invariant(cache));
63656+ return result;
63657+}
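+/* Illustrative sketch (editorial note, not part of the original patch) of
+ * the pinning protocol used above, assuming objects are freed via RCU
+ * after their RIP ("recycling in progress") bit is set. The hypothetical
+ * helpers stand in for the cbk cache scan, zref()/zput() and
+ * ZF_ISSET(node, JNODE_RIP):
+ *
+ *	rcu_read_lock();
+ *	obj = find_in_cache(key);       // pointer stays valid inside the
+ *	                                // RCU read-side section
+ *	if (obj != NULL) {
+ *		take_reference(obj);    // pin before leaving the section
+ *		if (rip_bit_set(obj)) { // recycling already under way?
+ *			drop_reference(obj);
+ *			obj = NULL;     // treat as a cache miss
+ *		}
+ *	}
+ *	rcu_read_unlock();
+ */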
63658+
63659+/* look for item with given key in the coord cache
63660+
63661+ This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
63662+   which is a small LRU list of recently accessed znodes. For each znode
63663+   in this list, it checks whether the key we are looking for fits into the
63664+   key range covered by that node. If so, and if, in addition, the node lies
63665+   at an allowed level (this is to handle extents on the twig level), the
63666+   node is locked and a lookup inside it is performed.
63667+
63668+   We need a measurement of the cost of this cache search compared to the
63669+   cost of coord_by_key().
63670+
63671+*/
63672+static int cbk_cache_search(cbk_handle * h /* cbk handle */ )
63673+{
63674+ int result = 0;
63675+ tree_level level;
63676+
63677+	/* add CBK_IN_CACHE to the handle flags. This tells cbk_node_lookup()
63678+	 * that the cbk_cache is being scanned, so it will not add the found
63679+	 * node to the cache again. */
63680+ h->flags |= CBK_IN_CACHE;
63681+ for (level = h->stop_level; level <= h->lock_level; ++level) {
63682+ h->level = level;
63683+ result = cbk_cache_scan_slots(h);
63684+ if (result != 0) {
63685+ done_lh(h->active_lh);
63686+ done_lh(h->parent_lh);
63687+ } else {
63688+ assert("nikita-1319", !IS_CBKERR(h->result));
63689+ break;
63690+ }
63691+ }
63692+ h->flags &= ~CBK_IN_CACHE;
63693+ return result;
63694+}
63695+
63696+/* type of lock we want to obtain during tree traversal. On the stop level
63697+   we want the type of lock the user asked for; on upper levels, a read lock. */
63698+znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
63699+{
63700+ assert("nikita-382", h != NULL);
63701+
63702+ return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
63703+}
63704+
63705+/* update outdated delimiting keys */
63706+static void stale_dk(reiser4_tree * tree, znode * node)
63707+{
63708+ znode *right;
63709+
63710+ read_lock_tree(tree);
63711+ write_lock_dk(tree);
63712+ right = node->right;
63713+
63714+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
63715+ right && ZF_ISSET(right, JNODE_DKSET) &&
63716+ !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
63717+ znode_set_rd_key(node, znode_get_ld_key(right));
63718+
63719+ write_unlock_dk(tree);
63720+ read_unlock_tree(tree);
63721+}
63722+
63723+/* check for possibly outdated delimiting keys, and update them if
63724+ * necessary. */
63725+static void update_stale_dk(reiser4_tree * tree, znode * node)
63726+{
63727+ znode *right;
63728+ reiser4_key rd;
63729+
63730+ read_lock_tree(tree);
63731+ read_lock_dk(tree);
63732+ rd = *znode_get_rd_key(node);
63733+ right = node->right;
63734+ if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
63735+ right && ZF_ISSET(right, JNODE_DKSET) &&
63736+ !keyeq(&rd, znode_get_ld_key(right)))) {
63737+ assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
63738+ read_unlock_dk(tree);
63739+ read_unlock_tree(tree);
63740+ stale_dk(tree, node);
63741+ return;
63742+ }
63743+ read_unlock_dk(tree);
63744+ read_unlock_tree(tree);
63745+}
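+/* Editorial note (not part of the original patch): update_stale_dk() is
+ * the optimistic half of a read-mostly pattern: it checks the condition
+ * under cheap read locks and, only when an update is actually needed,
+ * drops them and re-takes the dk lock in write mode inside stale_dk(),
+ * which re-derives the new right delimiting key under the write lock,
+ * since the tree may have changed between the unlock and the relock.
+ */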
63746+
63747+/*
63748+ * handle searches for a non-unique key.
63749+ *
63750+ * Suppose that we are looking for an item with possibly non-unique key 100.
63751+ *
63752+ * The root node contains two pointers: one to a node with left delimiting
63753+ * key 0, and another to a node with left delimiting key 100. The item we
63754+ * are interested in may well be in the sub-tree rooted at the first pointer.
63755+ *
63756+ * To handle this, search_to_left() is called when the search reaches the
63757+ * stop level. This function checks whether it is _possible_ that the item
63758+ * we are looking for is in the left neighbor (this can be done by comparing
63759+ * delimiting keys) and, if so, tries to lock the left neighbor (a low
63760+ * priority lock, so it can deadlock; tree traversal is simply restarted if
63761+ * it does) and then checks whether the left neighbor actually contains
63762+ * items with our key.
63763+ *
63764+ * Note that this is done on the stop level only. It is possible to try such
63765+ * a left-check on each level, but as duplicate keys are supposed to be rare
63766+ * (it is very unlikely that more than one node is completely filled with
63767+ * items with duplicate keys), it is cheaper to scan left on the stop level once.
63768+ */
63769+static level_lookup_result search_to_left(cbk_handle * h /* search handle */ )
63770+{
63771+ level_lookup_result result;
63772+ coord_t *coord;
63773+ znode *node;
63774+ znode *neighbor;
63775+
63776+ lock_handle lh;
63777+
63778+ assert("nikita-1761", h != NULL);
63779+ assert("nikita-1762", h->level == h->stop_level);
63780+
63781+ init_lh(&lh);
63782+ coord = h->coord;
63783+ node = h->active_lh->node;
63784+ assert("nikita-1763", coord_is_leftmost_unit(coord));
63785+
63786+ h->result =
63787+ reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
63788+ GN_CAN_USE_UPPER_LEVELS);
63789+ neighbor = NULL;
63790+ switch (h->result) {
63791+ case -E_DEADLOCK:
63792+ result = LOOKUP_REST;
63793+ break;
63794+ case 0:{
63795+ node_plugin *nplug;
63796+ coord_t crd;
63797+ lookup_bias bias;
63798+
63799+ neighbor = lh.node;
63800+ h->result = zload(neighbor);
63801+ if (h->result != 0) {
63802+ result = LOOKUP_DONE;
63803+ break;
63804+ }
63805+
63806+ nplug = neighbor->nplug;
63807+
63808+ coord_init_zero(&crd);
63809+ bias = h->bias;
63810+ h->bias = FIND_EXACT;
63811+ h->result =
63812+ nplug->lookup(neighbor, h->key, h->bias, &crd);
63813+ h->bias = bias;
63814+
63815+ if (h->result == NS_NOT_FOUND) {
63816+ case -E_NO_NEIGHBOR:
63817+ h->result = CBK_COORD_FOUND;
63818+ if (!(h->flags & CBK_IN_CACHE))
63819+ cbk_cache_add(node);
63820+ default: /* some other error */
63821+ result = LOOKUP_DONE;
63822+ } else if (h->result == NS_FOUND) {
63823+ read_lock_dk(znode_get_tree(neighbor));
63824+ h->rd_key = *znode_get_ld_key(node);
63825+ leftmost_key_in_node(neighbor, &h->ld_key);
63826+ read_unlock_dk(znode_get_tree(neighbor));
63827+ h->flags |= CBK_DKSET;
63828+
63829+ h->block = *znode_get_block(neighbor);
63830+ /* clear coord -> node so that cbk_level_lookup()
63831+ wouldn't overwrite parent hint in neighbor.
63832+
63833+ Parent hint was set up by
63834+ reiser4_get_left_neighbor()
63835+ */
63836+ /* FIXME: why do we have to spinlock here? */
63837+ write_lock_tree(znode_get_tree(neighbor));
63838+ h->coord->node = NULL;
63839+ write_unlock_tree(znode_get_tree(neighbor));
63840+ result = LOOKUP_CONT;
63841+ } else {
63842+ result = LOOKUP_DONE;
63843+ }
63844+ if (neighbor != NULL)
63845+ zrelse(neighbor);
63846+ }
63847+ }
63848+ done_lh(&lh);
63849+ return result;
63850+}
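+/* Worked example (editorial note, not part of the original patch) for the
+ * comment above. With duplicate key 100 the tree may look like:
+ *
+ *	root:         [ 0 | 100 ]
+ *	               /        \
+ *	leaf A: ..., 100, 100    leaf B: 100, ...
+ *
+ * A lookup for key 100 descends through the second pointer to leaf B,
+ * whose left delimiting key is 100. If the match in leaf B is the
+ * leftmost unit and CBK_UNIQUE is not set, search_to_left() peeks into
+ * leaf A, because items with the same key may end there as well.
+ */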
63851+
63852+/* debugging aid: return symbolic name of search bias */
63853+static const char *bias_name(lookup_bias bias /* bias to get name of */ )
63854+{
63855+ if (bias == FIND_EXACT)
63856+ return "exact";
63857+ else if (bias == FIND_MAX_NOT_MORE_THAN)
63858+ return "left-slant";
63859+/* else if( bias == RIGHT_SLANT_BIAS ) */
63860+/* return "right-bias"; */
63861+ else {
63862+ static char buf[30];
63863+
63864+ sprintf(buf, "unknown: %i", bias);
63865+ return buf;
63866+ }
63867+}
63868+
63869+#if REISER4_DEBUG
63870+/* debugging aid: print human readable information about @p */
63871+void print_coord_content(const char *prefix /* prefix to print */ ,
63872+ coord_t * p /* coord to print */ )
63873+{
63874+ reiser4_key key;
63875+
63876+ if (p == NULL) {
63877+ printk("%s: null\n", prefix);
63878+ return;
63879+ }
63880+ if ((p->node != NULL) && znode_is_loaded(p->node)
63881+ && coord_is_existing_item(p))
63882+ printk("%s: data: %p, length: %i\n", prefix,
63883+ item_body_by_coord(p), item_length_by_coord(p));
63884+ if (znode_is_loaded(p->node)) {
63885+ item_key_by_coord(p, &key);
63886+ reiser4_print_key(prefix, &key);
63887+ }
63888+}
63889+
63890+/* debugging aid: print human readable information about @block */
63891+void reiser4_print_address(const char *prefix /* prefix to print */ ,
63892+ const reiser4_block_nr * block /* block number to print */ )
63893+{
63894+ printk("%s: %s\n", prefix, sprint_address(block));
63895+}
63896+#endif
63897+
63898+/* return string containing human readable representation of @block (uses a static buffer; not reentrant) */
63899+char *sprint_address(const reiser4_block_nr *
63900+ block /* block number to print */ )
63901+{
63902+ static char address[30];
63903+
63904+ if (block == NULL)
63905+ sprintf(address, "null");
63906+ else if (reiser4_blocknr_is_fake(block))
63907+ sprintf(address, "%llx", (unsigned long long)(*block));
63908+ else
63909+ sprintf(address, "%llu", (unsigned long long)(*block));
63910+ return address;
63911+}
63912+
63913+/* release parent node during traversal */
63914+static void put_parent(cbk_handle * h /* search handle */ )
63915+{
63916+ assert("nikita-383", h != NULL);
63917+ if (h->parent_lh->node != NULL) {
63918+ longterm_unlock_znode(h->parent_lh);
63919+ }
63920+}
63921+
63922+/* helper function used by coord_by_key(): release reference to parent znode
63923+ stored in handle before processing its child. */
63924+static void hput(cbk_handle * h /* search handle */ )
63925+{
63926+ assert("nikita-385", h != NULL);
63927+ done_lh(h->parent_lh);
63928+ done_lh(h->active_lh);
63929+}
63930+
63931+/* Helper function used by cbk(): update delimiting keys of child node (stored
63932+ in h->active_lh->node) using key taken from parent on the parent level. */
63933+static int setup_delimiting_keys(cbk_handle * h /* search handle */ )
63934+{
63935+ znode *active;
63936+ reiser4_tree *tree;
63937+
63938+ assert("nikita-1088", h != NULL);
63939+
63940+ active = h->active_lh->node;
63941+
63942+ /* fast check without taking dk lock. This is safe, because
63943+ * JNODE_DKSET is never cleared once set. */
63944+ if (!ZF_ISSET(active, JNODE_DKSET)) {
63945+ tree = znode_get_tree(active);
63946+ write_lock_dk(tree);
63947+ if (!ZF_ISSET(active, JNODE_DKSET)) {
63948+ znode_set_ld_key(active, &h->ld_key);
63949+ znode_set_rd_key(active, &h->rd_key);
63950+ ZF_SET(active, JNODE_DKSET);
63951+ }
63952+ write_unlock_dk(tree);
63953+ return 1;
63954+ }
63955+ return 0;
63956+}
63957+
63958+/* true if @block makes sense for the @tree. Used to detect corrupted node
63959+ * pointers */
63960+static int
63961+block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
63962+ reiser4_tree * tree /* tree to check against */ )
63963+{
63964+ assert("nikita-757", block != NULL);
63965+ assert("nikita-758", tree != NULL);
63966+
63967+ /* check to see if it exceeds the size of the device. */
63968+ return reiser4_blocknr_is_sane_for(tree->super, block);
63969+}
63970+
63971+/* check consistency of fields */
63972+static int sanity_check(cbk_handle * h /* search handle */ )
63973+{
63974+ assert("nikita-384", h != NULL);
63975+
63976+ if (h->level < h->stop_level) {
63977+ h->error = "Buried under leaves";
63978+ h->result = RETERR(-EIO);
63979+ return LOOKUP_DONE;
63980+ } else if (!block_nr_is_correct(&h->block, h->tree)) {
63981+ h->error = "bad block number";
63982+ h->result = RETERR(-EIO);
63983+ return LOOKUP_DONE;
63984+ } else
63985+ return 0;
63986+}
63987+
63988+/* Make Linus happy.
63989+ Local variables:
63990+ c-indentation-style: "K&R"
63991+ mode-name: "LC"
63992+ c-basic-offset: 8
63993+ tab-width: 8
63994+ fill-column: 120
63995+ scroll-step: 1
63996+ End:
63997+*/
63998diff -urN linux-2.6.23.orig/fs/reiser4/status_flags.c linux-2.6.23/fs/reiser4/status_flags.c
63999--- linux-2.6.23.orig/fs/reiser4/status_flags.c 1970-01-01 03:00:00.000000000 +0300
64000+++ linux-2.6.23/fs/reiser4/status_flags.c 2007-12-04 21:05:55.810811035 +0300
64001@@ -0,0 +1,175 @@
64002+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64003+ * reiser4/README */
64004+
64005+/* Functions that deal with reiser4 status block, query status and update it, if needed */
64006+
64007+#include <linux/bio.h>
64008+#include <linux/highmem.h>
64009+#include <linux/fs.h>
64010+#include <linux/blkdev.h>
64011+#include "debug.h"
64012+#include "dformat.h"
64013+#include "status_flags.h"
64014+#include "super.h"
64015+
64016+/* This is our end I/O handler that marks the page uptodate if I/O was successful.
64017+   It also unconditionally unlocks the page, so we can see that I/O is done.
64018+   We do not free the bio, because we hope to reuse it. */
64019+static int reiser4_status_endio(struct bio *bio, unsigned int bytes_done,
64020+ int err)
64021+{
64022+ if (bio->bi_size)
64023+ return 1;
64024+
64025+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
64026+ SetPageUptodate(bio->bi_io_vec->bv_page);
64027+ } else {
64028+ ClearPageUptodate(bio->bi_io_vec->bv_page);
64029+ SetPageError(bio->bi_io_vec->bv_page);
64030+ }
64031+ unlock_page(bio->bi_io_vec->bv_page);
64032+ return 0;
64033+}
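+/* Editorial note (not part of the original patch): in 2.6-era kernels the
+ * bi_end_io callback could be invoked for partial completions; bi_size
+ * counts the bytes still outstanding, so returning 1 while bio->bi_size
+ * is non-zero (as above) defers all handling to the final completion
+ * call, where bi_size has dropped to zero.
+ */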
64034+
64035+/* Initialise status block handling. This is expected to be called from the
64036+   disk format code. The block parameter gives where the status block lives. */
64037+int reiser4_status_init(reiser4_block_nr block)
64038+{
64039+ struct super_block *sb = reiser4_get_current_sb();
64040+ struct reiser4_status *statuspage;
64041+ struct bio *bio;
64042+ struct page *page;
64043+
64044+ get_super_private(sb)->status_page = NULL;
64045+ get_super_private(sb)->status_bio = NULL;
64046+
64047+ page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
64048+ if (!page)
64049+ return -ENOMEM;
64050+
64051+ bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
64052+ if (bio != NULL) {
64053+ bio->bi_sector = block * (sb->s_blocksize >> 9);
64054+ bio->bi_bdev = sb->s_bdev;
64055+ bio->bi_io_vec[0].bv_page = page;
64056+ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
64057+ bio->bi_io_vec[0].bv_offset = 0;
64058+ bio->bi_vcnt = 1;
64059+ bio->bi_size = sb->s_blocksize;
64060+ bio->bi_end_io = reiser4_status_endio;
64061+ } else {
64062+ __free_pages(page, 0);
64063+ return -ENOMEM;
64064+ }
64065+ lock_page(page);
64066+ submit_bio(READ, bio);
64067+ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
64068+ wait_on_page_locked(page);
64069+	if (!PageUptodate(page)) {
64070+		warning("green-2007",
64071+			"I/O error while trying to read status page\n");
+		__free_pages(page, 0);	/* do not leak the page and the bio */
+		bio_put(bio);		/* on this error path */
64072+		return -EIO;
64073+	}
64074+
64075+ statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
64076+ if (memcmp
64077+ (statuspage->magic, REISER4_STATUS_MAGIC,
64078+ sizeof(REISER4_STATUS_MAGIC))) {
64079+ /* Magic does not match. */
64080+ kunmap_atomic((char *)statuspage, KM_USER0);
64081+ warning("green-2008", "Wrong magic in status block\n");
64082+ __free_pages(page, 0);
64083+ bio_put(bio);
64084+ return -EINVAL;
64085+ }
64086+ kunmap_atomic((char *)statuspage, KM_USER0);
64087+
64088+ get_super_private(sb)->status_page = page;
64089+ get_super_private(sb)->status_bio = bio;
64090+ return 0;
64091+}
64092+
64093+/* Query the status of fs. Returns if the FS can be safely mounted.
64094+ Also if "status" and "extended" parameters are given, it will fill
64095+ actual parts of status from disk there. */
64096+int reiser4_status_query(u64 * status, u64 * extended)
64097+{
64098+ struct super_block *sb = reiser4_get_current_sb();
64099+ struct reiser4_status *statuspage;
64100+ int retval;
64101+
64102+ if (!get_super_private(sb)->status_page) { // No status page?
64103+ return REISER4_STATUS_MOUNT_UNKNOWN;
64104+ }
64105+ statuspage = (struct reiser4_status *)
64106+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
64107+ switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { // FIXME: this cast is a hack for 32 bit arches to work.
64108+ case REISER4_STATUS_OK:
64109+ retval = REISER4_STATUS_MOUNT_OK;
64110+ break;
64111+ case REISER4_STATUS_CORRUPTED:
64112+ retval = REISER4_STATUS_MOUNT_WARN;
64113+ break;
64114+ case REISER4_STATUS_DAMAGED:
64115+ case REISER4_STATUS_DESTROYED:
64116+ case REISER4_STATUS_IOERROR:
64117+ retval = REISER4_STATUS_MOUNT_RO;
64118+ break;
64119+ default:
64120+ retval = REISER4_STATUS_MOUNT_UNKNOWN;
64121+ break;
64122+ }
64123+
64124+ if (status)
64125+ *status = le64_to_cpu(get_unaligned(&statuspage->status));
64126+ if (extended)
64127+ *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
64128+
64129+ kunmap_atomic((char *)statuspage, KM_USER0);
64130+ return retval;
64131+}
64132+
64133+/* This function should be called when something bad happens (e.g. from reiser4_panic).
64134+ It fills the status structure and tries to push it to disk. */
64135+int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
64136+{
64137+ struct super_block *sb = reiser4_get_current_sb();
64138+ struct reiser4_status *statuspage;
64139+ struct bio *bio = get_super_private(sb)->status_bio;
64140+
64141+ if (!get_super_private(sb)->status_page) { // No status page?
64142+ return -1;
64143+ }
64144+ statuspage = (struct reiser4_status *)
64145+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
64146+
64147+ put_unaligned(cpu_to_le64(status), &statuspage->status);
64148+ put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
64149+ strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
64150+
64151+ kunmap_atomic((char *)statuspage, KM_USER0);
64152+ bio->bi_bdev = sb->s_bdev;
64153+ bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
64154+ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
64155+ bio->bi_io_vec[0].bv_offset = 0;
64156+ bio->bi_vcnt = 1;
64157+ bio->bi_size = sb->s_blocksize;
64158+ bio->bi_end_io = reiser4_status_endio;
64159+ lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page.
64160+ /* We can block now, but we have no other choice anyway */
64161+ submit_bio(WRITE, bio);
64162+ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
64163+ return 0; // We do not wait for io to finish.
64164+}
64165+
64166+/* Frees the status page and the bio structure. Should be called by the disk format at umount time */
64167+int reiser4_status_finish(void)
64168+{
64169+ struct super_block *sb = reiser4_get_current_sb();
64170+
64171+ __free_pages(get_super_private(sb)->status_page, 0);
64172+ get_super_private(sb)->status_page = NULL;
64173+ bio_put(get_super_private(sb)->status_bio);
64174+ get_super_private(sb)->status_bio = NULL;
64175+ return 0;
64176+}
64177diff -urN linux-2.6.23.orig/fs/reiser4/status_flags.h linux-2.6.23/fs/reiser4/status_flags.h
64178--- linux-2.6.23.orig/fs/reiser4/status_flags.h 1970-01-01 03:00:00.000000000 +0300
64179+++ linux-2.6.23/fs/reiser4/status_flags.h 2007-12-04 16:49:30.000000000 +0300
64180@@ -0,0 +1,43 @@
64181+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64182+ * reiser4/README */
64183+
64184+/* Here we declare structures and flags that store reiser4 status on disk.
64185+   The status helps us to find out whether the filesystem is valid, or whether
64186+   it contains some critical, or not so critical, errors */
64187+
64188+#if !defined( __REISER4_STATUS_FLAGS_H__ )
64189+#define __REISER4_STATUS_FLAGS_H__
64190+
64191+#include "dformat.h"
64192+/* These are major status flags */
64193+#define REISER4_STATUS_OK 0
64194+#define REISER4_STATUS_CORRUPTED 0x1
64195+#define REISER4_STATUS_DAMAGED 0x2
64196+#define REISER4_STATUS_DESTROYED 0x4
64197+#define REISER4_STATUS_IOERROR 0x8
64198+
64199+/* Return values for reiser4_status_query() */
64200+#define REISER4_STATUS_MOUNT_OK 0
64201+#define REISER4_STATUS_MOUNT_WARN 1
64202+#define REISER4_STATUS_MOUNT_RO 2
64203+#define REISER4_STATUS_MOUNT_UNKNOWN -1
64204+
64205+#define REISER4_TEXTERROR_LEN 256
64206+
64207+#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
64208+/* We probably need to keep its size under the sector size, which is 512 bytes */
64209+struct reiser4_status {
64210+ char magic[16];
64211+ d64 status; /* Current FS state */
64212+	d64 extended_status;	/* Any additional info that might make sense in addition to "status", e.g.
64213+				   the last sector where an I/O error happened if status is "io error encountered" */
64214+	d64 stacktrace[10];	/* Last ten function calls made (addresses) */
64215+ char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */
64216+};
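+/* Illustrative userspace sketch (editorial note, not part of the original
+ * patch): validating the on-disk status block, assuming the caller knows
+ * its byte offset on the device (the location is disk-format specific; it
+ * is what reiser4_status_init() receives as a block number):
+ *
+ *	#include <fcntl.h>
+ *	#include <string.h>
+ *	#include <unistd.h>
+ *
+ *	// returns 0 if the magic matches, 1 if not, -1 on I/O error
+ *	int check_status_magic(const char *dev, off_t offset)
+ *	{
+ *		char buf[512];          // the status block fits in a sector
+ *		int fd = open(dev, O_RDONLY);
+ *		if (fd < 0)
+ *			return -1;
+ *		if (pread(fd, buf, sizeof(buf), offset) != sizeof(buf)) {
+ *			close(fd);
+ *			return -1;
+ *		}
+ *		close(fd);
+ *		// the magic occupies the first 16 bytes of the block
+ *		return memcmp(buf, "ReiSeR4StATusBl", 16) != 0;
+ *	}
+ */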
64217+
64218+int reiser4_status_init(reiser4_block_nr block);
64219+int reiser4_status_query(u64 * status, u64 * extended);
64220+int reiser4_status_write(u64 status, u64 extended_status, char *message);
64221+int reiser4_status_finish(void);
64222+
64223+#endif
64224diff -urN linux-2.6.23.orig/fs/reiser4/super.c linux-2.6.23/fs/reiser4/super.c
64225--- linux-2.6.23.orig/fs/reiser4/super.c 1970-01-01 03:00:00.000000000 +0300
64226+++ linux-2.6.23/fs/reiser4/super.c 2007-12-04 16:49:30.000000000 +0300
64227@@ -0,0 +1,316 @@
64228+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
64229+ * reiser4/README */
64230+
64231+/* Super-block manipulations. */
64232+
64233+#include "debug.h"
64234+#include "dformat.h"
64235+#include "key.h"
64236+#include "plugin/security/perm.h"
64237+#include "plugin/space/space_allocator.h"
64238+#include "plugin/plugin.h"
64239+#include "tree.h"
64240+#include "vfs_ops.h"
64241+#include "super.h"
64242+#include "reiser4.h"
64243+
64244+#include <linux/types.h> /* for __u?? */
64245+#include <linux/fs.h> /* for struct super_block */
64246+
64247+static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
64248+static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
64249+static __u64 reserved_for_root(const struct super_block *super);
64250+
64251+/* Return reiser4-specific part of super block */
64252+reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super /* super block
64253+ * queried */ )
64254+{
64255+ return (reiser4_super_info_data *) super->s_fs_info;
64256+}
64257+
64258+/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */
64259+long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
64260+{
64261+ assert("nikita-448", super != NULL);
64262+ assert("nikita-449", is_reiser4_super(super));
64263+ return (long)REISER4_SUPER_MAGIC;
64264+}
64265+
64266+/* functions to read/modify fields of reiser4_super_info_data */
64267+
64268+/* get number of blocks in file system */
64269+__u64 reiser4_block_count(const struct super_block *super /* super block
64270+ queried */ )
64271+{
64272+ assert("vs-494", super != NULL);
64273+ assert("vs-495", is_reiser4_super(super));
64274+ return get_super_private(super)->block_count;
64275+}
64276+
64277+#if REISER4_DEBUG
64278+/*
64279+ * number of blocks in the current file system
64280+ */
64281+__u64 reiser4_current_block_count(void)
64282+{
64283+ return get_current_super_private()->block_count;
64284+}
64285+#endif /* REISER4_DEBUG */
64286+
64287+/* set number of block in filesystem */
64288+void reiser4_set_block_count(const struct super_block *super, __u64 nr)
64289+{
64290+ assert("vs-501", super != NULL);
64291+ assert("vs-502", is_reiser4_super(super));
64292+ get_super_private(super)->block_count = nr;
64293+ /*
64294+	 * The proper calculation of the reserved space counter (5% of the
64295+	 * device block count) would need a 64-bit division, which is missing
64296+	 * in Linux on the i386 platform. Because we do not need a precise
64297+	 * calculation here, we can replace the div64 operation by this
64298+	 * combination of multiplication and shift: 51 / (2^10) == .0498 .
64299+	 * FIXME: this is a bug. It comes up only for very small filesystems,
64300+	 * which are probably never used. Nevertheless, it is a bug. The
64301+	 * number of reserved blocks must be no less than the maximal number
64302+	 * of blocks which get grabbed with BA_RESERVED.
64303+ */
64304+ get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
64305+}
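+/* Worked example (editorial note, not part of the original patch) for the
+ * 51 / 2^10 approximation above: 51 / 1024 = 0.049805, i.e. about 4.98%.
+ * For nr = 1,000,000 blocks:
+ *
+ *	blocks_reserved = (1000000 * 51) >> 10
+ *	                = 51000000 / 1024
+ *	                = 49804
+ *
+ * versus the exact 5% figure of 50,000 blocks; close enough, and no
+ * 64-bit division is needed.
+ */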
64306+
64307+/* amount of blocks used (allocated for data) in file system */
64308+__u64 reiser4_data_blocks(const struct super_block *super /* super block
64309+ queried */ )
64310+{
64311+ assert("nikita-452", super != NULL);
64312+ assert("nikita-453", is_reiser4_super(super));
64313+ return get_super_private(super)->blocks_used;
64314+}
64315+
64316+/* set number of block used in filesystem */
64317+void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
64318+{
64319+ assert("vs-503", super != NULL);
64320+ assert("vs-504", is_reiser4_super(super));
64321+ get_super_private(super)->blocks_used = nr;
64322+}
64323+
64324+/* amount of free blocks in file system */
64325+__u64 reiser4_free_blocks(const struct super_block *super /* super block
64326+ queried */ )
64327+{
64328+ assert("nikita-454", super != NULL);
64329+ assert("nikita-455", is_reiser4_super(super));
64330+ return get_super_private(super)->blocks_free;
64331+}
64332+
64333+/* set number of blocks free in filesystem */
64334+void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
64335+{
64336+ assert("vs-505", super != NULL);
64337+ assert("vs-506", is_reiser4_super(super));
64338+ get_super_private(super)->blocks_free = nr;
64339+}
64340+
64341+/* get mkfs unique identifier */
64342+__u32 reiser4_mkfs_id(const struct super_block *super /* super block
64343+ queried */ )
64344+{
64345+ assert("vpf-221", super != NULL);
64346+ assert("vpf-222", is_reiser4_super(super));
64347+ return get_super_private(super)->mkfs_id;
64348+}
64349+
64350+/* amount of free blocks in file system */
64351+__u64 reiser4_free_committed_blocks(const struct super_block *super)
64352+{
64353+ assert("vs-497", super != NULL);
64354+ assert("vs-498", is_reiser4_super(super));
64355+ return get_super_private(super)->blocks_free_committed;
64356+}
64357+
64358+/* amount of blocks in the file system reserved for @uid and @gid */
64359+long reiser4_reserved_blocks(const struct super_block *super /* super block
64360+ queried */ ,
64361+ uid_t uid /* user id */ ,
64362+ gid_t gid /* group id */ )
64363+{
64364+ long reserved;
64365+
64366+ assert("nikita-456", super != NULL);
64367+ assert("nikita-457", is_reiser4_super(super));
64368+
64369+ reserved = 0;
64370+ if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
64371+ reserved += reserved_for_gid(super, gid);
64372+ if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
64373+ reserved += reserved_for_uid(super, uid);
64374+ if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
64375+ reserved += reserved_for_root(super);
64376+ return reserved;
64377+}
64378+
64379+/* get/set value of/to grabbed blocks counter */
64380+__u64 reiser4_grabbed_blocks(const struct super_block * super)
64381+{
64382+ assert("zam-512", super != NULL);
64383+ assert("zam-513", is_reiser4_super(super));
64384+
64385+ return get_super_private(super)->blocks_grabbed;
64386+}
64387+
64388+__u64 reiser4_flush_reserved(const struct super_block * super)
64389+{
64390+ assert("vpf-285", super != NULL);
64391+ assert("vpf-286", is_reiser4_super(super));
64392+
64393+ return get_super_private(super)->blocks_flush_reserved;
64394+}
64395+
64396+/* get/set value of/to counter of fake allocated formatted blocks */
64397+__u64 reiser4_fake_allocated(const struct super_block * super)
64398+{
64399+ assert("zam-516", super != NULL);
64400+ assert("zam-517", is_reiser4_super(super));
64401+
64402+ return get_super_private(super)->blocks_fake_allocated;
64403+}
64404+
64405+/* get/set value of/to counter of fake allocated unformatted blocks */
64406+__u64 reiser4_fake_allocated_unformatted(const struct super_block * super)
64407+{
64408+ assert("zam-516", super != NULL);
64409+ assert("zam-517", is_reiser4_super(super));
64410+
64411+ return get_super_private(super)->blocks_fake_allocated_unformatted;
64412+}
64413+
64414+/* get/set value of/to counter of clustered blocks */
64415+__u64 reiser4_clustered_blocks(const struct super_block * super)
64416+{
64417+ assert("edward-601", super != NULL);
64418+ assert("edward-602", is_reiser4_super(super));
64419+
64420+ return get_super_private(super)->blocks_clustered;
64421+}
64422+
64423+/* space allocator used by this file system */
64424+reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block
64425+ *super)
64426+{
64427+ assert("nikita-1965", super != NULL);
64428+ assert("nikita-1966", is_reiser4_super(super));
64429+ return &get_super_private(super)->space_allocator;
64430+}
64431+
64432+/* return fake inode used to bind formatted nodes in the page cache */
64433+struct inode *reiser4_get_super_fake(const struct super_block *super /* super block
64434+ queried */ )
64435+{
64436+ assert("nikita-1757", super != NULL);
64437+ return get_super_private(super)->fake;
64438+}
64439+
64440+/* return fake inode used to bind copied on capture nodes in the page cache */
64441+struct inode *reiser4_get_cc_fake(const struct super_block *super /* super block
64442+ queried */ )
64443+{
64444+ assert("nikita-1757", super != NULL);
64445+ return get_super_private(super)->cc;
64446+}
64447+
64448+/* return fake inode used to bind bitmaps and journal heads */
64449+struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
64450+{
64451+ assert("nikita-17571", super != NULL);
64452+ return get_super_private(super)->bitmap;
64453+}
64454+
64455+/* tree used by this file system */
64456+reiser4_tree *reiser4_get_tree(const struct super_block * super /* super block
64457+ * queried */ )
64458+{
64459+ assert("nikita-460", super != NULL);
64460+ assert("nikita-461", is_reiser4_super(super));
64461+ return &get_super_private(super)->tree;
64462+}
64463+
64464+/* Check that @super is (looks like) reiser4 super block. This is mainly for
64465+ use in assertions. */
64466+int is_reiser4_super(const struct super_block *super /* super block
64467+ * queried */ )
64468+{
64469+ return
64470+ super != NULL &&
64471+ get_super_private(super) != NULL &&
64472+ super->s_op == &(get_super_private(super)->ops.super);
64473+}
64474+
64475+int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
64476+{
64477+ return test_bit((int)f, &get_super_private(super)->fs_flags);
64478+}
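+/* Usage sketch (editorial note, not part of the original patch): callers
+ * test mount-time flags through this predicate, e.g. the log writer can
+ * honor the REISER4_NO_WRITE_BARRIER flag declared in super.h:
+ *
+ *	if (reiser4_is_set(super, REISER4_NO_WRITE_BARRIER))
+ *		; // submit log writes without a barrier
+ */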
64479+
64480+/* amount of blocks reserved for given group in file system */
64481+static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG /* super
64482+ * block
64483+ * queried */ ,
64484+ gid_t gid UNUSED_ARG /* group id */ )
64485+{
64486+ return 0;
64487+}
64488+
64489+/* amount of blocks reserved for given user in file system */
64490+static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG /* super
64491+ block
64492+ queried */ ,
64493+ uid_t uid UNUSED_ARG /* user id */ )
64494+{
64495+ return 0;
64496+}
64497+
64498+/* amount of blocks reserved for super user in file system */
64499+static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG /* super
64500+ block
64501+ queried */ )
64502+{
64503+ return 0;
64504+}
64505+
64506+/*
64507+ * true if block number @blk makes sense for the file system at @super.
64508+ */
64509+int
64510+reiser4_blocknr_is_sane_for(const struct super_block *super,
64511+ const reiser4_block_nr * blk)
64512+{
64513+ reiser4_super_info_data *sbinfo;
64514+
64515+ assert("nikita-2957", super != NULL);
64516+ assert("nikita-2958", blk != NULL);
64517+
64518+ if (reiser4_blocknr_is_fake(blk))
64519+ return 1;
64520+
64521+ sbinfo = get_super_private(super);
64522+ return *blk < sbinfo->block_count;
64523+}
64524+
64525+#if REISER4_DEBUG
64526+/*
64527+ * true, if block number @blk makes sense for the current file system
64528+ */
64529+int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
64530+{
64531+ return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
64532+}
64533+#endif /* REISER4_DEBUG */
64534+
64535+/* Make Linus happy.
64536+ Local variables:
64537+ c-indentation-style: "K&R"
64538+ mode-name: "LC"
64539+ c-basic-offset: 8
64540+ tab-width: 8
64541+ fill-column: 120
64542+ End:
64543+*/
64544diff -urN linux-2.6.23.orig/fs/reiser4/super.h linux-2.6.23/fs/reiser4/super.h
64545--- linux-2.6.23.orig/fs/reiser4/super.h 1970-01-01 03:00:00.000000000 +0300
64546+++ linux-2.6.23/fs/reiser4/super.h 2007-12-04 16:49:30.000000000 +0300
64547@@ -0,0 +1,466 @@
64548+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
64549+ * reiser4/README */
64550+
64551+/* Super-block functions. See super.c for details. */
64552+
64553+#if !defined( __REISER4_SUPER_H__ )
64554+#define __REISER4_SUPER_H__
64555+
64556+#include <linux/exportfs.h>
64557+
64558+#include "tree.h"
64559+#include "entd.h"
64560+#include "wander.h"
64561+#include "fsdata.h"
64562+#include "plugin/object.h"
64563+#include "plugin/space/space_allocator.h"
64564+
64565+/*
64566+ * Flush algorithms parameters.
64567+ */
64568+struct flush_params {
64569+ unsigned relocate_threshold;
64570+ unsigned relocate_distance;
64571+ unsigned written_threshold;
64572+ unsigned scan_maxnodes;
64573+};
64574+
64575+typedef enum {
64576+ /*
64577+	 * True if this file system doesn't support hard-links (multiple names)
64578+	 * for directories: this is the default UNIX behavior.
64579+	 *
64580+	 * If hard-links on directories are not allowed, the file system is an
64581+	 * Acyclic Directed Graph (modulo dot, and dotdot, of course).
64582+ *
64583+ * This is used by reiser4_link().
64584+ */
64585+ REISER4_ADG = 0,
64586+ /*
64587+ * set if all nodes in internal tree have the same node layout plugin.
64588+	 * If so, znode_guess_plugin() will return tree->node_plugin instead
64589+	 * of guessing the plugin by the plugin id stored in the node.
64590+ */
64591+ REISER4_ONE_NODE_PLUGIN = 1,
64592+ /* if set, bsd gid assignment is supported. */
64593+ REISER4_BSD_GID = 2,
64594+ /* [mac]_time are 32 bit in inode */
64595+ REISER4_32_BIT_TIMES = 3,
64596+	/* if set, don't load all bitmap blocks at mount time */
64597+ REISER4_DONT_LOAD_BITMAP = 5,
64598+ /* enforce atomicity during write(2) */
64599+ REISER4_ATOMIC_WRITE = 6,
64600+ /* don't use write barriers in the log writer code. */
64601+ REISER4_NO_WRITE_BARRIER = 7
64602+} reiser4_fs_flag;
64603+
64604+/*
64605+ * VFS related operation vectors.
64606+ */
64607+struct object_ops {
64608+ struct super_operations super;
64609+ struct dentry_operations dentry;
64610+ struct export_operations export;
64611+};
64612+
64613+/* reiser4-specific part of super block
64614+
64615+ Locking
64616+
64617+ Fields immutable after mount:
64618+
64619+ ->oid*
64620+ ->space*
64621+ ->default_[ug]id
64622+ ->mkfs_id
64623+ ->trace_flags
64624+ ->debug_flags
64625+ ->fs_flags
64626+ ->df_plug
64627+ ->optimal_io_size
64628+ ->plug
64629+ ->flush
64630+ ->u (bad name)
64631+ ->txnmgr
64632+ ->ra_params
64633+ ->fsuid
64634+ ->journal_header
64635+ ->journal_footer
64636+
64637+ Fields protected by ->lnode_guard
64638+
64639+ ->lnode_htable
64640+
64641+ Fields protected by per-super block spin lock
64642+
64643+ ->block_count
64644+ ->blocks_used
64645+ ->blocks_free
64646+ ->blocks_free_committed
64647+ ->blocks_grabbed
64648+ ->blocks_fake_allocated_unformatted
64649+ ->blocks_fake_allocated
64650+ ->blocks_flush_reserved
64651+ ->eflushed
64652+ ->blocknr_hint_default
64653+
64654+   After journal replay during mount,
64655+
64656+ ->last_committed_tx
64657+
64658+ is protected by ->tmgr.commit_mutex
64659+
64660+ Invariants involving this data-type:
64661+
64662+ [sb-block-counts]
64663+ [sb-grabbed]
64664+ [sb-fake-allocated]
64665+*/
64666+struct reiser4_super_info_data {
64667+ /*
64668+ * guard spinlock which protects reiser4 super block fields (currently
64669+ * blocks_free, blocks_free_committed)
64670+ */
64671+ spinlock_t guard;
64672+
64673+ /* next oid that will be returned by oid_allocate() */
64674+ oid_t next_to_use;
64675+ /* total number of used oids */
64676+ oid_t oids_in_use;
64677+
64678+ /* space manager plugin */
64679+ reiser4_space_allocator space_allocator;
64680+
64681+ /* reiser4 internal tree */
64682+ reiser4_tree tree;
64683+
64684+ /*
64685+ * default user id used for light-weight files without their own
64686+ * stat-data.
64687+ */
64688+ uid_t default_uid;
64689+
64690+ /*
64691+ * default group id used for light-weight files without their own
64692+ * stat-data.
64693+ */
64694+ gid_t default_gid;
64695+
64696+ /* mkfs identifier generated at mkfs time. */
64697+ __u32 mkfs_id;
64698+ /* amount of blocks in a file system */
64699+ __u64 block_count;
64700+
64701+ /* inviolable reserve */
64702+ __u64 blocks_reserved;
64703+
64704+ /* amount of blocks used by file system data and meta-data. */
64705+ __u64 blocks_used;
64706+
64707+ /*
64708+	 * amount of free blocks. This is the "working" free blocks counter. It
64709+	 * is like the "working" bitmap; see block_alloc.c for a description.
64710+ */
64711+ __u64 blocks_free;
64712+
64713+ /*
64714+	 * free block count for the fs committed state. This is the "commit"
64715+	 * version of the free block counter.
64716+ */
64717+ __u64 blocks_free_committed;
64718+
64719+ /*
64720+ * number of blocks reserved for further allocation, for all
64721+ * threads.
64722+ */
64723+ __u64 blocks_grabbed;
64724+
64725+ /* number of fake allocated unformatted blocks in tree. */
64726+ __u64 blocks_fake_allocated_unformatted;
64727+
64728+ /* number of fake allocated formatted blocks in tree. */
64729+ __u64 blocks_fake_allocated;
64730+
64731+ /* number of blocks reserved for flush operations. */
64732+ __u64 blocks_flush_reserved;
64733+
64734+ /* number of blocks reserved for cluster operations. */
64735+ __u64 blocks_clustered;
64736+
64737+ /* unique file-system identifier */
64738+ __u32 fsuid;
64739+
64740+	/* On-disk format version. If it does not equal the disk_format
64741+	   plugin version, some format updates (e.g. enlarging the plugin
64742+	   set, etc.) may take place on mount. */
64743+ int version;
64744+
64745+ /* file-system wide flags. See reiser4_fs_flag enum */
64746+ unsigned long fs_flags;
64747+
64748+ /* transaction manager */
64749+ txn_mgr tmgr;
64750+
64751+ /* ent thread */
64752+ entd_context entd;
64753+
64754+ /* fake inode used to bind formatted nodes */
64755+ struct inode *fake;
64756+ /* inode used to bind bitmaps (and journal heads) */
64757+ struct inode *bitmap;
64758+ /* inode used to bind copied on capture nodes */
64759+ struct inode *cc;
64760+
64761+ /* disk layout plugin */
64762+ disk_format_plugin *df_plug;
64763+
64764+ /* disk layout specific part of reiser4 super info data */
64765+ union {
64766+ format40_super_info format40;
64767+ } u;
64768+
64769+ /* value we return in st_blksize on stat(2) */
64770+ unsigned long optimal_io_size;
64771+
64772+ /* parameters for the flush algorithm */
64773+ struct flush_params flush;
64774+
64775+ /* pointers to jnodes for journal header and footer */
64776+ jnode *journal_header;
64777+ jnode *journal_footer;
64778+
64779+ journal_location jloc;
64780+
64781+ /* head block number of last committed transaction */
64782+ __u64 last_committed_tx;
64783+
64784+ /*
64785+ * we remember last written location for using as a hint for new block
64786+ * allocation
64787+ */
64788+ __u64 blocknr_hint_default;
64789+
64790+	/* committed number of files (oid allocator state variable) */
64791+ __u64 nr_files_committed;
64792+
64793+ struct formatted_ra_params ra_params;
64794+
64795+ /*
64796+	 * A mutex for serializing the cut tree operation when out of free
64797+	 * space: only one cut_tree thread is allowed to grab space from the
64798+	 * reserved area (which is 5% of the disk space)
64799+ */
64800+ struct mutex delete_mutex;
64801+ /* task owning ->delete_mutex */
64802+ struct task_struct *delete_mutex_owner;
64803+
64804+	/* Diskmap's block number */
64805+ __u64 diskmap_block;
64806+
64807+ /* What to do in case of error */
64808+ int onerror;
64809+
64810+ /* operations for objects on this file system */
64811+ struct object_ops ops;
64812+
64813+ /*
64814+ * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
64815+ * more details
64816+ */
64817+ struct d_cursor_info d_info;
64818+
64819+#ifdef CONFIG_REISER4_BADBLOCKS
64820+ /* Alternative master superblock offset (in bytes) */
64821+ unsigned long altsuper;
64822+#endif
64823+ struct repacker *repacker;
64824+ struct page *status_page;
64825+ struct bio *status_bio;
64826+
64827+#if REISER4_DEBUG
64828+ /*
64829+ * minimum used blocks value (includes super blocks, bitmap blocks and
64830+ * other fs reserved areas), depends on fs format and fs size.
64831+ */
64832+ __u64 min_blocks_used;
64833+
64834+ /*
64835+ * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
64836+ * are kept on a list anchored at sbinfo->all_jnodes. This list is
64837+ * protected by sbinfo->all_guard spin lock. This lock should be taken
64838+ * with _irq modifier, because it is also modified from interrupt
64839+ * contexts (by RCU).
64840+ */
64841+ spinlock_t all_guard;
64842+ /* list of all jnodes */
64843+ struct list_head all_jnodes;
64844+#endif
64845+ struct dentry *debugfs_root;
64846+};
64847+
64848+extern reiser4_super_info_data *get_super_private_nocheck(const struct
64849+ super_block *super);
64850+
64851+/* Return reiser4-specific part of super block */
64852+static inline reiser4_super_info_data *get_super_private(const struct
64853+ super_block *super)
64854+{
64855+ assert("nikita-447", super != NULL);
64856+
64857+ return (reiser4_super_info_data *) super->s_fs_info;
64858+}
64859+
64860+/* get ent context for the @super */
64861+static inline entd_context *get_entd_context(struct super_block *super)
64862+{
64863+ return &get_super_private(super)->entd;
64864+}
64865+
64866+/* "Current" super-block: main super block used during current system
64867+ call. Reference to this super block is stored in reiser4_context. */
64868+static inline struct super_block *reiser4_get_current_sb(void)
64869+{
64870+ return get_current_context()->super;
64871+}
64872+
64873+/* Reiser4-specific part of "current" super-block: main super block used
64874+ during current system call. Reference to this super block is stored in
64875+ reiser4_context. */
64876+static inline reiser4_super_info_data *get_current_super_private(void)
64877+{
64878+ return get_super_private(reiser4_get_current_sb());
64879+}
64880+
64881+static inline struct formatted_ra_params *get_current_super_ra_params(void)
64882+{
64883+ return &(get_current_super_private()->ra_params);
64884+}
64885+
64886+/*
64887+ * true, if file system on @super is read-only
64888+ */
64889+static inline int rofs_super(struct super_block *super)
64890+{
64891+ return super->s_flags & MS_RDONLY;
64892+}
64893+
64894+/*
64895+ * true, if @tree represents read-only file system
64896+ */
64897+static inline int rofs_tree(reiser4_tree * tree)
64898+{
64899+ return rofs_super(tree->super);
64900+}
64901+
64902+/*
64903+ * true, if file system where @inode lives on, is read-only
64904+ */
64905+static inline int rofs_inode(struct inode *inode)
64906+{
64907+ return rofs_super(inode->i_sb);
64908+}
64909+
64910+/*
64911+ * true, if file system where @node lives on, is read-only
64912+ */
64913+static inline int rofs_jnode(jnode * node)
64914+{
64915+ return rofs_tree(jnode_get_tree(node));
64916+}
64917+
64918+extern __u64 reiser4_current_block_count(void);
64919+
64920+extern void build_object_ops(struct super_block *super, struct object_ops * ops);
64921+
64922+#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
64923+
64924+static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
64925+{
64926+ spin_lock(&(sbinfo->guard));
64927+}
64928+
64929+static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
64930+{
64931+ assert_spin_locked(&(sbinfo->guard));
64932+ spin_unlock(&(sbinfo->guard));
64933+}
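A minimal usage sketch of the two wrappers above (the helper name is hypothetical; blocks_free is a counter field used elsewhere in this patch):

static inline void example_bump_free_blocks(reiser4_super_info_data *sbinfo,
					    __u64 nr)
{
	/* bracket a short counter update with the per-superblock ->guard lock */
	spin_lock_reiser4_super(sbinfo);
	sbinfo->blocks_free += nr;
	spin_unlock_reiser4_super(sbinfo);
}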
64934+
64935+extern __u64 reiser4_flush_reserved(const struct super_block *);
64936+extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
64937+extern long reiser4_statfs_type(const struct super_block *super);
64938+extern __u64 reiser4_block_count(const struct super_block *super);
64939+extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
64940+extern __u64 reiser4_data_blocks(const struct super_block *super);
64941+extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
64942+extern __u64 reiser4_free_blocks(const struct super_block *super);
64943+extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
64944+extern __u32 reiser4_mkfs_id(const struct super_block *super);
64945+
64946+extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
64947+
64948+extern __u64 reiser4_grabbed_blocks(const struct super_block *);
64949+extern __u64 reiser4_fake_allocated(const struct super_block *);
64950+extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
64951+extern __u64 reiser4_clustered_blocks(const struct super_block *);
64952+
64953+extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
64954+ gid_t gid);
64955+
64956+extern reiser4_space_allocator *
64957+reiser4_get_space_allocator(const struct super_block *super);
64958+extern reiser4_oid_allocator *
64959+reiser4_get_oid_allocator(const struct super_block *super);
64960+extern struct inode *reiser4_get_super_fake(const struct super_block *super);
64961+extern struct inode *reiser4_get_cc_fake(const struct super_block *super);
64962+extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super);
64963+extern reiser4_tree *reiser4_get_tree(const struct super_block *super);
64964+extern int is_reiser4_super(const struct super_block *super);
64965+
64966+extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
64967+extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
64968+ const reiser4_block_nr * blk);
64969+extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
64970+extern int reiser4_done_super(struct super_block *s);
64971+
64972+/* step of fill super */
64973+extern int reiser4_init_fs_info(struct super_block *);
64974+extern void reiser4_done_fs_info(struct super_block *);
64975+extern int reiser4_init_super_data(struct super_block *, char *opt_string);
64976+extern int reiser4_init_read_super(struct super_block *, int silent);
64977+extern int reiser4_init_root_inode(struct super_block *);
64978+extern reiser4_plugin *get_default_plugin(pset_member memb);
64979+
64980+/* Maximal possible object id. */
64981+#define ABSOLUTE_MAX_OID ((oid_t)~0)
64982+
64983+#define OIDS_RESERVED ( 1 << 16 )
64984+int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
64985+oid_t oid_allocate(struct super_block *);
64986+int oid_release(struct super_block *, oid_t);
64987+oid_t oid_next(const struct super_block *);
64988+void oid_count_allocated(void);
64989+void oid_count_released(void);
64990+long oids_used(const struct super_block *);
64991+
64992+#if REISER4_DEBUG
64993+void print_fs_info(const char *prefix, const struct super_block *);
64994+#endif
64995+
64996+extern void destroy_reiser4_cache(struct kmem_cache **);
64997+
64998+extern struct super_operations reiser4_super_operations;
64999+extern struct export_operations reiser4_export_operations;
65000+extern struct dentry_operations reiser4_dentry_operations;
65001+
65002+/* __REISER4_SUPER_H__ */
65003+#endif
65004+
65005+/*
65006+ * Local variables:
65007+ * c-indentation-style: "K&R"
65008+ * mode-name: "LC"
65009+ * c-basic-offset: 8
65010+ * tab-width: 8
65011+ * fill-column: 120
65012+ * End:
65013+ */
65014diff -urN linux-2.6.23.orig/fs/reiser4/super_ops.c linux-2.6.23/fs/reiser4/super_ops.c
65015--- linux-2.6.23.orig/fs/reiser4/super_ops.c 1970-01-01 03:00:00.000000000 +0300
65016+++ linux-2.6.23/fs/reiser4/super_ops.c 2007-12-04 22:57:33.998734400 +0300
65017@@ -0,0 +1,724 @@
65018+/* Copyright 2005 by Hans Reiser, licensing governed by
65019+ * reiser4/README */
65020+
65021+#include "inode.h"
65022+#include "page_cache.h"
65023+#include "ktxnmgrd.h"
65024+#include "flush.h"
65025+#include "safe_link.h"
65026+
65027+#include <linux/vfs.h>
65028+#include <linux/writeback.h>
65029+#include <linux/mount.h>
65030+#include <linux/seq_file.h>
65031+#include <linux/debugfs.h>
65032+
65033+/* slab cache for inodes */
65034+static struct kmem_cache *inode_cache;
65035+
65036+static struct dentry *reiser4_debugfs_root = NULL;
65037+
65038+/**
65039+ * init_once - constructor for reiser4 inodes
65040+ * @cache: cache @obj belongs to
65041+ * @obj: inode to be initialized
65042+ *
65043+ * Initialization function to be called when a new page is allocated by the
65044+ * reiser4 inode cache. It is set on inode cache creation.
65045+ */
65046+static void init_once(void *obj, struct kmem_cache *cache, unsigned long flags)
65047+{
65048+ struct reiser4_inode_object *info;
65049+
65050+ info = obj;
65051+
65052+ /* initialize vfs inode */
65053+ inode_init_once(&info->vfs_inode);
65054+
65055+ /*
65056+	 * initialize reiser4 specific part of inode.
65057+ * NOTE-NIKITA add here initializations for locks, list heads,
65058+ * etc. that will be added to our private inode part.
65059+ */
65060+ INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
65061+ init_rwsem(&info->p.conv_sem);
65062+ /* init semaphore which is used during inode loading */
65063+ loading_init_once(&info->p);
65064+ INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
65065+ GFP_ATOMIC);
65066+#if REISER4_DEBUG
65067+ info->p.nr_jnodes = 0;
65068+#endif
65069+}
65070+
65071+/**
65072+ * init_inodes - create inode cache
65073+ *
65074+ * Initializes slab cache of inodes. It is part of reiser4 module initialization.
65075+ */
65076+static int init_inodes(void)
65077+{
65078+ inode_cache = kmem_cache_create("reiser4_inode",
65079+ sizeof(struct reiser4_inode_object),
65080+ 0,
65081+ SLAB_HWCACHE_ALIGN |
65082+ SLAB_RECLAIM_ACCOUNT, init_once);
65083+ if (inode_cache == NULL)
65084+ return RETERR(-ENOMEM);
65085+ return 0;
65086+}
65087+
65088+/**
65089+ * done_inodes - delete inode cache
65090+ *
65091+ * This is called on reiser4 module unloading or system shutdown.
65092+ */
65093+static void done_inodes(void)
65094+{
65095+ destroy_reiser4_cache(&inode_cache);
65096+}
65097+
65098+/**
65099+ * reiser4_alloc_inode - alloc_inode of super operations
65100+ * @super: super block new inode is allocated for
65101+ *
65102+ * Allocates new inode, initializes reiser4 specific part of it.
65103+ */
65104+static struct inode *reiser4_alloc_inode(struct super_block *super)
65105+{
65106+ struct reiser4_inode_object *obj;
65107+
65108+ assert("nikita-1696", super != NULL);
65109+ obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get());
65110+ if (obj != NULL) {
65111+ reiser4_inode *info;
65112+
65113+ info = &obj->p;
65114+
65115+ info->pset = plugin_set_get_empty();
65116+ info->hset = plugin_set_get_empty();
65117+ info->extmask = 0;
65118+ info->locality_id = 0ull;
65119+ info->plugin_mask = 0;
65120+ info->heir_mask = 0;
65121+#if !REISER4_INO_IS_OID
65122+ info->oid_hi = 0;
65123+#endif
65124+ reiser4_seal_init(&info->sd_seal, NULL, NULL);
65125+ coord_init_invalid(&info->sd_coord, NULL);
65126+ info->flags = 0;
65127+ spin_lock_init(&info->guard);
65128+ /* this deals with info's loading semaphore */
65129+ loading_alloc(info);
65130+ info->vroot = UBER_TREE_ADDR;
65131+ return &obj->vfs_inode;
65132+ } else
65133+ return NULL;
65134+}
65135+
65136+/**
65137+ * reiser4_destroy_inode - destroy_inode of super operations
65138+ * @inode: inode being destroyed
65139+ *
65140+ * Puts reiser4 specific portion of inode, frees memory occupied by inode.
65141+ */
65142+static void reiser4_destroy_inode(struct inode *inode)
65143+{
65144+ reiser4_inode *info;
65145+
65146+ info = reiser4_inode_data(inode);
65147+
65148+ assert("vs-1220", inode_has_no_jnodes(info));
65149+
65150+ if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
65151+ file_plugin *fplug = inode_file_plugin(inode);
65152+ if (fplug->destroy_inode != NULL)
65153+ fplug->destroy_inode(inode);
65154+ }
65155+ reiser4_dispose_cursors(inode);
65156+ if (info->pset)
65157+ plugin_set_put(info->pset);
65158+ if (info->hset)
65159+ plugin_set_put(info->hset);
65160+
65161+ /*
65162+	 * cannot add a similar assertion about ->i_list, as prune_icache returns
65163+	 * the inode into the slab with dangling ->list.{next,prev}. This is safe,
65164+	 * because they are re-initialized in new_inode().
65165+ */
65166+ assert("nikita-2895", list_empty(&inode->i_dentry));
65167+ assert("nikita-2896", hlist_unhashed(&inode->i_hash));
65168+ assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
65169+
65170+ /* this deals with info's loading semaphore */
65171+ loading_destroy(info);
65172+
65173+ kmem_cache_free(inode_cache,
65174+ container_of(info, struct reiser4_inode_object, p));
65175+}
65176+
65177+/**
65178+ * reiser4_dirty_inode - dirty_inode of super operations
65179+ * @inode: inode being dirtied
65180+ *
65181+ * Updates stat data.
65182+ */
65183+static void reiser4_dirty_inode(struct inode *inode)
65184+{
65185+ int result;
65186+
65187+ if (!is_in_reiser4_context())
65188+ return;
65189+ assert("", !IS_RDONLY(inode));
65190+ assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
65191+ get_current_context()->grabbed_blocks));
65192+
65193+ result = reiser4_update_sd(inode);
65194+ if (result)
65195+ warning("", "failed to dirty inode for %llu: %d",
65196+ get_inode_oid(inode), result);
65197+}
65198+
65199+/**
65200+ * reiser4_delete_inode - delete_inode of super operations
65201+ * @inode: inode to delete
65202+ *
65203+ * Calls file plugin's delete_object method to delete object items from
65204+ * filesystem tree and calls clear_inode.
65205+ */
65206+static void reiser4_delete_inode(struct inode *inode)
65207+{
65208+ reiser4_context *ctx;
65209+ file_plugin *fplug;
65210+
65211+ ctx = reiser4_init_context(inode->i_sb);
65212+ if (IS_ERR(ctx)) {
65213+ warning("vs-15", "failed to init context");
65214+ return;
65215+ }
65216+
65217+ if (is_inode_loaded(inode)) {
65218+ fplug = inode_file_plugin(inode);
65219+ if (fplug != NULL && fplug->delete_object != NULL)
65220+ fplug->delete_object(inode);
65221+ }
65222+
65223+ truncate_inode_pages(&inode->i_data, 0);
65224+ inode->i_blocks = 0;
65225+ clear_inode(inode);
65226+ reiser4_exit_context(ctx);
65227+}
65228+
65229+/**
65230+ * reiser4_put_super - put_super of super operations
65231+ * @super: super block to free
65232+ *
65233+ * Stops daemons and releases resources; in short, it umounts.
65234+ */
65235+static void reiser4_put_super(struct super_block *super)
65236+{
65237+ reiser4_super_info_data *sbinfo;
65238+ reiser4_context *ctx;
65239+
65240+ sbinfo = get_super_private(super);
65241+ assert("vs-1699", sbinfo);
65242+
65243+ debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
65244+ debugfs_remove(sbinfo->tmgr.debugfs_id_count);
65245+ debugfs_remove(sbinfo->debugfs_root);
65246+
65247+ ctx = reiser4_init_context(super);
65248+ if (IS_ERR(ctx)) {
65249+ warning("vs-17", "failed to init context");
65250+ return;
65251+ }
65252+
65253+	/* let the disk format plugin free its resources */
65254+ if (get_super_private(super)->df_plug->release)
65255+ get_super_private(super)->df_plug->release(super);
65256+
65257+ reiser4_done_formatted_fake(super);
65258+
65259+ /* stop daemons: ktxnmgr and entd */
65260+ reiser4_done_entd(super);
65261+ reiser4_done_ktxnmgrd(super);
65262+ reiser4_done_txnmgr(&sbinfo->tmgr);
65263+
65264+ reiser4_done_fs_info(super);
65265+ reiser4_exit_context(ctx);
65266+}
65267+
65268+/**
65269+ * reiser4_write_super - write_super of super operations
65270+ * @super: super block to write
65271+ *
65272+ * Captures the znode associated with the super block and commits all transactions.
65273+ */
65274+static void reiser4_write_super(struct super_block *super)
65275+{
65276+ int ret;
65277+ reiser4_context *ctx;
65278+
65279+ assert("vs-1700", !rofs_super(super));
65280+
65281+ ctx = reiser4_init_context(super);
65282+ if (IS_ERR(ctx)) {
65283+ warning("vs-16", "failed to init context");
65284+ return;
65285+ }
65286+
65287+ ret = reiser4_capture_super_block(super);
65288+ if (ret != 0)
65289+ warning("vs-1701",
65290+ "reiser4_capture_super_block failed in write_super: %d",
65291+ ret);
65292+ ret = txnmgr_force_commit_all(super, 0);
65293+ if (ret != 0)
65294+ warning("jmacd-77113",
65295+ "txn_force failed in write_super: %d", ret);
65296+
65297+ super->s_dirt = 0;
65298+
65299+ reiser4_exit_context(ctx);
65300+}
65301+
65302+/**
65303+ * reiser4_statfs - statfs of super operations
65304+ * @super: super block of the queried file system
65305+ * @statfs: buffer to fill with statistics
65306+ *
65307+ * Returns information about filesystem.
65308+ */
65309+static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
65310+{
65311+ sector_t total;
65312+ sector_t reserved;
65313+ sector_t free;
65314+ sector_t forroot;
65315+ sector_t deleted;
65316+ reiser4_context *ctx;
65317+ struct super_block *super = dentry->d_sb;
65318+
65319+ assert("nikita-408", super != NULL);
65320+ assert("nikita-409", statfs != NULL);
65321+
65322+ ctx = reiser4_init_context(super);
65323+ if (IS_ERR(ctx))
65324+ return PTR_ERR(ctx);
65325+
65326+ statfs->f_type = reiser4_statfs_type(super);
65327+ statfs->f_bsize = super->s_blocksize;
65328+
65329+ /*
65330+ * 5% of total block space is reserved. This is needed for flush and
65331+ * for truncates (so that we are able to perform truncate/unlink even
65332+ * on the otherwise completely full file system). If this reservation
65333+ * is hidden from statfs(2), users will mistakenly guess that they
65334+ * have enough free space to complete some operation, which is
65335+ * frustrating.
65336+ *
65337+ * Another possible solution is to subtract ->blocks_reserved from
65338+ * ->f_bfree, but changing available space seems less intrusive than
65339+ * letting the user see 5% of the disk space consumed directly after
65340+ * mkfs.
65341+ */
65342+ total = reiser4_block_count(super);
65343+ reserved = get_super_private(super)->blocks_reserved;
65344+ deleted = txnmgr_count_deleted_blocks();
65345+ free = reiser4_free_blocks(super) + deleted;
65346+ forroot = reiser4_reserved_blocks(super, 0, 0);
65347+
65348+ /*
65349+ * These counters may be in inconsistent state because we take the
65350+ * values without keeping any global spinlock. Here we do a sanity
65351+ * check that free block counter does not exceed the number of all
65352+ * blocks.
65353+ */
65354+ if (free > total)
65355+ free = total;
65356+ statfs->f_blocks = total - reserved;
65357+ /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
65358+ if (free > reserved)
65359+ free -= reserved;
65360+ else
65361+ free = 0;
65362+ statfs->f_bfree = free;
65363+
65364+ if (free > forroot)
65365+ free -= forroot;
65366+ else
65367+ free = 0;
65368+ statfs->f_bavail = free;
65369+
65370+ statfs->f_files = 0;
65371+ statfs->f_ffree = 0;
65372+
65373+ /* maximal acceptable name length depends on directory plugin. */
65374+ assert("nikita-3351", super->s_root->d_inode != NULL);
65375+ statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
65376+ reiser4_exit_context(ctx);
65377+ return 0;
65378+}
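To make the clamping logic above concrete, a worked example with hypothetical numbers:

/*
 * Worked example (all numbers hypothetical):
 *	total = 1000, reserved = 50, deleted = 20,
 *	reiser4_free_blocks() = 880, forroot = 10
 *
 *	free     = 880 + 20 = 900 (clamped to total if racy counters made it larger)
 *	f_blocks = 1000 - 50 = 950
 *	f_bfree  = 900 - 50  = 850
 *	f_bavail = 850 - 10  = 840
 */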
65379+
65380+/**
65381+ * reiser4_clear_inode - clear_inode of super operation
65382+ * @inode: inode about to destroy
65383+ *
65384+ * Does sanity checks: the inode being destroyed should have all jnodes detached.
65385+ */
65386+static void reiser4_clear_inode(struct inode *inode)
65387+{
65388+#if REISER4_DEBUG
65389+ reiser4_inode *r4_inode;
65390+
65391+ r4_inode = reiser4_inode_data(inode);
65392+ if (!inode_has_no_jnodes(r4_inode))
65393+ warning("vs-1732", "reiser4 inode has %ld jnodes\n",
65394+ r4_inode->nr_jnodes);
65395+#endif
65396+}
65397+
65398+/**
65399+ * reiser4_sync_inodes - sync_inodes of super operations
65400+ * @super: super block to synchronize
65401+ * @wbc: writeback control
65402+ *
65403+ * This method is called by background and non-background writeback. Reiser4's
65404+ * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
65405+ * each dirty inode. Reiser4_writepages handles pages dirtied via shared
65406+ * mapping - dirty pages get into atoms. Writeout is called to flush some
65407+ * atoms.
65408+ */
65409+static void reiser4_sync_inodes(struct super_block *super,
65410+ struct writeback_control *wbc)
65411+{
65412+ reiser4_context *ctx;
65413+ long to_write;
65414+
65415+ if (wbc->for_kupdate)
65416+ /* reiser4 has its own means of periodical write-out */
65417+ return;
65418+
65419+ to_write = wbc->nr_to_write;
65420+ assert("vs-49", wbc->older_than_this == NULL);
65421+
65422+ ctx = reiser4_init_context(super);
65423+ if (IS_ERR(ctx)) {
65424+ warning("vs-13", "failed to init context");
65425+ return;
65426+ }
65427+
65428+ /*
65429+	 * call reiser4_writepages for each dirty inode to capture dirty pages
65430+	 * into transactions if they are not captured yet.
65431+ */
65432+ generic_sync_sb_inodes(super, wbc);
65433+
65434+ /* flush goes here */
65435+ wbc->nr_to_write = to_write;
65436+ reiser4_writeout(super, wbc);
65437+
65438+ /* avoid recursive calls to ->sync_inodes */
65439+ context_set_commit_async(ctx);
65440+ reiser4_exit_context(ctx);
65441+}
65442+
65443+/**
65444+ * reiser4_show_options - show_options of super operations
65445+ * @m: file where to write information
65446+ * @mnt: mount structure
65447+ *
65448+ * Makes reiser4 mount options visible in /proc/mounts.
65449+ */
65450+static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
65451+{
65452+ struct super_block *super;
65453+ reiser4_super_info_data *sbinfo;
65454+
65455+ super = mnt->mnt_sb;
65456+ sbinfo = get_super_private(super);
65457+
65458+ seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
65459+ seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
65460+ seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
65461+ seq_printf(m, ",atom_max_flushers=0x%x",
65462+ sbinfo->tmgr.atom_max_flushers);
65463+ seq_printf(m, ",cbk_cache_slots=0x%x",
65464+ sbinfo->tree.cbk_cache.nr_slots);
65465+
65466+ return 0;
65467+}
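For illustration, the seq_printf calls above append something like the following to this filesystem's /proc/mounts entry (all values hypothetical):

/*
 *	/dev/sdb1 /mnt reiser4 rw,atom_max_size=0x3e8,atom_max_age=0x258,
 *	atom_min_size=0x100,atom_max_flushers=0x0,cbk_cache_slots=0x10 0 0
 *
 * (wrapped here for readability; the real entry is a single line)
 */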
65468+
65469+struct super_operations reiser4_super_operations = {
65470+ .alloc_inode = reiser4_alloc_inode,
65471+ .destroy_inode = reiser4_destroy_inode,
65472+ .dirty_inode = reiser4_dirty_inode,
65473+ .delete_inode = reiser4_delete_inode,
65474+ .put_super = reiser4_put_super,
65475+ .write_super = reiser4_write_super,
65476+ .statfs = reiser4_statfs,
65477+ .clear_inode = reiser4_clear_inode,
65478+ .sync_inodes = reiser4_sync_inodes,
65479+ .show_options = reiser4_show_options
65480+};
65481+
65482+/**
65483+ * fill_super - initialize super block on mount
65484+ * @super: super block to fill
65485+ * @data: reiser4 specific mount option
65486+ * @silent: when set, suppress error messages during mount
65487+ *
65488+ * This is to be called by reiser4_get_sb. Mounts filesystem.
65489+ */
65490+static int fill_super(struct super_block *super, void *data, int silent)
65491+{
65492+ reiser4_context ctx;
65493+ int result;
65494+ reiser4_super_info_data *sbinfo;
65495+
65496+ assert("zam-989", super != NULL);
65497+
65498+ super->s_op = NULL;
65499+ init_stack_context(&ctx, super);
65500+
65501+ /* allocate reiser4 specific super block */
65502+ if ((result = reiser4_init_fs_info(super)) != 0)
65503+ goto failed_init_sinfo;
65504+
65505+ sbinfo = get_super_private(super);
65506+ /* initialize various reiser4 parameters, parse mount options */
65507+ if ((result = reiser4_init_super_data(super, data)) != 0)
65508+ goto failed_init_super_data;
65509+
65510+ /* read reiser4 master super block, initialize disk format plugin */
65511+ if ((result = reiser4_init_read_super(super, silent)) != 0)
65512+ goto failed_init_read_super;
65513+
65514+ /* initialize transaction manager */
65515+ reiser4_init_txnmgr(&sbinfo->tmgr);
65516+
65517+ /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */
65518+ if ((result = reiser4_init_ktxnmgrd(super)) != 0)
65519+ goto failed_init_ktxnmgrd;
65520+
65521+ /* initialize entd context and start kernel thread entd */
65522+ if ((result = reiser4_init_entd(super)) != 0)
65523+ goto failed_init_entd;
65524+
65525+ /* initialize address spaces for formatted nodes and bitmaps */
65526+ if ((result = reiser4_init_formatted_fake(super)) != 0)
65527+ goto failed_init_formatted_fake;
65528+
65529+ /* initialize disk format plugin */
65530+ if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 )
65531+ goto failed_init_disk_format;
65532+
65533+ /*
65534+ * There are some 'committed' versions of reiser4 super block counters,
65535+ * which correspond to reiser4 on-disk state. These counters are
65536+ * initialized here
65537+ */
65538+ sbinfo->blocks_free_committed = sbinfo->blocks_free;
65539+ sbinfo->nr_files_committed = oids_used(super);
65540+
65541+ /* get inode of root directory */
65542+ if ((result = reiser4_init_root_inode(super)) != 0)
65543+ goto failed_init_root_inode;
65544+
65545+ if ((result = get_super_private(super)->df_plug->version_update(super)) != 0 )
65546+ goto failed_update_format_version;
65547+
65548+ process_safelinks(super);
65549+ reiser4_exit_context(&ctx);
65550+
65551+ sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
65552+ reiser4_debugfs_root);
65553+ if (sbinfo->debugfs_root) {
65554+ sbinfo->tmgr.debugfs_atom_count =
65555+ debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
65556+ sbinfo->debugfs_root,
65557+ &sbinfo->tmgr.atom_count);
65558+ sbinfo->tmgr.debugfs_id_count =
65559+ debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
65560+ sbinfo->debugfs_root,
65561+ &sbinfo->tmgr.id_count);
65562+ }
65563+ return 0;
65564+
65565+ failed_update_format_version:
65566+ failed_init_root_inode:
65567+ if (sbinfo->df_plug->release)
65568+ sbinfo->df_plug->release(super);
65569+ failed_init_disk_format:
65570+ reiser4_done_formatted_fake(super);
65571+ failed_init_formatted_fake:
65572+ reiser4_done_entd(super);
65573+ failed_init_entd:
65574+ reiser4_done_ktxnmgrd(super);
65575+ failed_init_ktxnmgrd:
65576+ reiser4_done_txnmgr(&sbinfo->tmgr);
65577+ failed_init_read_super:
65578+ failed_init_super_data:
65579+ reiser4_done_fs_info(super);
65580+ failed_init_sinfo:
65581+ reiser4_exit_context(&ctx);
65582+ return result;
65583+}
65584+
65585+/**
65586+ * reiser4_get_sb - get_sb of file_system_type operations
65587+ * @fs_type: file system type structure passed by the VFS
65588+ * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
65589+ * @dev_name: block device file name
65590+ * @data: specific mount options
65591+ *
65592+ * Reiser4 mount entry.
65593+ */
65594+static int reiser4_get_sb(struct file_system_type *fs_type, int flags,
65595+ const char *dev_name, void *data, struct vfsmount *mnt)
65596+{
65597+ return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
65598+}
65599+
65600+/* structure describing the reiser4 filesystem implementation */
65601+static struct file_system_type reiser4_fs_type = {
65602+ .owner = THIS_MODULE,
65603+ .name = "reiser4",
65604+ .fs_flags = FS_REQUIRES_DEV,
65605+ .get_sb = reiser4_get_sb,
65606+ .kill_sb = kill_block_super,
65607+ .next = NULL
65608+};
65609+
65610+void destroy_reiser4_cache(struct kmem_cache **cachep)
65611+{
65612+ BUG_ON(*cachep == NULL);
65613+ kmem_cache_destroy(*cachep);
65614+ *cachep = NULL;
65615+}
65616+
65617+/**
65618+ * init_reiser4 - reiser4 initialization entry point
65619+ *
65620+ * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
65621+ * on kernel initialization or during reiser4 module load.
65622+ */
65623+static int __init init_reiser4(void)
65624+{
65625+ int result;
65626+
65627+ printk(KERN_INFO
65628+ "Loading Reiser4. "
65629+ "See www.namesys.com for a description of Reiser4.\n");
65630+
65631+ /* initialize slab cache of inodes */
65632+ if ((result = init_inodes()) != 0)
65633+ goto failed_inode_cache;
65634+
65635+ /* initialize cache of znodes */
65636+ if ((result = init_znodes()) != 0)
65637+ goto failed_init_znodes;
65638+
65639+ /* initialize all plugins */
65640+ if ((result = init_plugins()) != 0)
65641+ goto failed_init_plugins;
65642+
65643+ /* initialize cache of plugin_set-s and plugin_set's hash table */
65644+ if ((result = init_plugin_set()) != 0)
65645+ goto failed_init_plugin_set;
65646+
65647+ /* initialize caches of txn_atom-s and txn_handle-s */
65648+ if ((result = init_txnmgr_static()) != 0)
65649+ goto failed_init_txnmgr_static;
65650+
65651+ /* initialize cache of jnodes */
65652+ if ((result = init_jnodes()) != 0)
65653+ goto failed_init_jnodes;
65654+
65655+ /* initialize cache of flush queues */
65656+ if ((result = reiser4_init_fqs()) != 0)
65657+ goto failed_init_fqs;
65658+
65659+ /* initialize cache of structures attached to dentry->d_fsdata */
65660+ if ((result = reiser4_init_dentry_fsdata()) != 0)
65661+ goto failed_init_dentry_fsdata;
65662+
65663+ /* initialize cache of structures attached to file->private_data */
65664+ if ((result = reiser4_init_file_fsdata()) != 0)
65665+ goto failed_init_file_fsdata;
65666+
65667+ /*
65668+ * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
65669+ * more details
65670+ */
65671+ if ((result = reiser4_init_d_cursor()) != 0)
65672+ goto failed_init_d_cursor;
65673+
65674+ if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
65675+ reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
65676+ return 0;
65677+ }
65678+
65679+ reiser4_done_d_cursor();
65680+ failed_init_d_cursor:
65681+ reiser4_done_file_fsdata();
65682+ failed_init_file_fsdata:
65683+ reiser4_done_dentry_fsdata();
65684+ failed_init_dentry_fsdata:
65685+ reiser4_done_fqs();
65686+ failed_init_fqs:
65687+ done_jnodes();
65688+ failed_init_jnodes:
65689+ done_txnmgr_static();
65690+ failed_init_txnmgr_static:
65691+ done_plugin_set();
65692+ failed_init_plugin_set:
65693+ failed_init_plugins:
65694+ done_znodes();
65695+ failed_init_znodes:
65696+ done_inodes();
65697+ failed_inode_cache:
65698+ return result;
65699+}
65700+
65701+/**
65702+ * done_reiser4 - reiser4 exit entry point
65703+ *
65704+ * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown
65705+ * or at module unload.
65706+ */
65707+static void __exit done_reiser4(void)
65708+{
65709+ int result;
65710+
65711+ debugfs_remove(reiser4_debugfs_root);
65712+ result = unregister_filesystem(&reiser4_fs_type);
65713+ BUG_ON(result != 0);
65714+ reiser4_done_d_cursor();
65715+ reiser4_done_file_fsdata();
65716+ reiser4_done_dentry_fsdata();
65717+ reiser4_done_fqs();
65718+ done_jnodes();
65719+ done_txnmgr_static();
65720+ done_plugin_set();
65721+ done_znodes();
65722+ destroy_reiser4_cache(&inode_cache);
65723+}
65724+
65725+module_init(init_reiser4);
65726+module_exit(done_reiser4);
65727+
65728+MODULE_DESCRIPTION("Reiser4 filesystem");
65729+MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
65730+
65731+MODULE_LICENSE("GPL");
65732+
65733+/*
65734+ * Local variables:
65735+ * c-indentation-style: "K&R"
65736+ * mode-name: "LC"
65737+ * c-basic-offset: 8
65738+ * tab-width: 8
65739+ * fill-column: 79
65740+ * End:
65741+ */
65742diff -urN linux-2.6.23.orig/fs/reiser4/tap.c linux-2.6.23/fs/reiser4/tap.c
65743--- linux-2.6.23.orig/fs/reiser4/tap.c 1970-01-01 03:00:00.000000000 +0300
65744+++ linux-2.6.23/fs/reiser4/tap.c 2007-12-04 16:49:30.000000000 +0300
65745@@ -0,0 +1,377 @@
65746+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
65747+ * reiser4/README */
65748+
65749+/*
65750+ Tree Access Pointer (tap).
65751+
65752+ tap is a data structure combining a coord and a lock handle (mostly). It is
65753+ useful when one has to scan tree nodes (for example, in readdir or flush),
65754+ because tap functions allow moving the tap in either direction, transparently
65755+ crossing unit/item/node borders.
65756+
65757+ Tap doesn't provide automatic synchronization of its fields, as it is
65758+ supposed to be a per-thread object.
65759+*/
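A lifecycle sketch with a hypothetical caller (declarations come from tap.h; every function used is defined below in this file):

static int example_walk_one_unit(coord_t *coord, lock_handle *lh)
{
	tap_t tap;
	int result;

	reiser4_tap_init(&tap, coord, lh, ZNODE_READ_LOCK);
	result = reiser4_tap_load(&tap);	/* zload the node */
	if (result == 0) {
		result = go_next_unit(&tap);	/* may cross item/node borders */
		reiser4_tap_relse(&tap);	/* undo the load */
	}
	reiser4_tap_done(&tap);			/* drop the lock handle */
	return result;
}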
65760+
65761+#include "forward.h"
65762+#include "debug.h"
65763+#include "coord.h"
65764+#include "tree.h"
65765+#include "context.h"
65766+#include "tap.h"
65767+#include "znode.h"
65768+#include "tree_walk.h"
65769+
65770+#if REISER4_DEBUG
65771+static int tap_invariant(const tap_t * tap);
65772+static void tap_check(const tap_t * tap);
65773+#else
65774+#define tap_check(tap) noop
65775+#endif
65776+
65777+/** load node tap is pointing to, if not loaded already */
65778+int reiser4_tap_load(tap_t * tap)
65779+{
65780+ tap_check(tap);
65781+ if (tap->loaded == 0) {
65782+ int result;
65783+
65784+ result = zload_ra(tap->coord->node, &tap->ra_info);
65785+ if (result != 0)
65786+ return result;
65787+ coord_clear_iplug(tap->coord);
65788+ }
65789+ ++tap->loaded;
65790+ tap_check(tap);
65791+ return 0;
65792+}
65793+
65794+/** release node tap is pointing to. Dual to tap_load() */
65795+void reiser4_tap_relse(tap_t * tap)
65796+{
65797+ tap_check(tap);
65798+ if (tap->loaded > 0) {
65799+ --tap->loaded;
65800+ if (tap->loaded == 0) {
65801+ zrelse(tap->coord->node);
65802+ }
65803+ }
65804+ tap_check(tap);
65805+}
65806+
65807+/**
65808+ * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
65809+ * @mode
65810+ */
65811+void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
65812+ znode_lock_mode mode)
65813+{
65814+ tap->coord = coord;
65815+ tap->lh = lh;
65816+ tap->mode = mode;
65817+ tap->loaded = 0;
65818+ INIT_LIST_HEAD(&tap->linkage);
65819+ reiser4_init_ra_info(&tap->ra_info);
65820+}
65821+
65822+/** add @tap to the per-thread list of all taps */
65823+void reiser4_tap_monitor(tap_t * tap)
65824+{
65825+ assert("nikita-2623", tap != NULL);
65826+ tap_check(tap);
65827+ list_add(&tap->linkage, reiser4_taps_list());
65828+ tap_check(tap);
65829+}
65830+
65831+/* duplicate @src into @dst. Copy lock handle. @dst is not initially
65832+ * loaded. */
65833+void reiser4_tap_copy(tap_t * dst, tap_t * src)
65834+{
65835+ assert("nikita-3193", src != NULL);
65836+ assert("nikita-3194", dst != NULL);
65837+
65838+ *dst->coord = *src->coord;
65839+ if (src->lh->node)
65840+ copy_lh(dst->lh, src->lh);
65841+ dst->mode = src->mode;
65842+ dst->loaded = 0;
65843+ INIT_LIST_HEAD(&dst->linkage);
65844+ dst->ra_info = src->ra_info;
65845+}
65846+
65847+/** finish with @tap */
65848+void reiser4_tap_done(tap_t * tap)
65849+{
65850+ assert("nikita-2565", tap != NULL);
65851+ tap_check(tap);
65852+ if (tap->loaded > 0)
65853+ zrelse(tap->coord->node);
65854+ done_lh(tap->lh);
65855+ tap->loaded = 0;
65856+ list_del_init(&tap->linkage);
65857+ tap->coord->node = NULL;
65858+}
65859+
65860+/**
65861+ * move @tap to the new node, locked with @target. Load @target, if @tap was
65862+ * already loaded.
65863+ */
65864+int reiser4_tap_move(tap_t * tap, lock_handle * target)
65865+{
65866+ int result = 0;
65867+
65868+ assert("nikita-2567", tap != NULL);
65869+ assert("nikita-2568", target != NULL);
65870+ assert("nikita-2570", target->node != NULL);
65871+ assert("nikita-2569", tap->coord->node == tap->lh->node);
65872+
65873+ tap_check(tap);
65874+ if (tap->loaded > 0)
65875+ result = zload_ra(target->node, &tap->ra_info);
65876+
65877+ if (result == 0) {
65878+ if (tap->loaded > 0)
65879+ zrelse(tap->coord->node);
65880+ done_lh(tap->lh);
65881+ copy_lh(tap->lh, target);
65882+ tap->coord->node = target->node;
65883+ coord_clear_iplug(tap->coord);
65884+ }
65885+ tap_check(tap);
65886+ return result;
65887+}
65888+
65889+/**
65890+ * move @tap to @target. A long-term lock on @target is acquired when it
65891+ * differs from the node the tap is currently at.
65892+ */
65893+static int tap_to(tap_t * tap, znode * target)
65894+{
65895+ int result;
65896+
65897+ assert("nikita-2624", tap != NULL);
65898+ assert("nikita-2625", target != NULL);
65899+
65900+ tap_check(tap);
65901+ result = 0;
65902+ if (tap->coord->node != target) {
65903+ lock_handle here;
65904+
65905+ init_lh(&here);
65906+ result = longterm_lock_znode(&here, target,
65907+ tap->mode, ZNODE_LOCK_HIPRI);
65908+ if (result == 0) {
65909+ result = reiser4_tap_move(tap, &here);
65910+ done_lh(&here);
65911+ }
65912+ }
65913+ tap_check(tap);
65914+ return result;
65915+}
65916+
65917+/**
65918+ * move @tap to given @target, loading and locking @target->node if
65919+ * necessary
65920+ */
65921+int tap_to_coord(tap_t * tap, coord_t * target)
65922+{
65923+ int result;
65924+
65925+ tap_check(tap);
65926+ result = tap_to(tap, target->node);
65927+ if (result == 0)
65928+ coord_dup(tap->coord, target);
65929+ tap_check(tap);
65930+ return result;
65931+}
65932+
65933+/** return list of all taps */
65934+struct list_head *reiser4_taps_list(void)
65935+{
65936+ return &get_current_context()->taps;
65937+}
65938+
65939+/** helper function for go_{next,prev}_{item,unit,node}() */
65940+int go_dir_el(tap_t * tap, sideof dir, int units_p)
65941+{
65942+ coord_t dup;
65943+ coord_t *coord;
65944+ int result;
65945+
65946+ int (*coord_dir) (coord_t *);
65947+ int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
65948+ void (*coord_init) (coord_t *, const znode *);
65949+ ON_DEBUG(int (*coord_check) (const coord_t *));
65950+
65951+ assert("nikita-2556", tap != NULL);
65952+ assert("nikita-2557", tap->coord != NULL);
65953+ assert("nikita-2558", tap->lh != NULL);
65954+ assert("nikita-2559", tap->coord->node != NULL);
65955+
65956+ tap_check(tap);
65957+ if (dir == LEFT_SIDE) {
65958+ coord_dir = units_p ? coord_prev_unit : coord_prev_item;
65959+ get_dir_neighbor = reiser4_get_left_neighbor;
65960+ coord_init = coord_init_last_unit;
65961+ } else {
65962+ coord_dir = units_p ? coord_next_unit : coord_next_item;
65963+ get_dir_neighbor = reiser4_get_right_neighbor;
65964+ coord_init = coord_init_first_unit;
65965+ }
65966+ ON_DEBUG(coord_check =
65967+ units_p ? coord_is_existing_unit : coord_is_existing_item);
65968+ assert("nikita-2560", coord_check(tap->coord));
65969+
65970+ coord = tap->coord;
65971+ coord_dup(&dup, coord);
65972+ if (coord_dir(&dup) != 0) {
65973+ do {
65974+			/* move to the neighboring node in the given direction */
65975+ lock_handle dup;
65976+
65977+ init_lh(&dup);
65978+ result =
65979+ get_dir_neighbor(&dup, coord->node, (int)tap->mode,
65980+ GN_CAN_USE_UPPER_LEVELS);
65981+ if (result == 0) {
65982+ result = reiser4_tap_move(tap, &dup);
65983+ if (result == 0)
65984+ coord_init(tap->coord, dup.node);
65985+ done_lh(&dup);
65986+ }
65987+ /* skip empty nodes */
65988+ } while ((result == 0) && node_is_empty(coord->node));
65989+ } else {
65990+ result = 0;
65991+ coord_dup(coord, &dup);
65992+ }
65993+ assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
65994+ tap_check(tap);
65995+ return result;
65996+}
65997+
65998+/**
65999+ * move @tap to the next unit, transparently crossing item and node
66000+ * boundaries
66001+ */
66002+int go_next_unit(tap_t * tap)
66003+{
66004+ return go_dir_el(tap, RIGHT_SIDE, 1);
66005+}
66006+
66007+/**
66008+ * move @tap to the previous unit, transparently crossing item and node
66009+ * boundaries
66010+ */
66011+int go_prev_unit(tap_t * tap)
66012+{
66013+ return go_dir_el(tap, LEFT_SIDE, 1);
66014+}
66015+
66016+/**
66017+ * Apply @actor to @tap @shift times. This is used to move @tap by
66018+ * @shift units (or items, or nodes) in either direction.
66019+ */
66020+static int rewind_to(tap_t * tap, go_actor_t actor, int shift)
66021+{
66022+ int result;
66023+
66024+ assert("nikita-2555", shift >= 0);
66025+ assert("nikita-2562", tap->coord->node == tap->lh->node);
66026+
66027+ tap_check(tap);
66028+ result = reiser4_tap_load(tap);
66029+ if (result != 0)
66030+ return result;
66031+
66032+ for (; shift > 0; --shift) {
66033+ result = actor(tap);
66034+ assert("nikita-2563", tap->coord->node == tap->lh->node);
66035+ if (result != 0)
66036+ break;
66037+ }
66038+ reiser4_tap_relse(tap);
66039+ tap_check(tap);
66040+ return result;
66041+}
66042+
66043+/** move @tap @shift units rightward */
66044+int rewind_right(tap_t * tap, int shift)
66045+{
66046+ return rewind_to(tap, go_next_unit, shift);
66047+}
66048+
66049+/** move @tap @shift units leftward */
66050+int rewind_left(tap_t * tap, int shift)
66051+{
66052+ return rewind_to(tap, go_prev_unit, shift);
66053+}
66054+
66055+#if REISER4_DEBUG
66056+/** debugging function: print @tap content in human readable form */
66057+static void print_tap(const char *prefix, const tap_t * tap)
66058+{
66059+ if (tap == NULL) {
66060+ printk("%s: null tap\n", prefix);
66061+ return;
66062+ }
66063+ printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
66064+ tap->loaded, (&tap->linkage == tap->linkage.next &&
66065+ &tap->linkage == tap->linkage.prev),
66066+ tap->lh->node,
66067+ lock_mode_name(tap->mode));
66068+ print_coord("\tcoord", tap->coord, 0);
66069+}
66070+
66071+/** check [tap-sane] invariant */
66072+static int tap_invariant(const tap_t * tap)
66073+{
66074+ /* [tap-sane] invariant */
66075+
66076+ if (tap == NULL)
66077+ return 1;
66078+ /* tap->mode is one of
66079+ *
66080+ * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
66081+ */
66082+ if (tap->mode != ZNODE_NO_LOCK &&
66083+ tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
66084+ return 2;
66085+ /* tap->coord != NULL, and */
66086+ if (tap->coord == NULL)
66087+ return 3;
66088+ /* tap->lh != NULL, and */
66089+ if (tap->lh == NULL)
66090+ return 4;
66091+ /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
66092+ if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
66093+ return 5;
66094+ /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
66095+ if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
66096+ return 6;
66097+ return 0;
66098+}
66099+
66100+/** debugging function: check internal @tap consistency */
66101+static void tap_check(const tap_t * tap)
66102+{
66103+ int result;
66104+
66105+ result = tap_invariant(tap);
66106+ if (result != 0) {
66107+ print_tap("broken", tap);
66108+ reiser4_panic("nikita-2831", "tap broken: %i\n", result);
66109+ }
66110+}
66111+#endif
66112+
66113+/* Make Linus happy.
66114+ Local variables:
66115+ c-indentation-style: "K&R"
66116+ mode-name: "LC"
66117+ c-basic-offset: 8
66118+ tab-width: 8
66119+ fill-column: 120
66120+ scroll-step: 1
66121+ End:
66122+*/
66123diff -urN linux-2.6.23.orig/fs/reiser4/tap.h linux-2.6.23/fs/reiser4/tap.h
66124--- linux-2.6.23.orig/fs/reiser4/tap.h 1970-01-01 03:00:00.000000000 +0300
66125+++ linux-2.6.23/fs/reiser4/tap.h 2007-12-04 16:49:30.000000000 +0300
66126@@ -0,0 +1,70 @@
66127+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
66128+
66129+/* Tree Access Pointers. See tap.c for more details. */
66130+
66131+#if !defined( __REISER4_TAP_H__ )
66132+#define __REISER4_TAP_H__
66133+
66134+#include "forward.h"
66135+#include "readahead.h"
66136+
66137+/**
66138+ tree_access_pointer aka tap. Data structure combining coord_t and lock
66139+ handle.
66140+ Invariants involving this data-type, see doc/lock-ordering for details:
66141+
66142+ [tap-sane]
66143+ */
66144+struct tree_access_pointer {
66145+ /* coord tap is at */
66146+ coord_t *coord;
66147+ /* lock handle on ->coord->node */
66148+ lock_handle *lh;
66149+ /* mode of lock acquired by this tap */
66150+ znode_lock_mode mode;
66151+ /* incremented by reiser4_tap_load().
66152+ Decremented by reiser4_tap_relse(). */
66153+ int loaded;
66154+ /* list of taps */
66155+ struct list_head linkage;
66156+ /* read-ahead hint */
66157+ ra_info_t ra_info;
66158+};
66159+
66160+typedef int (*go_actor_t) (tap_t * tap);
66161+
66162+extern int reiser4_tap_load(tap_t * tap);
66163+extern void reiser4_tap_relse(tap_t * tap);
66164+extern void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
66165+ znode_lock_mode mode);
66166+extern void reiser4_tap_monitor(tap_t * tap);
66167+extern void reiser4_tap_copy(tap_t * dst, tap_t * src);
66168+extern void reiser4_tap_done(tap_t * tap);
66169+extern int reiser4_tap_move(tap_t * tap, lock_handle * target);
66170+extern int tap_to_coord(tap_t * tap, coord_t * target);
66171+
66172+extern int go_dir_el(tap_t * tap, sideof dir, int units_p);
66173+extern int go_next_unit(tap_t * tap);
66174+extern int go_prev_unit(tap_t * tap);
66175+extern int rewind_right(tap_t * tap, int shift);
66176+extern int rewind_left(tap_t * tap, int shift);
66177+
66178+extern struct list_head *reiser4_taps_list(void);
66179+
66180+#define for_all_taps(tap) \
66181+ for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \
66182+ reiser4_taps_list() != &tap->linkage; \
66183+ tap = list_entry(tap->linkage.next, tap_t, linkage))
66184+
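A hypothetical use of the iterator above: count the taps the current thread has registered via reiser4_tap_monitor().

static inline int example_count_taps(void)
{
	tap_t *tap;
	int nr = 0;

	for_all_taps(tap)	/* walks &get_current_context()->taps */
		++nr;
	return nr;
}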
66185+/* __REISER4_TAP_H__ */
66186+#endif
66187+/* Make Linus happy.
66188+ Local variables:
66189+ c-indentation-style: "K&R"
66190+ mode-name: "LC"
66191+ c-basic-offset: 8
66192+ tab-width: 8
66193+ fill-column: 120
66194+ scroll-step: 1
66195+ End:
66196+*/
66197diff -urN linux-2.6.23.orig/fs/reiser4/tree.c linux-2.6.23/fs/reiser4/tree.c
66198--- linux-2.6.23.orig/fs/reiser4/tree.c 1970-01-01 03:00:00.000000000 +0300
66199+++ linux-2.6.23/fs/reiser4/tree.c 2007-12-04 16:49:30.000000000 +0300
66200@@ -0,0 +1,1876 @@
66201+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66202+ * reiser4/README */
66203+
66204+/*
66205+ * KEYS IN A TREE.
66206+ *
66207+ * The tree consists of nodes located on the disk. A node in the tree is either
66208+ * formatted or unformatted. A formatted node is one that has a structure
66209+ * understood by the tree balancing and traversal code. Formatted nodes are
66210+ * further classified into leaf and internal nodes. The latter distinction is
66211+ * (almost) of only historical importance: the general structure of leaves and
66212+ * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
66213+ * that are part of bodies of ordinary files and attributes.
66214+ *
66215+ * Each node in the tree spans some interval in the key space. Key ranges for
66216+ * all nodes in the tree are disjoint. Actually, this only holds in some weak
66217+ * sense, because of the non-unique keys: intersection of key ranges for
66218+ * different nodes is either empty, or consists of exactly one key.
66219+ *
66220+ * A formatted node consists of a sequence of items. Each item spans some
66221+ * interval in key space. Key ranges for all items in a tree are disjoint,
66222+ * modulo non-unique keys again. Items within nodes are ordered in the key
66223+ * order of the smallest key in an item.
66224+ *
66225+ * A particular type of item can be further split into units. A unit is a piece
66226+ * of an item that can be cut from the item and moved into another item of the
66227+ * same type. Units are used by the balancing code to repack data during balancing.
66228+ *
66229+ * Unit can be further split into smaller entities (for example, extent unit
66230+ * represents several pages, and it is natural for extent code to operate on
66231+ * particular pages and even bytes within one unit), but this is of no
66232+ * relevance to the generic balancing and lookup code.
66233+ *
66234+ * Although an item is said to "span" a range or interval of keys, it is not
66235+ * necessary that the item contains a piece of data addressable by each and every
66236+ * key in this range. For example, a compound directory item, consisting of
66237+ * units corresponding to directory entries and keyed by hashes of file names,
66238+ * looks more like having a "discrete spectrum": only some disjoint keys inside
66239+ * the range occupied by this item really address data.
66240+ *
66241+ * Nonetheless, each item always has a well-defined least (minimal) key, which
66242+ * is recorded in the item header, stored in the node this item is in. Also, the
66243+ * item plugin can optionally define a method ->max_key_inside() returning the
66244+ * maximal key that can _possibly_ be located within this item. This method is
66245+ * used (mainly) to determine when a given piece of data should be merged into an
66246+ * existing item, instead of creating a new one. Because of this, even though
66247+ * ->max_key_inside() can be larger than any key actually located in the item,
66248+ * intervals
66249+ *
66250+ * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
66251+ *
66252+ * are still disjoint for all items within the _same_ node.
66253+ *
66254+ * In memory, a node is represented by a znode. It plays several roles:
66255+ *
66256+ * . something locks are taken on
66257+ *
66258+ * . something tracked by transaction manager (this is going to change)
66259+ *
66260+ * . something used to access node data
66261+ *
66262+ * . something used to maintain tree structure in memory: sibling and
66263+ * parental linkage.
66264+ *
66265+ * . something used to organize nodes into "slums"
66266+ *
66267+ * More on znodes see in znode.[ch]
66268+ *
66269+ * DELIMITING KEYS
66270+ *
66271+ * To simplify balancing, allow some flexibility in locking and speed up
66272+ * important coord cache optimization, we keep delimiting keys of nodes in
66273+ * memory. Depending on disk format (implemented by appropriate node plugin)
66274+ * node on disk can record both left and right delimiting key, only one of
66275+ * them, or none. Still, our balancing and tree traversal code keep both
66276+ * delimiting keys for a node that is in memory stored in the znode. When
66277+ * node is first brought into memory during tree traversal, its left
66278+ * delimiting key is taken from its parent, and its right delimiting key is
66279+ * either next key in its parent, or is right delimiting key of parent if
66280+ * node is the rightmost child of parent.
66281+ *
66282+ * Physical consistency of delimiting key is protected by special dk
66283+ * read-write lock. That is, delimiting keys can only be inspected or
66284+ * modified under this lock. But dk lock is only sufficient for fast
66285+ * "pessimistic" check, because to simplify code and to decrease lock
66286+ * contention, balancing (carry) only updates delimiting keys right before
66287+ * unlocking all locked nodes on the given tree level. For example,
66288+ * coord-by-key cache scans LRU list of recently accessed znodes. For each
66289+ * node it first does fast check under dk spin lock. If key looked for is
66290+ * not between delimiting keys for this node, next node is inspected and so
66291+ * on. If key is inside of the key range, long term lock is taken on node
66292+ * and key range is rechecked.
66293+ *
66294+ * COORDINATES
66295+ *
66296+ * To find something in the tree, you supply a key, and the key is resolved
66297+ * by coord_by_key() into a coord (coordinate) that is valid as long as the
66298+ * node the coord points to remains locked. As mentioned above trees
66299+ * consist of nodes that consist of items that consist of units. A unit is
66300+ * the smallest and indivisible piece of tree as far as balancing and tree
66301+ * search are concerned. Each node, item, and unit can be addressed by
66302+ * giving its level in the tree and the key occupied by this entity. A node
66303+ * knows what the key ranges are of the items within it, and how to find its
66304+ * items and invoke their item handlers, but it does not know how to access
66305+ * individual units within its items except through the item handlers.
66306+ * coord is a structure containing a pointer to the node, the ordinal number
66307+ * of the item within this node (a sort of item offset), and the ordinal
66308+ * number of the unit within this item.
66309+ *
66310+ * TREE LOOKUP
66311+ *
66312+ * There are two types of access to the tree: lookup and modification.
66313+ *
66314+ * Lookup is a search for the key in the tree. Search can look for either
66315+ * exactly the key given to it, or for the largest key that is not greater
66316+ * than the key given to it. This distinction is determined by "bias"
66317+ * parameter of search routine (coord_by_key()). coord_by_key() either
66318+ * returns error (key is not in the tree, or some kind of external error
66319+ * occurred), or successfully resolves key into coord.
66320+ *
66321+ * This resolution is done by traversing tree top-to-bottom from root level
66322+ * to the desired level. On levels above twig level (level one above the
66323+ * leaf level) nodes consist exclusively of internal items. Internal item is
66324+ * nothing more than pointer to the tree node on the child level. On twig
66325+ * level nodes consist of internal items intermixed with extent
66326+ * items. Internal items form normal search tree structure used by traversal
66327+ * to descend through the tree.
66328+ *
66329+ * TREE LOOKUP OPTIMIZATIONS
66330+ *
66331+ * Tree lookup described above is expensive even if all nodes traversed are
66332+ * already in memory: for each node a binary search within it has to be
66333+ * performed, and binary searches are CPU-intensive and tend to destroy CPU
66334+ * caches.
66335+ *
66336+ * Several optimizations are used to work around this:
66337+ *
66338+ * . cbk_cache (look-aside cache for tree traversals, see search.c for
66339+ * details)
66340+ *
66341+ * . seals (see seal.[ch])
66342+ *
66343+ * . vroot (see search.c)
66344+ *
66345+ * General search-by-key is layered thusly:
66346+ *
66347+ * [check seal, if any] --ok--> done
66348+ * |
66349+ * failed
66350+ * |
66351+ * V
66352+ * [vroot defined] --no--> node = tree_root
66353+ * | |
66354+ * yes |
66355+ * | |
66356+ * V |
66357+ * node = vroot |
66358+ * | |
66359+ * | |
66360+ * | |
66361+ * V V
66362+ * [check cbk_cache for key] --ok--> done
66363+ * |
66364+ * failed
66365+ * |
66366+ * V
66367+ * [start tree traversal from node]
66368+ *
66369+ */
66370+
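Condensing the flowchart above into pseudo-C (helper names are hypothetical; the real entry point is coord_by_key(), with its helpers in search.c):

/*
 *	if (seal_is_valid(seal, key))
 *		return coord_from_seal(seal);		(seals, seal.[ch])
 *	node = vroot_defined(object) ? vroot : tree_root;
 *	if (cbk_cache_lookup(tree, key, &coord) == 0)
 *		return 0;				(cbk_cache, search.c)
 *	return traverse_from(node, key, &coord);
 */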
66371+#include "forward.h"
66372+#include "debug.h"
66373+#include "dformat.h"
66374+#include "key.h"
66375+#include "coord.h"
66376+#include "plugin/item/static_stat.h"
66377+#include "plugin/item/item.h"
66378+#include "plugin/node/node.h"
66379+#include "plugin/plugin.h"
66380+#include "txnmgr.h"
66381+#include "jnode.h"
66382+#include "znode.h"
66383+#include "block_alloc.h"
66384+#include "tree_walk.h"
66385+#include "carry.h"
66386+#include "carry_ops.h"
66387+#include "tap.h"
66388+#include "tree.h"
66389+#include "vfs_ops.h"
66390+#include "page_cache.h"
66391+#include "super.h"
66392+#include "reiser4.h"
66393+#include "inode.h"
66394+
66395+#include <linux/fs.h> /* for struct super_block */
66396+#include <linux/spinlock.h>
66397+
66398+/* Disk address (block number) never ever used for any real tree node. This is
66399+ used as block number of "uber" znode.
66400+
66401+ Invalid block addresses are 0 by tradition.
66402+
66403+*/
66404+const reiser4_block_nr UBER_TREE_ADDR = 0ull;
66405+
66406+#define CUT_TREE_MIN_ITERATIONS 64
66407+
66408+static int find_child_by_addr(znode * parent, znode * child, coord_t * result);
66409+
66410+/* return node plugin of coord->node */
66411+node_plugin *node_plugin_by_coord(const coord_t * coord)
66412+{
66413+ assert("vs-1", coord != NULL);
66414+ assert("vs-2", coord->node != NULL);
66415+
66416+ return coord->node->nplug;
66417+}
66418+
66419+/* insert item into tree. Fields of @coord are updated so that they can be
66420+ * used by a subsequent insert operation. */
66421+insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item
66422+ * into */ ,
66423+ const reiser4_key * key /* key of new item */ ,
66424+ reiser4_item_data * data /* parameters for item
66425+ * creation */ ,
66426+ coord_t * coord /* resulting insertion coord */ ,
66427+ lock_handle * lh /* resulting lock
66428+ * handle */ ,
66429+			    tree_level stop_level /* level where to insert */ ,
66430+ __u32 flags /* insertion flags */ )
66431+{
66432+ int result;
66433+
66434+ assert("nikita-358", tree != NULL);
66435+ assert("nikita-360", coord != NULL);
66436+
66437+ result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
66438+ FIND_EXACT, stop_level, stop_level,
66439+ flags | CBK_FOR_INSERT, NULL /*ra_info */ );
66440+ switch (result) {
66441+ default:
66442+ break;
66443+ case CBK_COORD_FOUND:
66444+ result = IBK_ALREADY_EXISTS;
66445+ break;
66446+ case CBK_COORD_NOTFOUND:
66447+ assert("nikita-2017", coord->node != NULL);
66448+ result = insert_by_coord(coord, data, key, lh, 0 /*flags */ );
66449+ break;
66450+ }
66451+ return result;
66452+}
66453+
66454+/* insert item by calling carry. Helper function called if short-cut
66455+ insertion failed */
66456+static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */
66457+ lock_handle * lh, /* lock handle of insertion
66458+ * node */
66459+ reiser4_item_data * data, /* parameters of new
66460+ * item */
66461+ const reiser4_key * key, /* key of new item */
66462+ carry_opcode cop, /* carry operation to perform */
66463+ cop_insert_flag flags
66464+ /* carry flags */ )
66465+{
66466+ int result;
66467+ carry_pool *pool;
66468+ carry_level *lowest_level;
66469+ carry_insert_data *cdata;
66470+ carry_op *op;
66471+
66472+ assert("umka-314", coord != NULL);
66473+
66474+ /* allocate carry_pool and 3 carry_level-s */
66475+ pool =
66476+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66477+ sizeof(*cdata));
66478+ if (IS_ERR(pool))
66479+ return PTR_ERR(pool);
66480+ lowest_level = (carry_level *) (pool + 1);
66481+ init_carry_level(lowest_level, pool);
66482+
66483+ op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
66484+ if (IS_ERR(op) || (op == NULL)) {
66485+ done_carry_pool(pool);
66486+ return RETERR(op ? PTR_ERR(op) : -EIO);
66487+ }
66488+ cdata = (carry_insert_data *) (lowest_level + 3);
66489+ cdata->coord = coord;
66490+ cdata->data = data;
66491+ cdata->key = key;
66492+ op->u.insert.d = cdata;
66493+ if (flags == 0)
66494+ flags = znode_get_tree(coord->node)->carry.insert_flags;
66495+ op->u.insert.flags = flags;
66496+ op->u.insert.type = COPT_ITEM_DATA;
66497+ op->u.insert.child = NULL;
66498+ if (lh != NULL) {
66499+ assert("nikita-3245", lh->node == coord->node);
66500+ lowest_level->track_type = CARRY_TRACK_CHANGE;
66501+ lowest_level->tracked = lh;
66502+ }
66503+
66504+ result = reiser4_carry(lowest_level, NULL);
66505+ done_carry_pool(pool);
66506+
66507+ return result;
66508+}
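For reference, the layout of the single allocation returned by init_carry_pool() above, as carved up by the pointer arithmetic in this function:

/*
 *	[ carry_pool | carry_level 0..2 | carry_insert_data ]
 *	^pool        ^(carry_level *)(pool + 1)		     ^lowest_level + 3
 *	             == lowest_level
 */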
66509+
66510+/* form carry queue to perform paste of @data with @key at @coord, and launch
66511+ its execution by calling carry().
66512+
66513+ Instruct carry to update @lh if, after balancing, the insertion coord moves
66514+ into a different block.
66515+
66516+*/
66517+static int paste_with_carry(coord_t * coord, /* coord of paste */
66518+ lock_handle * lh, /* lock handle of node
66519+ * where item is
66520+ * pasted */
66521+ reiser4_item_data * data, /* parameters of new
66522+ * item */
66523+ const reiser4_key * key, /* key of new item */
66524+ unsigned flags /* paste flags */ )
66525+{
66526+ int result;
66527+ carry_pool *pool;
66528+ carry_level *lowest_level;
66529+ carry_insert_data *cdata;
66530+ carry_op *op;
66531+
66532+ assert("umka-315", coord != NULL);
66533+ assert("umka-316", key != NULL);
66534+
66535+ pool =
66536+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66537+ sizeof(*cdata));
66538+ if (IS_ERR(pool))
66539+ return PTR_ERR(pool);
66540+ lowest_level = (carry_level *) (pool + 1);
66541+ init_carry_level(lowest_level, pool);
66542+
66543+ op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
66544+ if (IS_ERR(op) || (op == NULL)) {
66545+ done_carry_pool(pool);
66546+ return RETERR(op ? PTR_ERR(op) : -EIO);
66547+ }
66548+ cdata = (carry_insert_data *) (lowest_level + 3);
66549+ cdata->coord = coord;
66550+ cdata->data = data;
66551+ cdata->key = key;
66552+ op->u.paste.d = cdata;
66553+ if (flags == 0)
66554+ flags = znode_get_tree(coord->node)->carry.paste_flags;
66555+ op->u.paste.flags = flags;
66556+ op->u.paste.type = COPT_ITEM_DATA;
66557+ if (lh != NULL) {
66558+ lowest_level->track_type = CARRY_TRACK_CHANGE;
66559+ lowest_level->tracked = lh;
66560+ }
66561+
66562+ result = reiser4_carry(lowest_level, NULL);
66563+ done_carry_pool(pool);
66564+
66565+ return result;
66566+}
66567+
66568+/* insert item at the given coord.
66569+
66570+ First try to skip carry by directly calling ->create_item() method of node
66571+ plugin. If this is impossible (there is not enough free space in the node,
66572+   or the new item would become leftmost in the node), call insert_with_carry_by_coord()
66573+ that will do full carry().
66574+
66575+*/
66576+insert_result insert_by_coord(coord_t * coord /* coord where to
66577+ * insert. coord->node has
66578+ * to be write locked by
66579+ * caller */ ,
66580+ reiser4_item_data * data /* data to be
66581+ * inserted */ ,
66582+ const reiser4_key * key /* key of new item */ ,
66583+ lock_handle * lh /* lock handle of write
66584+ * lock on node */ ,
66585+ __u32 flags /* insertion flags */ )
66586+{
66587+ unsigned item_size;
66588+ int result;
66589+ znode *node;
66590+
66591+ assert("vs-247", coord != NULL);
66592+ assert("vs-248", data != NULL);
66593+ assert("vs-249", data->length >= 0);
66594+ assert("nikita-1191", znode_is_write_locked(coord->node));
66595+
66596+ node = coord->node;
66597+ coord_clear_iplug(coord);
66598+ result = zload(node);
66599+ if (result != 0)
66600+ return result;
66601+
66602+ item_size = space_needed(node, NULL, data, 1);
66603+ if (item_size > znode_free_space(node) &&
66604+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
66605+ && (flags & COPI_DONT_ALLOCATE)) {
66606+ /* we are forced to use free space of coord->node and new item
66607+ does not fit into it.
66608+
66609+ Currently we get here only when we allocate and copy units
66610+ of extent item from a node to its left neighbor during
66611+ "squalloc"-ing. If @node (this is left neighbor) does not
66612+ have enough free space - we do not want to attempt any
66613+		   shifting or allocation because we are squeezing and
66614+ everything to the left of @node is tightly packed.
66615+ */
66616+ result = -E_NODE_FULL;
66617+ } else if ((item_size <= znode_free_space(node)) &&
66618+ !coord_is_before_leftmost(coord) &&
66619+ (node_plugin_by_node(node)->fast_insert != NULL)
66620+ && node_plugin_by_node(node)->fast_insert(coord)) {
66621+ /* shortcut insertion without carry() overhead.
66622+
66623+ Only possible if:
66624+
66625+ - there is enough free space
66626+
66627+ - insertion is not into the leftmost position in a node
66628+ (otherwise it would require updating of delimiting key in a
66629+ parent)
66630+
66631+ - node plugin agrees with this
66632+
66633+ */
66634+ result =
66635+ node_plugin_by_node(node)->create_item(coord, key, data,
66636+ NULL);
66637+ znode_make_dirty(node);
66638+ } else {
66639+ /* otherwise do full-fledged carry(). */
66640+ result =
66641+ insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
66642+ flags);
66643+ }
66644+ zrelse(node);
66645+ return result;
66646+}
66647+
66648+/* @coord is set to leaf level and @data is to be inserted to twig level */
66649+insert_result
66650+insert_extent_by_coord(coord_t * coord /* coord where to insert.
66651+					 * coord->node has to be
66652+					 * write locked by
66653+					 * caller */ ,
66654+		       reiser4_item_data * data /* data to be inserted */ ,
66655+		       const reiser4_key * key /* key of new item */ ,
66656+		       lock_handle * lh /* lock handle of write
66657+					 * lock on node */ )
66658+{
66659+ assert("vs-405", coord != NULL);
66660+ assert("vs-406", data != NULL);
66661+ assert("vs-407", data->length > 0);
66662+ assert("vs-408", znode_is_write_locked(coord->node));
66663+ assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
66664+
66665+ return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
66666+ 0 /*flags */ );
66667+}
66668+
66669+/* Insert into the item at the given coord.
66670+
66671+ First try to skip carry by directly calling ->paste() method of item
66672+ plugin. If this is impossible (there is not enough free space in the node,
66673+ or we are pasting into leftmost position in the node), call
66674+ paste_with_carry() that will do full carry().
66675+
66676+*/
66677+/* paste_into_item */
66678+int insert_into_item(coord_t * coord /* coord of pasting */ ,
66679+ lock_handle * lh /* lock handle on node involved */ ,
66680+ const reiser4_key * key /* key of unit being pasted */ ,
66681+ reiser4_item_data * data /* parameters for new unit */ ,
66682+ unsigned flags /* insert/paste flags */ )
66683+{
66684+ int result;
66685+ int size_change;
66686+ node_plugin *nplug;
66687+ item_plugin *iplug;
66688+
66689+ assert("umka-317", coord != NULL);
66690+ assert("umka-318", key != NULL);
66691+
66692+ iplug = item_plugin_by_coord(coord);
66693+ nplug = node_plugin_by_coord(coord);
66694+
66695+ assert("nikita-1480", iplug == data->iplug);
66696+
66697+ size_change = space_needed(coord->node, coord, data, 0);
66698+ if (size_change > (int)znode_free_space(coord->node) &&
66699+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
66700+ && (flags & COPI_DONT_ALLOCATE)) {
66701+ /* we are forced to use free space of coord->node and new data
66702+ does not fit into it. */
66703+ return -E_NODE_FULL;
66704+ }
66705+
66706+ /* shortcut paste without carry() overhead.
66707+
66708+ Only possible if:
66709+
66710+ - there is enough free space
66711+
66712+ - paste is not into the leftmost unit in a node (otherwise
66713+ it would require updating of delimiting key in a parent)
66714+
66715+ - node plugin agrees with this
66716+
66717+ - item plugin agrees with us
66718+ */
66719+ if (size_change <= (int)znode_free_space(coord->node) &&
66720+ (coord->item_pos != 0 ||
66721+ coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
66722+ coord->unit_pos != 0 && nplug->fast_paste != NULL &&
66723+ nplug->fast_paste(coord) &&
66724+ iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
66725+ if (size_change > 0)
66726+ nplug->change_item_size(coord, size_change);
66727+ /* NOTE-NIKITA: huh? where @key is used? */
66728+ result = iplug->b.paste(coord, data, NULL);
66729+ if (size_change < 0)
66730+ nplug->change_item_size(coord, size_change);
66731+ znode_make_dirty(coord->node);
66732+ } else
66733+ /* otherwise do full-fledged carry(). */
66734+ result = paste_with_carry(coord, lh, data, key, flags);
66735+ return result;
66736+}
66737+
66738+/* this either appends or truncates item @coord */
66739+int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
66740+ reiser4_item_data * data /* parameters of resize */ ,
66741+ reiser4_key * key /* key of new unit */ ,
66742+ lock_handle * lh /* lock handle of node
66743+ * being modified */ ,
66744+ cop_insert_flag flags /* carry flags */ )
66745+{
66746+ int result;
66747+ znode *node;
66748+
66749+ assert("nikita-362", coord != NULL);
66750+ assert("nikita-363", data != NULL);
66751+ assert("vs-245", data->length != 0);
66752+
66753+ node = coord->node;
66754+ coord_clear_iplug(coord);
66755+ result = zload(node);
66756+ if (result != 0)
66757+ return result;
66758+
66759+ if (data->length < 0)
66760+ result = node_plugin_by_coord(coord)->shrink_item(coord,
66761+ -data->length);
66762+ else
66763+ result = insert_into_item(coord, lh, key, data, flags);
66764+
66765+ zrelse(node);
66766+ return result;
66767+}
66768+
66769+/* insert flow @f */
66770+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
66771+{
66772+ int result;
66773+ carry_pool *pool;
66774+ carry_level *lowest_level;
66775+ reiser4_item_data *data;
66776+ carry_op *op;
66777+
66778+ pool =
66779+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66780+ sizeof(*data));
66781+ if (IS_ERR(pool))
66782+ return PTR_ERR(pool);
66783+ lowest_level = (carry_level *) (pool + 1);
66784+ init_carry_level(lowest_level, pool);
66785+
66786+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
66787+ 0 /* operate directly on coord -> node */ );
66788+ if (IS_ERR(op) || (op == NULL)) {
66789+ done_carry_pool(pool);
66790+ return RETERR(op ? PTR_ERR(op) : -EIO);
66791+ }
66792+
66793+ /* these are permanent during insert_flow */
66794+ data = (reiser4_item_data *) (lowest_level + 3);
66795+ data->user = 1;
66796+ data->iplug = item_plugin_by_id(FORMATTING_ID);
66797+ data->arg = NULL;
66798+ /* data.length and data.data will be set before calling paste or
66799+ insert */
66800+ data->length = 0;
66801+ data->data = NULL;
66802+
66803+ op->u.insert_flow.flags = 0;
66804+ op->u.insert_flow.insert_point = coord;
66805+ op->u.insert_flow.flow = f;
66806+ op->u.insert_flow.data = data;
66807+ op->u.insert_flow.new_nodes = 0;
66808+
66809+ lowest_level->track_type = CARRY_TRACK_CHANGE;
66810+ lowest_level->tracked = lh;
66811+
66812+ result = reiser4_carry(lowest_level, NULL);
66813+ done_carry_pool(pool);
66814+
66815+ return result;
66816+}
66817+
66818+/* Given a coord in parent node, obtain a znode for the corresponding child */
66819+znode *child_znode(const coord_t * parent_coord /* coord of pointer to
66820+ * child */ ,
66821+ znode * parent /* parent of child */ ,
66822+ int incore_p /* if !0 only return child if already in
66823+ * memory */ ,
66824+ int setup_dkeys_p /* if !0 update delimiting keys of
66825+ * child */ )
66826+{
66827+ znode *child;
66828+
66829+ assert("nikita-1374", parent_coord != NULL);
66830+ assert("nikita-1482", parent != NULL);
66831+#if REISER4_DEBUG
66832+ if (setup_dkeys_p)
66833+ assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
66834+#endif
66835+ assert("nikita-2947", znode_is_any_locked(parent));
66836+
66837+ if (znode_get_level(parent) <= LEAF_LEVEL) {
66838+ /* trying to get child of leaf node */
66839+ warning("nikita-1217", "Child of maize?");
66840+ return ERR_PTR(RETERR(-EIO));
66841+ }
66842+ if (item_is_internal(parent_coord)) {
66843+ reiser4_block_nr addr;
66844+ item_plugin *iplug;
66845+ reiser4_tree *tree;
66846+
66847+ iplug = item_plugin_by_coord(parent_coord);
66848+ assert("vs-512", iplug->s.internal.down_link);
66849+ iplug->s.internal.down_link(parent_coord, NULL, &addr);
66850+
66851+ tree = znode_get_tree(parent);
66852+ if (incore_p)
66853+ child = zlook(tree, &addr);
66854+ else
66855+ child =
66856+ zget(tree, &addr, parent,
66857+ znode_get_level(parent) - 1,
66858+ reiser4_ctx_gfp_mask_get());
66859+ if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
66860+ set_child_delimiting_keys(parent, parent_coord, child);
66861+ } else {
66862+ warning("nikita-1483", "Internal item expected");
66863+ child = ERR_PTR(RETERR(-EIO));
66864+ }
66865+ return child;
66866+}
66867+
66868+/* remove znode from transaction */
66869+static void uncapture_znode(znode * node)
66870+{
66871+ struct page *page;
66872+
66873+ assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
66874+
66875+ if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
66876+ int ret;
66877+
66878+ /* An already allocated block goes right to the atom's delete set. */
66879+ ret =
66880+ reiser4_dealloc_block(znode_get_block(node), 0,
66881+ BA_DEFER | BA_FORMATTED);
66882+ if (ret)
66883+ warning("zam-942",
66884+				"can't add block (%llu) to atom's delete set\n",
66885+ (unsigned long long)(*znode_get_block(node)));
66886+
66887+ spin_lock_znode(node);
66888+		/* Here we return the flush reserved block which was reserved at
66889+		 * the moment this allocated node was marked dirty, and has not
66890+		 * yet been used by flush in the node relocation procedure. */
66891+ if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
66892+ txn_atom *atom;
66893+
66894+ atom = jnode_get_atom(ZJNODE(node));
66895+ assert("zam-939", atom != NULL);
66896+ spin_unlock_znode(node);
66897+ flush_reserved2grabbed(atom, (__u64) 1);
66898+ spin_unlock_atom(atom);
66899+ } else
66900+ spin_unlock_znode(node);
66901+ } else {
66902+ /* znode has assigned block which is counted as "fake
66903+		   allocated". Return it back to "free blocks". */
66904+ fake_allocated2free((__u64) 1, BA_FORMATTED);
66905+ }
66906+
66907+ /*
66908+ * uncapture page from transaction. There is a possibility of a race
66909+ * with ->releasepage(): reiser4_releasepage() detaches page from this
66910+ * jnode and we have nothing to uncapture. To avoid this, get
66911+ * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
66912+ * will deal with released page itself.
66913+ */
66914+ spin_lock_znode(node);
66915+ page = znode_page(node);
66916+ if (likely(page != NULL)) {
66917+ /*
66918+ * reiser4_uncapture_page() can only be called when we are sure
66919+ * that znode is pinned in memory, which we are, because
66920+ * forget_znode() is only called from longterm_unlock_znode().
66921+ */
66922+ page_cache_get(page);
66923+ spin_unlock_znode(node);
66924+ lock_page(page);
66925+ reiser4_uncapture_page(page);
66926+ unlock_page(page);
66927+ page_cache_release(page);
66928+ } else {
66929+ txn_atom *atom;
66930+
66931+ /* handle "flush queued" znodes */
66932+ while (1) {
66933+ atom = jnode_get_atom(ZJNODE(node));
66934+ assert("zam-943", atom != NULL);
66935+
66936+ if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
66937+ || !atom->nr_running_queues)
66938+ break;
66939+
66940+ spin_unlock_znode(node);
66941+ reiser4_atom_wait_event(atom);
66942+ spin_lock_znode(node);
66943+ }
66944+
66945+ reiser4_uncapture_block(ZJNODE(node));
66946+ spin_unlock_atom(atom);
66947+ zput(node);
66948+ }
66949+}
66950+
66951+/* This is called from longterm_unlock_znode() when last lock is released from
66952+ the node that has been removed from the tree. At this point node is removed
66953+ from sibling list and its lock is invalidated. */
66954+void forget_znode(lock_handle * handle)
66955+{
66956+ znode *node;
66957+ reiser4_tree *tree;
66958+
66959+ assert("umka-319", handle != NULL);
66960+
66961+ node = handle->node;
66962+ tree = znode_get_tree(node);
66963+
66964+ assert("vs-164", znode_is_write_locked(node));
66965+ assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
66966+ assert_rw_locked(&(node->lock.guard));
66967+
66968+ /* We assume that this node was detached from its parent before
66969+	 * unlocking; this leaves no way to reach this node from the parent through a
66970+ * down link. The node should have no children and, thereby, can't be
66971+ * reached from them by their parent pointers. The only way to obtain a
66972+ * reference to the node is to use sibling pointers from its left and
66973+ * right neighbors. In the next several lines we remove the node from
66974+ * the sibling list. */
66975+
66976+ write_lock_tree(tree);
66977+ sibling_list_remove(node);
66978+ znode_remove(node, tree);
66979+ write_unlock_tree(tree);
66980+
66981+ /* Here we set JNODE_DYING and cancel all pending lock requests. It
66982+ * forces all lock requestor threads to repeat iterations of getting
66983+ * lock on a child, neighbor or parent node. But, those threads can't
66984+ * come to this node again, because this node is no longer a child,
66985+ * neighbor or parent of any other node. This order of znode
66986+	 * invalidation does not allow other threads to waste cpu time in a busy
66987+	 * loop trying to lock a dying object. The exception is in the flush
66988+ * code when we take node directly from atom's capture list.*/
66989+ reiser4_invalidate_lock(handle);
66990+ uncapture_znode(node);
66991+}
66992+
66993+/* Check that internal item at @pointer really contains pointer to @child. */
66994+int check_tree_pointer(const coord_t * pointer /* would-be pointer to
66995+ * @child */ ,
66996+ const znode * child /* child znode */ )
66997+{
66998+ assert("nikita-1016", pointer != NULL);
66999+ assert("nikita-1017", child != NULL);
67000+ assert("nikita-1018", pointer->node != NULL);
67001+
67002+ assert("nikita-1325", znode_is_any_locked(pointer->node));
67003+
67004+ assert("nikita-2985",
67005+ znode_get_level(pointer->node) == znode_get_level(child) + 1);
67006+
67007+ coord_clear_iplug((coord_t *) pointer);
67008+
67009+ if (coord_is_existing_unit(pointer)) {
67010+ item_plugin *iplug;
67011+ reiser4_block_nr addr;
67012+
67013+ if (item_is_internal(pointer)) {
67014+ iplug = item_plugin_by_coord(pointer);
67015+ assert("vs-513", iplug->s.internal.down_link);
67016+ iplug->s.internal.down_link(pointer, NULL, &addr);
67017+ /* check that cached value is correct */
67018+ if (disk_addr_eq(&addr, znode_get_block(child))) {
67019+ return NS_FOUND;
67020+ }
67021+ }
67022+ }
67023+ /* warning ("jmacd-1002", "tree pointer incorrect"); */
67024+ return NS_NOT_FOUND;
67025+}
67026+
67027+/* find coord of pointer to new @child in @parent.
67028+
67029+   Find the &coord_t in the @parent where the pointer to a given @child
67030+   will be.
67031+
67032+*/
67033+int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
67034+ znode *
67035+ child UNUSED_ARG /* child znode, passed locked */ ,
67036+ znode * left /* left brother of new node */ ,
67037+ coord_t * result /* where result is stored in */ )
67038+{
67039+ int ret;
67040+
67041+ assert("nikita-1486", parent != NULL);
67042+ assert("nikita-1487", child != NULL);
67043+ assert("nikita-1488", result != NULL);
67044+
67045+ ret = find_child_ptr(parent, left, result);
67046+ if (ret != NS_FOUND) {
67047+ warning("nikita-1489", "Cannot find brother position: %i", ret);
67048+ return RETERR(-EIO);
67049+ } else {
67050+ result->between = AFTER_UNIT;
67051+ return RETERR(NS_NOT_FOUND);
67052+ }
67053+}
67054+
67055+/* find coord of pointer to @child in @parent.
67056+
67057+   Find the &coord_t in the @parent where the pointer to a given @child is.
67058+
67059+*/
67060+int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
67061+ znode * child /* child znode, passed locked */ ,
67062+ coord_t * result /* where result is stored in */ )
67063+{
67064+ int lookup_res;
67065+ node_plugin *nplug;
67066+ /* left delimiting key of a child */
67067+ reiser4_key ld;
67068+ reiser4_tree *tree;
67069+
67070+ assert("nikita-934", parent != NULL);
67071+ assert("nikita-935", child != NULL);
67072+ assert("nikita-936", result != NULL);
67073+ assert("zam-356", znode_is_loaded(parent));
67074+
67075+ coord_init_zero(result);
67076+ result->node = parent;
67077+
67078+ nplug = parent->nplug;
67079+ assert("nikita-939", nplug != NULL);
67080+
67081+ tree = znode_get_tree(parent);
67082+ /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
67083+ * not aliased to ->in_parent of some znode. Otherwise,
67084+ * parent_coord_to_coord() below would modify data protected by tree
67085+ * lock. */
67086+ read_lock_tree(tree);
67087+ /* fast path. Try to use cached value. Lock tree to keep
67088+ node->pos_in_parent and pos->*_blocknr consistent. */
67089+ if (child->in_parent.item_pos + 1 != 0) {
67090+ parent_coord_to_coord(&child->in_parent, result);
67091+ if (check_tree_pointer(result, child) == NS_FOUND) {
67092+ read_unlock_tree(tree);
67093+ return NS_FOUND;
67094+ }
67095+
67096+ child->in_parent.item_pos = (unsigned short)~0;
67097+ }
67098+ read_unlock_tree(tree);
67099+
67100+	/* if the above failed, find some key from @child. We are looking for
67101+	   the least key in the child. */
67102+ read_lock_dk(tree);
67103+ ld = *znode_get_ld_key(child);
67104+ read_unlock_dk(tree);
67105+ /*
67106+ * now, lookup parent with key just found. Note, that left delimiting
67107+ * key doesn't identify node uniquely, because (in extremely rare
67108+ * case) two nodes can have equal left delimiting keys, if one of them
67109+ * is completely filled with directory entries that all happened to be
67110+	 * hash collisions. But, we check the block number in check_tree_pointer()
67111+ * and, so, are safe.
67112+ */
67113+ lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
67114+ /* update cached pos_in_node */
67115+ if (lookup_res == NS_FOUND) {
67116+ write_lock_tree(tree);
67117+ coord_to_parent_coord(result, &child->in_parent);
67118+ write_unlock_tree(tree);
67119+ lookup_res = check_tree_pointer(result, child);
67120+ }
67121+ if (lookup_res == NS_NOT_FOUND)
67122+ lookup_res = find_child_by_addr(parent, child, result);
67123+ return lookup_res;
67124+}
67125+
67126+/* find coord of pointer to @child in @parent by scanning
67127+
67128+   Find the &coord_t in the @parent where the pointer to a given @child
67129+   is, by scanning all internal items in @parent and comparing the block
67130+   numbers in them with that of @child.
67131+
67132+*/
67133+static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
67134+ znode * child /* child znode, passed locked */ ,
67135+ coord_t * result /* where result is stored in */ )
67136+{
67137+ int ret;
67138+
67139+ assert("nikita-1320", parent != NULL);
67140+ assert("nikita-1321", child != NULL);
67141+ assert("nikita-1322", result != NULL);
67142+
67143+ ret = NS_NOT_FOUND;
67144+
67145+ for_all_units(result, parent) {
67146+ if (check_tree_pointer(result, child) == NS_FOUND) {
67147+ write_lock_tree(znode_get_tree(parent));
67148+ coord_to_parent_coord(result, &child->in_parent);
67149+ write_unlock_tree(znode_get_tree(parent));
67150+ ret = NS_FOUND;
67151+ break;
67152+ }
67153+ }
67154+ return ret;
67155+}
67156+
67157+/* true, if @addr is an "unallocated block number", which is just an address
67158+   with the highest bit set. */
67159+int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to
67160+ * check */ )
67161+{
67162+ assert("nikita-1766", addr != NULL);
67163+ cassert(sizeof(reiser4_block_nr) == 8);
67164+ return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
67165+ REISER4_UNALLOCATED_STATUS_VALUE;
67166+}
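+
+/* A worked example, assuming REISER4_BLOCKNR_STATUS_BIT_MASK selects the
+   most significant bit of the 64-bit block number, as described above:
+
+	0x8000000000000001ULL  - high bit set, treated as unallocated
+	0x0000000000001234ULL  - high bit clear, a real disk address
+*/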
67167+
67168+/* returns true if removing bytes in the key range [from_key, to_key]
67169+   causes removal of the whole item @from */
67170+static int
67171+item_removed_completely(coord_t * from, const reiser4_key * from_key,
67172+ const reiser4_key * to_key)
67173+{
67174+ item_plugin *iplug;
67175+ reiser4_key key_in_item;
67176+
67177+ assert("umka-325", from != NULL);
67178+ assert("", item_is_extent(from));
67179+
67180+	/* check first key just in case */
67181+ item_key_by_coord(from, &key_in_item);
67182+ if (keygt(from_key, &key_in_item))
67183+ return 0;
67184+
67185+ /* check last key */
67186+ iplug = item_plugin_by_coord(from);
67187+ assert("vs-611", iplug && iplug->s.file.append_key);
67188+
67189+ iplug->s.file.append_key(from, &key_in_item);
67190+ set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
67191+
67192+ if (keylt(to_key, &key_in_item))
67193+ /* last byte is not removed */
67194+ return 0;
67195+ return 1;
67196+}
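+
+/* A hypothetical example: if the extent item at @from covers file offsets
+   [0, 4096), append_key yields offset 4096, so the last byte has offset
+   4095. With @from_key at offset 0 and @to_key at offset 4095 the whole
+   item is removed and this returns 1; with @to_key at offset 2047 a tail
+   of the item survives and this returns 0. */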
67197+
67198+/* helper function for prepare_twig_kill(): @left and @right are formatted
67199+ * neighbors of extent item being completely removed. Load and lock neighbors
67200+ * and store lock handles into @kdata for later use by kill_hook_extent() */
67201+static int
67202+prepare_children(znode * left, znode * right, carry_kill_data * kdata)
67203+{
67204+ int result;
67205+ int left_loaded;
67206+ int right_loaded;
67207+
67208+ result = 0;
67209+ left_loaded = right_loaded = 0;
67210+
67211+ if (left != NULL) {
67212+ result = zload(left);
67213+ if (result == 0) {
67214+ left_loaded = 1;
67215+ result = longterm_lock_znode(kdata->left, left,
67216+ ZNODE_READ_LOCK,
67217+ ZNODE_LOCK_LOPRI);
67218+ }
67219+ }
67220+ if (result == 0 && right != NULL) {
67221+ result = zload(right);
67222+ if (result == 0) {
67223+ right_loaded = 1;
67224+ result = longterm_lock_znode(kdata->right, right,
67225+ ZNODE_READ_LOCK,
67226+ ZNODE_LOCK_HIPRI |
67227+ ZNODE_LOCK_NONBLOCK);
67228+ }
67229+ }
67230+ if (result != 0) {
67231+ done_lh(kdata->left);
67232+ done_lh(kdata->right);
67233+ if (left_loaded != 0)
67234+ zrelse(left);
67235+ if (right_loaded != 0)
67236+ zrelse(right);
67237+ }
67238+ return result;
67239+}
67240+
67241+static void done_children(carry_kill_data * kdata)
67242+{
67243+ if (kdata->left != NULL && kdata->left->node != NULL) {
67244+ zrelse(kdata->left->node);
67245+ done_lh(kdata->left);
67246+ }
67247+ if (kdata->right != NULL && kdata->right->node != NULL) {
67248+ zrelse(kdata->right->node);
67249+ done_lh(kdata->right);
67250+ }
67251+}
67252+
67253+/* part of cut_node. It is called when cut_node is called to remove or cut part
67254+   of an extent item. When the head of that item is removed, we have to update
67255+   the right delimiting key of the left neighbor of the extent. When the item is
67256+   removed completely, we have to set the sibling link between the left and
67257+   right neighbors of the removed extent. This may return -E_DEADLOCK because of
67258+   trying to get the left neighbor locked, so the caller should repeat the
67259+   attempt. */
67260+/* Audited by: umka (2002.06.16) */
67261+static int
67262+prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
67263+{
67264+ int result;
67265+ reiser4_key key;
67266+ lock_handle left_lh;
67267+ lock_handle right_lh;
67268+ coord_t left_coord;
67269+ coord_t *from;
67270+ znode *left_child;
67271+ znode *right_child;
67272+ reiser4_tree *tree;
67273+ int left_zloaded_here, right_zloaded_here;
67274+
67275+ from = kdata->params.from;
67276+ assert("umka-326", from != NULL);
67277+ assert("umka-327", kdata->params.to != NULL);
67278+
67279+ /* for one extent item only yet */
67280+ assert("vs-591", item_is_extent(from));
67281+ assert("vs-592", from->item_pos == kdata->params.to->item_pos);
67282+
67283+ if ((kdata->params.from_key
67284+ && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
67285+ || from->unit_pos != 0) {
67286+ /* head of item @from is not removed, there is nothing to
67287+ worry about */
67288+ return 0;
67289+ }
67290+
67291+ result = 0;
67292+ left_zloaded_here = 0;
67293+ right_zloaded_here = 0;
67294+
67295+ left_child = right_child = NULL;
67296+
67297+ coord_dup(&left_coord, from);
67298+ init_lh(&left_lh);
67299+ init_lh(&right_lh);
67300+ if (coord_prev_unit(&left_coord)) {
67301+ /* @from is leftmost item in its node */
67302+ if (!locked_left_neighbor) {
67303+ result =
67304+ reiser4_get_left_neighbor(&left_lh, from->node,
67305+ ZNODE_READ_LOCK,
67306+ GN_CAN_USE_UPPER_LEVELS);
67307+ switch (result) {
67308+ case 0:
67309+ break;
67310+ case -E_NO_NEIGHBOR:
67311+ /* there is no formatted node to the left of
67312+ from->node */
67313+ warning("vs-605",
67314+ "extent item has smallest key in "
67315+ "the tree and it is about to be removed");
67316+ return 0;
67317+ case -E_DEADLOCK:
67318+ /* need to restart */
67319+ default:
67320+ return result;
67321+ }
67322+
67323+ /* we have acquired left neighbor of from->node */
67324+ result = zload(left_lh.node);
67325+ if (result)
67326+ goto done;
67327+
67328+ locked_left_neighbor = left_lh.node;
67329+ } else {
67330+ /* squalloc_right_twig_cut should have supplied locked
67331+ * left neighbor */
67332+ assert("vs-834",
67333+ znode_is_write_locked(locked_left_neighbor));
67334+ result = zload(locked_left_neighbor);
67335+ if (result)
67336+ return result;
67337+ }
67338+
67339+ left_zloaded_here = 1;
67340+ coord_init_last_unit(&left_coord, locked_left_neighbor);
67341+ }
67342+
67343+ if (!item_is_internal(&left_coord)) {
67344+ /* what else but extent can be on twig level */
67345+ assert("vs-606", item_is_extent(&left_coord));
67346+
67347+ /* there is no left formatted child */
67348+ if (left_zloaded_here)
67349+ zrelse(locked_left_neighbor);
67350+ done_lh(&left_lh);
67351+ return 0;
67352+ }
67353+
67354+ tree = znode_get_tree(left_coord.node);
67355+ left_child = child_znode(&left_coord, left_coord.node, 1, 0);
67356+
67357+ if (IS_ERR(left_child)) {
67358+ result = PTR_ERR(left_child);
67359+ goto done;
67360+ }
67361+
67362+ /* left child is acquired, calculate new right delimiting key for it
67363+ and get right child if it is necessary */
67364+ if (item_removed_completely
67365+ (from, kdata->params.from_key, kdata->params.to_key)) {
67366+ /* try to get right child of removed item */
67367+ coord_t right_coord;
67368+
67369+ assert("vs-607",
67370+ kdata->params.to->unit_pos ==
67371+ coord_last_unit_pos(kdata->params.to));
67372+ coord_dup(&right_coord, kdata->params.to);
67373+ if (coord_next_unit(&right_coord)) {
67374+ /* @to is rightmost unit in the node */
67375+ result =
67376+ reiser4_get_right_neighbor(&right_lh, from->node,
67377+ ZNODE_READ_LOCK,
67378+ GN_CAN_USE_UPPER_LEVELS);
67379+ switch (result) {
67380+ case 0:
67381+ result = zload(right_lh.node);
67382+ if (result)
67383+ goto done;
67384+
67385+ right_zloaded_here = 1;
67386+ coord_init_first_unit(&right_coord,
67387+ right_lh.node);
67388+ item_key_by_coord(&right_coord, &key);
67389+ break;
67390+
67391+ case -E_NO_NEIGHBOR:
67392+ /* there is no formatted node to the right of
67393+ from->node */
67394+ read_lock_dk(tree);
67395+ key = *znode_get_rd_key(from->node);
67396+ read_unlock_dk(tree);
67397+ right_coord.node = NULL;
67398+ result = 0;
67399+ break;
67400+ default:
67401+ /* real error */
67402+ goto done;
67403+ }
67404+ } else {
67405+ /* there is an item to the right of @from - take its key */
67406+ item_key_by_coord(&right_coord, &key);
67407+ }
67408+
67409+ /* try to get right child of @from */
67410+ if (right_coord.node && /* there is right neighbor of @from */
67411+ item_is_internal(&right_coord)) { /* it is internal item */
67412+ right_child = child_znode(&right_coord,
67413+ right_coord.node, 1, 0);
67414+
67415+ if (IS_ERR(right_child)) {
67416+ result = PTR_ERR(right_child);
67417+ goto done;
67418+ }
67419+
67420+ }
67421+ /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
67422+ update of right delimiting key of left_child */
67423+ result = prepare_children(left_child, right_child, kdata);
67424+ } else {
67425+		/* head of item @to is removed. left_child has to get a right delimiting key update. Prepare it for that */
67426+ result = prepare_children(left_child, NULL, kdata);
67427+ }
67428+
67429+ done:
67430+ if (right_child)
67431+ zput(right_child);
67432+ if (right_zloaded_here)
67433+ zrelse(right_lh.node);
67434+ done_lh(&right_lh);
67435+
67436+ if (left_child)
67437+ zput(left_child);
67438+ if (left_zloaded_here)
67439+ zrelse(locked_left_neighbor);
67440+ done_lh(&left_lh);
67441+ return result;
67442+}
67443+
67444+/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
67445+ are to be cut completely */
67446+/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
67447+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */
67448+ const reiser4_key * to_key, /* last key to be removed */
67449+ reiser4_key *
67450+ smallest_removed /* smallest key actually removed */ )
67451+{
67452+ int result;
67453+ carry_pool *pool;
67454+ carry_level *lowest_level;
67455+ carry_cut_data *cut_data;
67456+ carry_op *op;
67457+
67458+ assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
67459+
67460+ pool =
67461+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67462+ sizeof(*cut_data));
67463+ if (IS_ERR(pool))
67464+ return PTR_ERR(pool);
67465+ lowest_level = (carry_level *) (pool + 1);
67466+ init_carry_level(lowest_level, pool);
67467+
67468+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
67469+ assert("vs-1509", op != 0);
67470+ if (IS_ERR(op)) {
67471+ done_carry_pool(pool);
67472+ return PTR_ERR(op);
67473+ }
67474+
67475+ cut_data = (carry_cut_data *) (lowest_level + 3);
67476+ cut_data->params.from = from;
67477+ cut_data->params.to = to;
67478+ cut_data->params.from_key = from_key;
67479+ cut_data->params.to_key = to_key;
67480+ cut_data->params.smallest_removed = smallest_removed;
67481+
67482+ op->u.cut_or_kill.is_cut = 1;
67483+ op->u.cut_or_kill.u.cut = cut_data;
67484+
67485+ result = reiser4_carry(lowest_level, NULL);
67486+ done_carry_pool(pool);
67487+
67488+ return result;
67489+}
67490+
67491+/* cut part of the node
67492+
67493+ Cut part or whole content of node.
67494+
67495+ cut data between @from and @to of @from->node and call carry() to make
67496+ corresponding changes in the tree. @from->node may become empty. If so -
67497+ pointer to it will be removed. Neighboring nodes are not changed. Smallest
67498+ removed key is stored in @smallest_removed
67499+
67500+*/
67501+int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */
67502+ coord_t * to, /* coord of the last unit/item that will be eliminated */
67503+ const reiser4_key * from_key, /* first key to be removed */
67504+ const reiser4_key * to_key, /* last key to be removed */
67505+ reiser4_key * smallest_removed, /* smallest key actually removed */
67506+ znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor
67507+ * locked (in squalloc_right_twig_cut, namely) */
67508+ struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to
67509+ invalidate pages together with item pointing to them */
67510+		      int truncate /* true if this call is made for file truncate */ )
67511+{
67512+ int result;
67513+ carry_pool *pool;
67514+ carry_level *lowest_level;
67515+ carry_kill_data *kdata;
67516+ lock_handle *left_child;
67517+ lock_handle *right_child;
67518+ carry_op *op;
67519+
67520+ assert("umka-328", from != NULL);
67521+ assert("vs-316", !node_is_empty(from->node));
67522+ assert("nikita-1812", coord_is_existing_unit(from)
67523+ && coord_is_existing_unit(to));
67524+
67525+ /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
67526+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67527+ sizeof(carry_kill_data) +
67528+ 2 * sizeof(lock_handle) +
67529+ 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
67530+ if (IS_ERR(pool))
67531+ return PTR_ERR(pool);
67532+
67533+ lowest_level = (carry_level *) (pool + 1);
67534+ init_carry_level(lowest_level, pool);
67535+
67536+ kdata = (carry_kill_data *) (lowest_level + 3);
67537+ left_child = (lock_handle *) (kdata + 1);
67538+ right_child = left_child + 1;
67539+
67540+ init_lh(left_child);
67541+ init_lh(right_child);
67542+
67543+ kdata->params.from = from;
67544+ kdata->params.to = to;
67545+ kdata->params.from_key = from_key;
67546+ kdata->params.to_key = to_key;
67547+ kdata->params.smallest_removed = smallest_removed;
67548+ kdata->params.truncate = truncate;
67549+ kdata->flags = 0;
67550+ kdata->inode = inode;
67551+ kdata->left = left_child;
67552+ kdata->right = right_child;
67553+ /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
67554+ kdata->buf = (char *)(right_child + 1);
67555+
67556+ if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
67557+ /* left child of extent item may have to get updated right
67558+ delimiting key and to get linked with right child of extent
67559+ @from if it will be removed completely */
67560+ result = prepare_twig_kill(kdata, locked_left_neighbor);
67561+ if (result) {
67562+ done_children(kdata);
67563+ done_carry_pool(pool);
67564+ return result;
67565+ }
67566+ }
67567+
67568+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
67569+ if (IS_ERR(op) || (op == NULL)) {
67570+ done_children(kdata);
67571+ done_carry_pool(pool);
67572+ return RETERR(op ? PTR_ERR(op) : -EIO);
67573+ }
67574+
67575+ op->u.cut_or_kill.is_cut = 0;
67576+ op->u.cut_or_kill.u.kill = kdata;
67577+
67578+ result = reiser4_carry(lowest_level, NULL);
67579+
67580+ done_children(kdata);
67581+ done_carry_pool(pool);
67582+ return result;
67583+}
67584+
67585+void
67586+fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
67587+{
67588+ if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
67589+ pgoff_t start_pg, end_pg;
67590+
67591+ start_pg = start >> PAGE_CACHE_SHIFT;
67592+ end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
67593+
67594+ if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
67595+ /*
67596+ * kill up to the page boundary.
67597+ */
67598+ assert("vs-123456", start_pg == end_pg);
67599+ reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
67600+ truncate);
67601+ } else if (start_pg != end_pg) {
67602+ /*
67603+ * page boundary is within killed portion of node.
67604+ */
67605+ assert("vs-654321", end_pg - start_pg == 1);
67606+ reiser4_invalidate_pages(inode->i_mapping, end_pg,
67607+ end_pg - start_pg, 1);
67608+ }
67609+ }
67610+ inode_sub_bytes(inode, end - start);
67611+}
67612+
67613+/**
67614+ * Delete whole @node from the reiser4 tree without loading it.
67615+ *
67616+ * @left: locked left neighbor,
67617+ * @node: node to be deleted,
67618+ * @smallest_removed: leftmost key of deleted node,
67619+ * @object: inode pointer, if we truncate a file body.
67620+ * @truncate: true if called for file truncate.
67621+ *
67622+ * @return: 0 if success, error code otherwise.
67623+ *
67624+ * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
67625+ * contains the right value of the smallest removed key from the previous
67626+ * cut_worker() iteration. This is needed for proper accounting of
67627+ * "i_blocks" and "i_bytes" fields of the @object.
67628+ */
67629+int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
67630+ struct inode *object, int truncate)
67631+{
67632+ lock_handle parent_lock;
67633+ coord_t cut_from;
67634+ coord_t cut_to;
67635+ reiser4_tree *tree;
67636+ int ret;
67637+
67638+ assert("zam-937", node != NULL);
67639+ assert("zam-933", znode_is_write_locked(node));
67640+ assert("zam-999", smallest_removed != NULL);
67641+
67642+ init_lh(&parent_lock);
67643+
67644+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
67645+ if (ret)
67646+ return ret;
67647+
67648+ assert("zam-934", !znode_above_root(parent_lock.node));
67649+
67650+ ret = zload(parent_lock.node);
67651+ if (ret)
67652+ goto failed_nozrelse;
67653+
67654+ ret = find_child_ptr(parent_lock.node, node, &cut_from);
67655+ if (ret)
67656+ goto failed;
67657+
67658+ /* decrement child counter and set parent pointer to NULL before
67659+	   deleting the pointer from the parent node because of checks in
67660+ internal_kill_item_hook (we can delete the last item from the parent
67661+ node, the parent node is going to be deleted and its c_count should
67662+ be zero). */
67663+
67664+ tree = znode_get_tree(node);
67665+ write_lock_tree(tree);
67666+ init_parent_coord(&node->in_parent, NULL);
67667+ --parent_lock.node->c_count;
67668+ write_unlock_tree(tree);
67669+
67670+ assert("zam-989", item_is_internal(&cut_from));
67671+
67672+ /* @node should be deleted after unlocking. */
67673+ ZF_SET(node, JNODE_HEARD_BANSHEE);
67674+
67675+ /* remove a pointer from the parent node to the node being deleted. */
67676+ coord_dup(&cut_to, &cut_from);
67677+ /* FIXME: shouldn't this be kill_node_content */
67678+ ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
67679+ if (ret)
67680+ /* FIXME(Zam): Should we re-connect the node to its parent if
67681+ * cut_node fails? */
67682+ goto failed;
67683+
67684+ {
67685+ reiser4_tree *tree = current_tree;
67686+ __u64 start_offset = 0, end_offset = 0;
67687+
67688+ read_lock_tree(tree);
67689+ write_lock_dk(tree);
67690+ if (object) {
67691+			/* We use @smallest_removed and the left delimiting key
67692+			 * of the current node for the @object->i_blocks, i_bytes
67693+ * calculation. We assume that the items after the
67694+ * *@smallest_removed key have been deleted from the
67695+ * file body. */
67696+ start_offset = get_key_offset(znode_get_ld_key(node));
67697+ end_offset = get_key_offset(smallest_removed);
67698+ }
67699+
67700+ assert("zam-1021", znode_is_connected(node));
67701+ if (node->left)
67702+ znode_set_rd_key(node->left, znode_get_rd_key(node));
67703+
67704+ *smallest_removed = *znode_get_ld_key(node);
67705+
67706+ write_unlock_dk(tree);
67707+ read_unlock_tree(tree);
67708+
67709+ if (object) {
67710+			/* actions to be performed on items at their removal from the tree normally live in a
67711+			   special item method - kill_hook. Here, for optimization reasons, we avoid reading the
67712+			   node containing the item we remove and so cannot call the item's kill hook. Instead we
67713+			   call a function which does exactly what the tail kill hook does, on the assumption that
67714+			   the node we avoid reading contains only one item and that item is a tail one. */
67715+ fake_kill_hook_tail(object, start_offset, end_offset,
67716+ truncate);
67717+ }
67718+ }
67719+ failed:
67720+ zrelse(parent_lock.node);
67721+ failed_nozrelse:
67722+ done_lh(&parent_lock);
67723+
67724+ return ret;
67725+}
67726+
67727+static int can_delete(const reiser4_key *key, znode *node)
67728+{
67729+ int result;
67730+
67731+ read_lock_dk(current_tree);
67732+ result = keyle(key, znode_get_ld_key(node));
67733+ read_unlock_dk(current_tree);
67734+ return result;
67735+}
67736+
67737+/**
67738+ * This subroutine is not optimal, but its implementation seems to
67739+ * be simpler.
67740+ *
67741+ * @tap: the point deletion process begins from,
67742+ * @from_key: the beginning of the deleted key range,
67743+ * @to_key: the end of the deleted key range,
67744+ * @smallest_removed: the smallest removed key,
67745+ * @truncate: true if called for file truncate.
67746+ * @progress: returns true if progress was made in deleting file items;
67747+ *            the @smallest_removed value is valid in that case.
67748+ *
67749+ * @return: 0 on success, error code otherwise; -E_REPEAT means that a long
67750+ * reiser4_cut_tree operation was interrupted to allow an atom commit.
67751+ */
67752+int
67753+cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
67754+ const reiser4_key * to_key,
67755+ reiser4_key * smallest_removed, struct inode *object,
67756+ int truncate, int *progress)
67757+{
67758+ lock_handle next_node_lock;
67759+ coord_t left_coord;
67760+ int result;
67761+
67762+ assert("zam-931", tap->coord->node != NULL);
67763+ assert("zam-932", znode_is_write_locked(tap->coord->node));
67764+
67765+ *progress = 0;
67766+ init_lh(&next_node_lock);
67767+
67768+ while (1) {
67769+ znode *node; /* node from which items are cut */
67770+ node_plugin *nplug; /* node plugin for @node */
67771+
67772+ node = tap->coord->node;
67773+
67774+ /* Move next_node_lock to the next node on the left. */
67775+ result =
67776+ reiser4_get_left_neighbor(&next_node_lock, node,
67777+ ZNODE_WRITE_LOCK,
67778+ GN_CAN_USE_UPPER_LEVELS);
67779+ if (result != 0 && result != -E_NO_NEIGHBOR)
67780+ break;
67781+		/* Check whether we can delete the node as a whole. */
67782+ if (*progress && znode_get_level(node) == LEAF_LEVEL &&
67783+ can_delete(from_key, node)) {
67784+ result = reiser4_delete_node(node, smallest_removed,
67785+ object, truncate);
67786+ } else {
67787+ result = reiser4_tap_load(tap);
67788+ if (result)
67789+ return result;
67790+
67791+ /* Prepare the second (right) point for cut_node() */
67792+ if (*progress)
67793+ coord_init_last_unit(tap->coord, node);
67794+
67795+ else if (item_plugin_by_coord(tap->coord)->b.lookup ==
67796+ NULL)
67797+ /* set rightmost unit for the items without lookup method */
67798+ tap->coord->unit_pos =
67799+ coord_last_unit_pos(tap->coord);
67800+
67801+ nplug = node->nplug;
67802+
67803+ assert("vs-686", nplug);
67804+ assert("vs-687", nplug->lookup);
67805+
67806+ /* left_coord is leftmost unit cut from @node */
67807+ result = nplug->lookup(node, from_key,
67808+ FIND_MAX_NOT_MORE_THAN,
67809+ &left_coord);
67810+
67811+ if (IS_CBKERR(result))
67812+ break;
67813+
67814+ /* adjust coordinates so that they are set to existing units */
67815+ if (coord_set_to_right(&left_coord)
67816+ || coord_set_to_left(tap->coord)) {
67817+ result = 0;
67818+ break;
67819+ }
67820+
67821+ if (coord_compare(&left_coord, tap->coord) ==
67822+ COORD_CMP_ON_RIGHT) {
67823+ /* keys from @from_key to @to_key are not in the tree */
67824+ result = 0;
67825+ break;
67826+ }
67827+
67828+ if (left_coord.item_pos != tap->coord->item_pos) {
67829+				/* do not allow cutting more than one item. This was added to solve the problem of
67830+				   truncating partially converted files. If a file is partially converted, there may
67831+				   exist a twig node containing both internal items pointing to leaf nodes with
67832+				   formatting items and an extent item. We do not want to kill internal items in the
67833+				   twig node here, because cut_tree_worker assumes killing them from the leaf level */
67834+ coord_dup(&left_coord, tap->coord);
67835+ assert("vs-1652",
67836+ coord_is_existing_unit(&left_coord));
67837+ left_coord.unit_pos = 0;
67838+ }
67839+
67840+ /* cut data from one node */
67841+ // *smallest_removed = *reiser4_min_key();
67842+ result =
67843+ kill_node_content(&left_coord, tap->coord, from_key,
67844+ to_key, smallest_removed,
67845+ next_node_lock.node, object,
67846+ truncate);
67847+ reiser4_tap_relse(tap);
67848+ }
67849+ if (result)
67850+ break;
67851+
67852+ ++(*progress);
67853+
67854+ /* Check whether all items with keys >= from_key were removed
67855+ * from the tree. */
67856+ if (keyle(smallest_removed, from_key))
67857+ /* result = 0; */
67858+ break;
67859+
67860+ if (next_node_lock.node == NULL)
67861+ break;
67862+
67863+ result = reiser4_tap_move(tap, &next_node_lock);
67864+ done_lh(&next_node_lock);
67865+ if (result)
67866+ break;
67867+
67868+ /* Break long reiser4_cut_tree operation (deletion of a large
67869+ file) if atom requires commit. */
67870+ if (*progress > CUT_TREE_MIN_ITERATIONS
67871+ && current_atom_should_commit()) {
67872+ result = -E_REPEAT;
67873+ break;
67874+ }
67875+ }
67876+ done_lh(&next_node_lock);
67877+ // assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key()));
67878+ return result;
67879+}
67880+
67881+/* there is a fundamental problem with optimizing deletes: VFS does it
67882+ one file at a time. Another problem is that if an item can be
67883+ anything, then deleting items must be done one at a time. It just
67884+   seems clean to write this to specify a from key and a to key, and cut
67885+ everything between them though. */
67886+
67887+/* use this function with care when deleting more than what belongs to a single file. */
67888+/* do not use this when cutting a single item, it is suboptimal for that */
67889+
67890+/* You are encouraged to write plugin specific versions of this. It
67891+ cannot be optimal for all plugins because it works item at a time,
67892+ and some plugins could sometimes work node at a time. Regular files
67893+ however are not optimizable to work node at a time because of
67894+ extents needing to free the blocks they point to.
67895+
67896+ Optimizations compared to v3 code:
67897+
67898+ It does not balance (that task is left to memory pressure code).
67899+
67900+ Nodes are deleted only if empty.
67901+
67902+ Uses extents.
67903+
67904+ Performs read-ahead of formatted nodes whose contents are part of
67905+ the deletion.
67906+*/
67907+
67908+/**
67909+ * Delete everything from the reiser4 tree between two keys: @from_key and
67910+ * @to_key.
67911+ *
67912+ * @from_key: the beginning of the deleted key range,
67913+ * @to_key: the end of the deleted key range,
67914+ * @smallest_removed: the smallest removed key,
67915+ * @object: owner of cutting items.
67916+ * @truncate: true if called for file truncate.
67917+ * @progress: returns true if progress was made in deleting file items;
67918+ *            the @smallest_removed value is valid in that case.
67919+ *
67920+ * @return: 0 on success, error code otherwise; -E_REPEAT means that a long cut_tree
67921+ * operation was interrupted to allow an atom commit.
67922+ */
67923+
67924+int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
67925+ const reiser4_key * to_key,
67926+ reiser4_key * smallest_removed_p,
67927+ struct inode *object, int truncate, int *progress)
67928+{
67929+ lock_handle lock;
67930+ int result;
67931+ tap_t tap;
67932+ coord_t right_coord;
67933+ reiser4_key smallest_removed;
67934+ int (*cut_tree_worker) (tap_t *, const reiser4_key *,
67935+ const reiser4_key *, reiser4_key *,
67936+ struct inode *, int, int *);
67937+ STORE_COUNTERS;
67938+
67939+ assert("umka-329", tree != NULL);
67940+ assert("umka-330", from_key != NULL);
67941+ assert("umka-331", to_key != NULL);
67942+ assert("zam-936", keyle(from_key, to_key));
67943+
67944+ if (smallest_removed_p == NULL)
67945+ smallest_removed_p = &smallest_removed;
67946+
67947+ init_lh(&lock);
67948+
67949+ do {
67950+ /* Find rightmost item to cut away from the tree. */
67951+ result = reiser4_object_lookup(object, to_key, &right_coord,
67952+ &lock, ZNODE_WRITE_LOCK,
67953+ FIND_MAX_NOT_MORE_THAN,
67954+ TWIG_LEVEL, LEAF_LEVEL,
67955+ CBK_UNIQUE, NULL /*ra_info */);
67956+ if (result != CBK_COORD_FOUND)
67957+ break;
67958+ if (object == NULL
67959+ || inode_file_plugin(object)->cut_tree_worker == NULL)
67960+ cut_tree_worker = cut_tree_worker_common;
67961+ else
67962+ cut_tree_worker =
67963+ inode_file_plugin(object)->cut_tree_worker;
67964+ reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
67965+ result =
67966+ cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
67967+ object, truncate, progress);
67968+ reiser4_tap_done(&tap);
67969+
67970+ reiser4_preempt_point();
67971+
67972+ } while (0);
67973+
67974+ done_lh(&lock);
67975+
67976+ if (result) {
67977+ switch (result) {
67978+ case -E_NO_NEIGHBOR:
67979+ result = 0;
67980+ break;
67981+ case -E_DEADLOCK:
67982+ result = -E_REPEAT;
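+			/* fall through */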
67983+ case -E_REPEAT:
67984+ case -ENOMEM:
67985+ case -ENOENT:
67986+ break;
67987+ default:
67988+ warning("nikita-2861", "failure: %i", result);
67989+ }
67990+ }
67991+
67992+ CHECK_COUNTERS;
67993+ return result;
67994+}
67995+
67996+/* repeat reiser4_cut_tree_object until everything is deleted.
67997+ * unlike cut_file_items, it does not end current transaction if -E_REPEAT
67998+ * is returned by cut_tree_object. */
67999+int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
68000+ const reiser4_key * to, struct inode *inode, int truncate)
68001+{
68002+ int result;
68003+ int progress;
68004+
68005+ do {
68006+ result = reiser4_cut_tree_object(tree, from, to, NULL,
68007+ inode, truncate, &progress);
68008+ } while (result == -E_REPEAT);
68009+
68010+ return result;
68011+}
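+
+/* A sketch of how a truncate path might use this; key setup is plugin
+   specific and only hinted at here:
+
+	reiser4_key from, to;
+
+	(set @from to the key of the first byte being cut off,
+	 set @to to the maximal key of the file body)
+	ret = reiser4_cut_tree(current_tree, &from, &to, inode, 1);
+*/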
68012+
68013+/* finish reiser4 tree initialization */
68014+int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being
68015+ * initialized */ ,
68016+ const reiser4_block_nr * root_block /* address of a root block
68017+ * on a disk */ ,
68018+ tree_level height /* height of a tree */ ,
68019+ node_plugin * nplug /* default node plugin */ )
68020+{
68021+ int result;
68022+
68023+ assert("nikita-306", tree != NULL);
68024+ assert("nikita-307", root_block != NULL);
68025+ assert("nikita-308", height > 0);
68026+ assert("nikita-309", nplug != NULL);
68027+ assert("zam-587", tree->super != NULL);
68028+
68029+ tree->root_block = *root_block;
68030+ tree->height = height;
68031+ tree->estimate_one_insert = calc_estimate_one_insert(height);
68032+ tree->nplug = nplug;
68033+
68034+ tree->znode_epoch = 1ull;
68035+
68036+ cbk_cache_init(&tree->cbk_cache);
68037+
68038+ result = znodes_tree_init(tree);
68039+ if (result == 0)
68040+ result = jnodes_tree_init(tree);
68041+ if (result == 0) {
68042+ tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0,
68043+ reiser4_ctx_gfp_mask_get());
68044+ if (IS_ERR(tree->uber)) {
68045+ result = PTR_ERR(tree->uber);
68046+ tree->uber = NULL;
68047+ }
68048+ }
68049+ return result;
68050+}
68051+
68052+/* release resources associated with @tree */
68053+void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
68054+{
68055+ if (tree == NULL)
68056+ return;
68057+
68058+ if (tree->uber != NULL) {
68059+ zput(tree->uber);
68060+ tree->uber = NULL;
68061+ }
68062+ znodes_tree_done(tree);
68063+ jnodes_tree_done(tree);
68064+ cbk_cache_done(&tree->cbk_cache);
68065+}
68066+
68067+/* Make Linus happy.
68068+ Local variables:
68069+ c-indentation-style: "K&R"
68070+ mode-name: "LC"
68071+ c-basic-offset: 8
68072+ tab-width: 8
68073+ fill-column: 120
68074+ scroll-step: 1
68075+ End:
68076+*/
68077diff -urN linux-2.6.23.orig/fs/reiser4/tree.h linux-2.6.23/fs/reiser4/tree.h
68078--- linux-2.6.23.orig/fs/reiser4/tree.h 1970-01-01 03:00:00.000000000 +0300
68079+++ linux-2.6.23/fs/reiser4/tree.h 2007-12-04 16:49:30.000000000 +0300
68080@@ -0,0 +1,577 @@
68081+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68082+ * reiser4/README */
68083+
68084+/* Tree operations. See fs/reiser4/tree.c for comments */
68085+
68086+#if !defined( __REISER4_TREE_H__ )
68087+#define __REISER4_TREE_H__
68088+
68089+#include "forward.h"
68090+#include "debug.h"
68091+#include "dformat.h"
68092+#include "plugin/node/node.h"
68093+#include "plugin/plugin.h"
68094+#include "znode.h"
68095+#include "tap.h"
68096+
68097+#include <linux/types.h> /* for __u?? */
68098+#include <linux/fs.h> /* for struct super_block */
68099+#include <linux/spinlock.h>
68100+#include <linux/sched.h> /* for struct task_struct */
68101+
68102+/* fictive block number never actually used */
68103+extern const reiser4_block_nr UBER_TREE_ADDR;
68104+
68105+/* &cbk_cache_slot - entry in a coord cache.
68106+
68107+ This is entry in a coord_by_key (cbk) cache, represented by
68108+ &cbk_cache.
68109+
68110+*/
68111+typedef struct cbk_cache_slot {
68112+ /* cached node */
68113+ znode *node;
68114+ /* linkage to the next cbk cache slot in a LRU order */
68115+ struct list_head lru;
68116+} cbk_cache_slot;
68117+
68118+/* &cbk_cache - coord cache. This is part of reiser4_tree.
68119+
68120+ cbk_cache is supposed to speed up tree lookups by caching results of recent
68121+ successful lookups (we don't cache negative results as dentry cache
68122+ does). Cache consists of relatively small number of entries kept in a LRU
68123+ order. Each entry (&cbk_cache_slot) contains a pointer to znode, from
68124+ which we can obtain a range of keys that covered by this znode. Before
68125+   which we can obtain the range of keys covered by this znode. Before
68126+ each slot check whether key we are looking for is between minimal and
68127+ maximal keys for node pointed to by this slot. If no match is found, real
68128+ tree traversal is performed and if result is successful, appropriate entry
68129+ is inserted into cache, possibly pulling least recently used entry out of
68130+ it.
68131+
68132+ Tree spin lock is used to protect coord cache. If contention for this
68133+   lock proves to be too high, finer grained locking can be added.
68134+
68135+ Invariants involving parts of this data-type:
68136+
68137+ [cbk-cache-invariant]
68138+*/
68139+typedef struct cbk_cache {
68140+ /* serializator */
68141+ rwlock_t guard;
68142+ int nr_slots;
68143+ /* head of LRU list of cache slots */
68144+ struct list_head lru;
68145+ /* actual array of slots */
68146+ cbk_cache_slot *slot;
68147+} cbk_cache;
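+
+/* A sketch of the scan described above (key_is_inside() is a hypothetical
+   helper standing in for the real min/max key check on a slot's znode):
+
+	cbk_cache_slot *slot;
+
+	read_lock(&cache->guard);
+	list_for_each_entry(slot, &cache->lru, lru) {
+		if (slot->node != NULL && key_is_inside(key, slot->node)) {
+			(use slot->node and move the slot to the LRU head)
+			break;
+		}
+	}
+	read_unlock(&cache->guard);
+*/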
68148+
68149+/* level_lookup_result - possible outcome of looking up key at some level.
68150+ This is used by coord_by_key when traversing tree downward. */
68151+typedef enum {
68152+ /* continue to the next level */
68153+ LOOKUP_CONT,
68154+ /* done. Either required item was found, or we can prove it
68155+ doesn't exist, or some error occurred. */
68156+ LOOKUP_DONE,
68157+ /* restart traversal from the root. Infamous "repetition". */
68158+ LOOKUP_REST
68159+} level_lookup_result;
68160+
68161+/* This is representation of internal reiser4 tree where all file-system
68162+ data and meta-data are stored. This structure is passed to all tree
68163+ manipulation functions. It's different from the super block because
68164+ we don't want to limit ourselves to a strictly one-to-one mapping
68165+ between super blocks and trees, and because they are logically
68166+ different: there are things in a super block that have no relation to
68167+ the tree (bitmaps, journalling area, mount options, etc.) and there
68168+ are things in a tree that bear no relation to the super block, like
68169+ the tree of znodes.
68170+
68171+ At this time, there is only one tree
68172+ per filesystem, and this struct is part of the super block. We only
68173+ call the super block the super block for historical reasons (most
68174+ other filesystems call the per filesystem metadata the super block).
68175+*/
68176+
68177+struct reiser4_tree {
68178+ /* block_nr == 0 is the fake znode. Write-lock it while changing
68179+ the tree height. */
68180+ /* disk address of root node of a tree */
68181+ reiser4_block_nr root_block;
68182+
68183+ /* level of the root node. If this is 1, tree consists of root
68184+ node only */
68185+ tree_level height;
68186+
68187+ /*
68188+ * this is cached here to avoid calling plugins through a function-
68189+ * pointer dereference all the time.
68190+ */
68191+ __u64 estimate_one_insert;
68192+
68193+ /* cache of recent tree lookup results */
68194+ cbk_cache cbk_cache;
68195+
68196+ /* hash table to look up znodes by block number. */
68197+ z_hash_table zhash_table;
68198+ z_hash_table zfake_table;
68199+ /* hash table to look up jnodes by inode and offset. */
68200+ j_hash_table jhash_table;
68201+
68202+ /* lock protecting:
68203+ - parent pointers,
68204+ - sibling pointers,
68205+ - znode hash table
68206+ - coord cache
68207+ */
68208+ /* NOTE: The "giant" tree lock can be replaced by more spin locks, in
68209+ the hope that they will be less contended. We could use one spin lock
68210+ per znode hash bucket. At the cost of some code complexity, sibling
68211+ pointers can be protected by both znode spin locks. However, while this
68212+ looks more SMP-scalable, we should test the locking change on n-way
68213+ (n > 4) SMP machines. Current tests on a 4-way machine do not show that
68214+ the tree lock is contended or that it is a bottleneck (2003.07.25). */
68215+
68216+ rwlock_t tree_lock;
68217+
68218+ /* lock protecting delimiting keys */
68219+ rwlock_t dk_lock;
68220+
68221+ /* spin lock protecting znode_epoch */
68222+ spinlock_t epoch_lock;
68223+ /* version stamp used to mark znode updates. See seal.[ch] for more
68224+ * information. */
68225+ __u64 znode_epoch;
68226+
68227+ znode *uber;
68228+ node_plugin *nplug;
68229+ struct super_block *super;
68230+ struct {
68231+ /* carry flags used for insertion of new nodes */
68232+ __u32 new_node_flags;
68233+ /* carry flags used for insertion of new extents */
68234+ __u32 new_extent_flags;
68235+ /* carry flags used for paste operations */
68236+ __u32 paste_flags;
68237+ /* carry flags used for insert operations */
68238+ __u32 insert_flags;
68239+ } carry;
68240+};
68241+
68242+extern int reiser4_init_tree(reiser4_tree * tree,
68243+ const reiser4_block_nr * root_block,
68244+ tree_level height, node_plugin * default_plugin);
68245+extern void reiser4_done_tree(reiser4_tree * tree);
68246+
68247+/* cbk flags: options for coord_by_key() */
68248+typedef enum {
68249+ /* coord_by_key() is called for insertion. This is necessary because
68250+ of extents being located at the twig level. For explanation, see
68251+ comment just above is_next_item_internal().
68252+ */
68253+ CBK_FOR_INSERT = (1 << 0),
68254+ /* coord_by_key() is called with key that is known to be unique */
68255+ CBK_UNIQUE = (1 << 1),
68256+ /* coord_by_key() can trust delimiting keys. This option is not user
68257+ accessible. coord_by_key() will set it automatically. It will be
68258+ cleared only by a special case in extents-on-the-twig-level handling,
68259+ where it is necessary to insert an item with a key smaller than the
68260+ leftmost key in a node. This is necessary because of extents being
68261+ located at the twig level. For explanation, see comment just above
68262+ is_next_item_internal().
68263+ */
68264+ CBK_TRUST_DK = (1 << 2),
68265+ CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */
68266+ CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */
68267+ CBK_DKSET = (1 << 5),
68268+ CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */
68269+ CBK_IN_CACHE = (1 << 7), /* node is already in cache */
68270+ CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock instead of a long-term
68271+ * lock */
68272+} cbk_flags;
68273+
68274+/* insertion outcome. IBK = insert by key */
68275+typedef enum {
68276+ IBK_INSERT_OK = 0,
68277+ IBK_ALREADY_EXISTS = -EEXIST,
68278+ IBK_IO_ERROR = -EIO,
68279+ IBK_NO_SPACE = -E_NODE_FULL,
68280+ IBK_OOM = -ENOMEM
68281+} insert_result;
68282+
68283+#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
68284+
68285+typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
68286+ lock_handle * lh, void *arg);
68287+extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
68288+ lock_handle * lh,
68289+ tree_iterate_actor_t actor, void *arg,
68290+ znode_lock_mode mode, int through_units_p);
68291+extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
68292+ znode_lock_request pri, lock_handle * lh);
68293+
68294+/* return node plugin of @node */
68295+static inline node_plugin *node_plugin_by_node(const znode *
68296+ node /* node to query */ )
68297+{
68298+ assert("vs-213", node != NULL);
68299+ assert("vs-214", znode_is_loaded(node));
68300+
68301+ return node->nplug;
68302+}
68303+
68304+/* number of items in @node */
68305+static inline pos_in_node_t node_num_items(const znode * node)
68306+{
68307+ assert("nikita-2754", znode_is_loaded(node));
68308+ assert("nikita-2468",
68309+ node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
68310+
68311+ return node->nr_items;
68312+}
68313+
68314+/* Return the number of items at the present node. Asserts coord->node !=
68315+ NULL. */
68316+static inline unsigned coord_num_items(const coord_t * coord)
68317+{
68318+ assert("jmacd-9805", coord->node != NULL);
68319+
68320+ return node_num_items(coord->node);
68321+}
68322+
68323+/* true if @node is empty */
68324+static inline int node_is_empty(const znode * node)
68325+{
68326+ return node_num_items(node) == 0;
68327+}
68328+
68329+typedef enum {
68330+ SHIFTED_SOMETHING = 0,
68331+ SHIFT_NO_SPACE = -E_NODE_FULL,
68332+ SHIFT_IO_ERROR = -EIO,
68333+ SHIFT_OOM = -ENOMEM,
68334+} shift_result;
68335+
68336+extern node_plugin *node_plugin_by_coord(const coord_t * coord);
68337+extern int is_coord_in_node(const coord_t * coord);
68338+extern int key_in_node(const reiser4_key *, const coord_t *);
68339+extern void coord_item_move_to(coord_t * coord, int items);
68340+extern void coord_unit_move_to(coord_t * coord, int units);
68341+
68342+/* there are two types of repetitive accesses (ra): intra-syscall
68343+ (local) and inter-syscall (global). Local ra is used when, during a
68344+ single syscall, we add/delete several items and units in the
68345+ same place in a tree. Note that plan-A fragments local ra by
68346+ separating stat-data and file body in key-space. Global ra is
68347+ used when the user makes repetitive modifications in the same place
68348+ in a tree.
68349+
68350+ Our ra implementation serves the following purposes:
68351+ 1 it affects balancing decisions so that the next operation in a row
68352+ can be performed faster;
68353+ 2 it affects lower-level read-ahead in the page cache;
68354+ 3 it allows us to avoid unnecessary lookups by maintaining some state
68355+ across several operations (this is only for local ra);
68356+ 4 it leaves room for lazy micro-balancing: when we start a sequence of
68357+ operations, they are performed without actually doing any intra-node
68358+ shifts until we finish the sequence or its scope leaves the
68359+ current node; only then do we really pack the node (local ra only).
68360+*/
68361+
68362+/* another thing that can be useful is to keep per-tree and/or
68363+ per-process cache of recent lookups. This cache can be organised as a
68364+ list of block numbers of formatted nodes sorted by starting key in
68365+ this node. Balancings should invalidate appropriate parts of this
68366+ cache.
68367+*/
68368+
68369+lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
68370+ coord_t * coord, lock_handle * handle,
68371+ znode_lock_mode lock, lookup_bias bias,
68372+ tree_level lock_level, tree_level stop_level,
68373+ __u32 flags, ra_info_t *);
68374+
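 For orientation, a sketch of a typical lookup call (illustrative only,
 not part of the patch; FIND_EXACT and LEAF_LEVEL are the usual reiser4
 bias/level constants defined elsewhere in the tree):

	static int lookup_sketch(reiser4_tree *tree, const reiser4_key *key)
	{
		coord_t coord;
		lock_handle lh;
		lookup_result res;

		init_lh(&lh);
		res = coord_by_key(tree, key, &coord, &lh, ZNODE_READ_LOCK,
				   FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
				   CBK_UNIQUE, NULL /* no read-ahead info */);
		if (res == CBK_COORD_FOUND) {
			/* coord points into the node locked by lh */
		}
		done_lh(&lh);	/* drop the long-term lock in all cases */
		return res;
	}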
68375+lookup_result reiser4_object_lookup(struct inode *object,
68376+ const reiser4_key * key,
68377+ coord_t * coord,
68378+ lock_handle * lh,
68379+ znode_lock_mode lock_mode,
68380+ lookup_bias bias,
68381+ tree_level lock_level,
68382+ tree_level stop_level,
68383+ __u32 flags, ra_info_t * info);
68384+
68385+insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
68386+ reiser4_item_data * data, coord_t * coord,
68387+ lock_handle * lh,
68388+ tree_level stop_level, __u32 flags);
68389+insert_result insert_by_coord(coord_t * coord,
68390+ reiser4_item_data * data, const reiser4_key * key,
68391+ lock_handle * lh, __u32);
68392+insert_result insert_extent_by_coord(coord_t * coord,
68393+ reiser4_item_data * data,
68394+ const reiser4_key * key, lock_handle * lh);
68395+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
68396+ const reiser4_key * to_key,
68397+ reiser4_key * smallest_removed);
68398+int kill_node_content(coord_t * from, coord_t * to,
68399+ const reiser4_key * from_key, const reiser4_key * to_key,
68400+ reiser4_key * smallest_removed,
68401+ znode * locked_left_neighbor, struct inode *inode,
68402+ int truncate);
68403+
68404+int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
68405+ reiser4_key * key, lock_handle * lh, cop_insert_flag);
68406+int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
68407+ reiser4_item_data * data, unsigned);
68408+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
68409+int find_new_child_ptr(znode * parent, znode * child, znode * left,
68410+ coord_t * result);
68411+
68412+int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
68413+int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
68414+
68415+void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
68416+
68417+extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
68418+ const reiser4_key *, reiser4_key *,
68419+ struct inode *, int, int *);
68420+extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
68421+ const reiser4_key *, reiser4_key *,
68422+ struct inode *, int, int *);
68423+extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
68424+ const reiser4_key * to, struct inode *, int);
68425+
68426+extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
68427+extern int check_tree_pointer(const coord_t * pointer, const znode * child);
68428+extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
68429+ znode * left, coord_t * result);
68430+extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
68431+extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
68432+ znode * child);
68433+extern znode *child_znode(const coord_t * in_parent, znode * parent,
68434+ int incore_p, int setup_dkeys_p);
68435+
68436+extern int cbk_cache_init(cbk_cache * cache);
68437+extern void cbk_cache_done(cbk_cache * cache);
68438+extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
68439+
68440+extern char *sprint_address(const reiser4_block_nr * block);
68441+
68442+#if REISER4_DEBUG
68443+extern void print_coord_content(const char *prefix, coord_t * p);
68444+extern void reiser4_print_address(const char *prefix,
68445+ const reiser4_block_nr * block);
68446+extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
68447+ __u32 flags);
68448+extern void check_dkeys(znode *node);
68449+#else
68450+#define print_coord_content(p, c) noop
68451+#define reiser4_print_address(p, b) noop
68452+#endif
68453+
68454+extern void forget_znode(lock_handle * handle);
68455+extern int deallocate_znode(znode * node);
68456+
68457+extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
68458+
68459+/* struct used internally to pack the numerous arguments of a tree lookup.
68460+ Used to avoid passing a lot of arguments to helper functions. */
68461+typedef struct cbk_handle {
68462+ /* tree we are in */
68463+ reiser4_tree *tree;
68464+ /* key we are going after */
68465+ const reiser4_key *key;
68466+ /* coord we will store result in */
68467+ coord_t *coord;
68468+ /* type of lock to take on target node */
68469+ znode_lock_mode lock_mode;
68470+ /* lookup bias. See comments at the declaration of lookup_bias */
68471+ lookup_bias bias;
68472+ /* lock level: level starting from which tree traversal starts taking
68473+ * write locks. */
68474+ tree_level lock_level;
68475+ /* level where search will stop. Either item will be found between
68476+ lock_level and stop_level, or CBK_COORD_NOTFOUND will be
68477+ returned.
68478+ */
68479+ tree_level stop_level;
68480+ /* level we are currently at */
68481+ tree_level level;
68482+ /* block number of @active node. Tree traversal operates on two
68483+ nodes: active and parent. */
68484+ reiser4_block_nr block;
68485+ /* put here error message to be printed by caller */
68486+ const char *error;
68487+ /* result passed back to caller */
68488+ lookup_result result;
68489+ /* lock handles for active and parent */
68490+ lock_handle *parent_lh;
68491+ lock_handle *active_lh;
68492+ reiser4_key ld_key;
68493+ reiser4_key rd_key;
68494+ /* flags, passed to the cbk routine. Bits of this bitmask are defined
68495+ in tree.h:cbk_flags enum. */
68496+ __u32 flags;
68497+ ra_info_t *ra_info;
68498+ struct inode *object;
68499+} cbk_handle;
68500+
68501+extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
68502+
68503+/* eottl.c */
68504+extern int handle_eottl(cbk_handle *h, int *outcome);
68505+
68506+int lookup_multikey(cbk_handle * handle, int nr_keys);
68507+int lookup_couple(reiser4_tree * tree,
68508+ const reiser4_key * key1, const reiser4_key * key2,
68509+ coord_t * coord1, coord_t * coord2,
68510+ lock_handle * lh1, lock_handle * lh2,
68511+ znode_lock_mode lock_mode, lookup_bias bias,
68512+ tree_level lock_level, tree_level stop_level, __u32 flags,
68513+ int *result1, int *result2);
68514+
68515+static inline void read_lock_tree(reiser4_tree *tree)
68516+{
68517+ /* check that tree is not locked */
68518+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
68519+ LOCK_CNT_NIL(read_locked_tree) &&
68520+ LOCK_CNT_NIL(write_locked_tree)));
68521+ /* check that spinlocks of lower priorities are not held */
68522+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
68523+ LOCK_CNT_NIL(rw_locked_dk) &&
68524+ LOCK_CNT_NIL(spin_locked_stack)));
68525+
68526+ read_lock(&(tree->tree_lock));
68527+
68528+ LOCK_CNT_INC(read_locked_tree);
68529+ LOCK_CNT_INC(rw_locked_tree);
68530+ LOCK_CNT_INC(spin_locked);
68531+}
68532+
68533+static inline void read_unlock_tree(reiser4_tree *tree)
68534+{
68535+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
68536+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
68537+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68538+
68539+ LOCK_CNT_DEC(read_locked_tree);
68540+ LOCK_CNT_DEC(rw_locked_tree);
68541+ LOCK_CNT_DEC(spin_locked);
68542+
68543+ read_unlock(&(tree->tree_lock));
68544+}
68545+
68546+static inline void write_lock_tree(reiser4_tree *tree)
68547+{
68548+ /* check that tree is not locked */
68549+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
68550+ LOCK_CNT_NIL(read_locked_tree) &&
68551+ LOCK_CNT_NIL(write_locked_tree)));
68552+ /* check that spinlocks of lower priorities are not held */
68553+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
68554+ LOCK_CNT_NIL(rw_locked_dk) &&
68555+ LOCK_CNT_NIL(spin_locked_stack)));
68556+
68557+ write_lock(&(tree->tree_lock));
68558+
68559+ LOCK_CNT_INC(write_locked_tree);
68560+ LOCK_CNT_INC(rw_locked_tree);
68561+ LOCK_CNT_INC(spin_locked);
68562+}
68563+
68564+static inline void write_unlock_tree(reiser4_tree *tree)
68565+{
68566+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
68567+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
68568+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68569+
68570+ LOCK_CNT_DEC(write_locked_tree);
68571+ LOCK_CNT_DEC(rw_locked_tree);
68572+ LOCK_CNT_DEC(spin_locked);
68573+
68574+ write_unlock(&(tree->tree_lock));
68575+}
68576+
68577+static inline void read_lock_dk(reiser4_tree *tree)
68578+{
68579+ /* check that dk is not locked */
68580+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
68581+ LOCK_CNT_NIL(read_locked_dk) &&
68582+ LOCK_CNT_NIL(write_locked_dk)));
68583+ /* check that spinlocks of lower priorities are not held */
68584+ assert("", LOCK_CNT_NIL(spin_locked_stack));
68585+
68586+ read_lock(&((tree)->dk_lock));
68587+
68588+ LOCK_CNT_INC(read_locked_dk);
68589+ LOCK_CNT_INC(rw_locked_dk);
68590+ LOCK_CNT_INC(spin_locked);
68591+}
68592+
68593+static inline void read_unlock_dk(reiser4_tree *tree)
68594+{
68595+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
68596+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
68597+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68598+
68599+ LOCK_CNT_DEC(read_locked_dk);
68600+ LOCK_CNT_DEC(rw_locked_dk);
68601+ LOCK_CNT_DEC(spin_locked);
68602+
68603+ read_unlock(&(tree->dk_lock));
68604+}
68605+
68606+static inline void write_lock_dk(reiser4_tree *tree)
68607+{
68608+ /* check that dk is not locked */
68609+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
68610+ LOCK_CNT_NIL(read_locked_dk) &&
68611+ LOCK_CNT_NIL(write_locked_dk)));
68612+ /* check that spinlocks of lower priorities are not held */
68613+ assert("", LOCK_CNT_NIL(spin_locked_stack));
68614+
68615+ write_lock(&((tree)->dk_lock));
68616+
68617+ LOCK_CNT_INC(write_locked_dk);
68618+ LOCK_CNT_INC(rw_locked_dk);
68619+ LOCK_CNT_INC(spin_locked);
68620+}
68621+
68622+static inline void write_unlock_dk(reiser4_tree *tree)
68623+{
68624+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
68625+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
68626+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68627+
68628+ LOCK_CNT_DEC(write_locked_dk);
68629+ LOCK_CNT_DEC(rw_locked_dk);
68630+ LOCK_CNT_DEC(spin_locked);
68631+
68632+ write_unlock(&(tree->dk_lock));
68633+}
68634+
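 The assertions in the helpers above encode a fixed lock ordering: the
 tree lock is taken only while the dk lock is not held, so the dk lock
 may nest under it but never the other way around. A minimal sketch of a
 reader honoring that order (illustrative only, not part of the patch):

	static void tree_then_dk_sketch(reiser4_tree *tree)
	{
		read_lock_tree(tree);	/* tree lock first */
		read_lock_dk(tree);	/* dk lock nests under the tree lock */
		/* ... inspect sibling pointers and delimiting keys ... */
		read_unlock_dk(tree);
		read_unlock_tree(tree);
	}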
68635+/* estimate api. Implementation is in estimate.c */
68636+reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
68637+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
68638+reiser4_block_nr estimate_insert_flow(tree_level);
68639+reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
68640+reiser4_block_nr calc_estimate_one_insert(tree_level);
68641+reiser4_block_nr estimate_dirty_cluster(struct inode *);
68642+reiser4_block_nr estimate_insert_cluster(struct inode *);
68643+reiser4_block_nr estimate_update_cluster(struct inode *);
68644+
68645+/* __REISER4_TREE_H__ */
68646+#endif
68647+
68648+/* Make Linus happy.
68649+ Local variables:
68650+ c-indentation-style: "K&R"
68651+ mode-name: "LC"
68652+ c-basic-offset: 8
68653+ tab-width: 8
68654+ fill-column: 120
68655+ scroll-step: 1
68656+ End:
68657+*/
68658diff -urN linux-2.6.23.orig/fs/reiser4/tree_mod.c linux-2.6.23/fs/reiser4/tree_mod.c
68659--- linux-2.6.23.orig/fs/reiser4/tree_mod.c 1970-01-01 03:00:00.000000000 +0300
68660+++ linux-2.6.23/fs/reiser4/tree_mod.c 2007-12-04 16:49:30.000000000 +0300
68661@@ -0,0 +1,386 @@
68662+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68663+ * reiser4/README */
68664+
68665+/*
68666+ * Functions to add/delete new nodes to/from the tree.
68667+ *
68668+ * Functions from this file are used by carry (see carry*) to handle:
68669+ *
68670+ * . insertion of new formatted node into tree
68671+ *
68672+ * . addition of new tree root, increasing tree height
68673+ *
68674+ * . removing tree root, decreasing tree height
68675+ *
68676+ */
68677+
68678+#include "forward.h"
68679+#include "debug.h"
68680+#include "dformat.h"
68681+#include "key.h"
68682+#include "coord.h"
68683+#include "plugin/plugin.h"
68684+#include "jnode.h"
68685+#include "znode.h"
68686+#include "tree_mod.h"
68687+#include "block_alloc.h"
68688+#include "tree_walk.h"
68689+#include "tree.h"
68690+#include "super.h"
68691+
68692+#include <linux/err.h>
68693+
68694+static int add_child_ptr(znode * parent, znode * child);
68695+/* warning only issued if error is not -E_REPEAT */
68696+#define ewarning(error, ...) \
68697+ do { if ((error) != -E_REPEAT) \
68698+ warning(__VA_ARGS__); } while (0)
68699+
68700+/* allocate new node on the @level and immediately on the right of @brother. */
68701+znode * reiser4_new_node(znode * brother /* existing left neighbor
68702+ * of new node */,
68703+ tree_level level /* tree level at which new node is to
68704+ * be allocated */)
68705+{
68706+ znode *result;
68707+ int retcode;
68708+ reiser4_block_nr blocknr;
68709+
68710+ assert("nikita-930", brother != NULL);
68711+ assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
68712+
68713+ retcode = assign_fake_blocknr_formatted(&blocknr);
68714+ if (retcode == 0) {
68715+ result =
68716+ zget(znode_get_tree(brother), &blocknr, NULL, level,
68717+ reiser4_ctx_gfp_mask_get());
68718+ if (IS_ERR(result)) {
68719+ ewarning(PTR_ERR(result), "nikita-929",
68720+ "Cannot allocate znode for carry: %li",
68721+ PTR_ERR(result));
68722+ return result;
68723+ }
68724+ /* cheap test, can be executed even when debugging is off */
68725+ if (!znode_just_created(result)) {
68726+ warning("nikita-2213",
68727+ "Allocated already existing block: %llu",
68728+ (unsigned long long)blocknr);
68729+ zput(result);
68730+ return ERR_PTR(RETERR(-EIO));
68731+ }
68732+
68733+ assert("nikita-931", result != NULL);
68734+ result->nplug = znode_get_tree(brother)->nplug;
68735+ assert("nikita-933", result->nplug != NULL);
68736+
68737+ retcode = zinit_new(result, reiser4_ctx_gfp_mask_get());
68738+ if (retcode == 0) {
68739+ ZF_SET(result, JNODE_CREATED);
68740+ zrelse(result);
68741+ } else {
68742+ zput(result);
68743+ result = ERR_PTR(retcode);
68744+ }
68745+ } else {
68746+ /* failure to allocate new node during balancing.
68747+ This should never happen. Ever. Returning -E_REPEAT
68748+ is not a viable solution, because "out of disk space"
68749+ is not a transient error that will go away by itself.
68750+ */
68751+ ewarning(retcode, "nikita-928",
68752+ "Cannot allocate block for carry: %i", retcode);
68753+ result = ERR_PTR(retcode);
68754+ }
68755+ assert("nikita-1071", result != NULL);
68756+ return result;
68757+}
68758+
68759+/* allocate new root and add it to the tree
68760+
68761+ This helper function is called by add_new_root().
68762+
68763+*/
68764+znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ ,
68765+ znode * fake /* "fake" znode */ )
68766+{
68767+ reiser4_tree *tree = znode_get_tree(old_root);
68768+ znode *new_root = NULL; /* to shut gcc up */
68769+ int result;
68770+
68771+ assert("nikita-1069", old_root != NULL);
68772+ assert("umka-262", fake != NULL);
68773+ assert("umka-263", tree != NULL);
68774+
68775+ /* "fake" znode---one always hanging just above current root. This
68776+ node is locked when new root is created or existing root is
68777+ deleted. Downward tree traversal takes lock on it before taking
68778+ lock on a root node. This avoids race conditions with root
68779+ manipulations.
68780+
68781+ */
68782+ assert("nikita-1348", znode_above_root(fake));
68783+ assert("nikita-1211", znode_is_root(old_root));
68784+
68785+ result = 0;
68786+ if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
68787+ warning("nikita-1344", "Tree is too tall: %i", tree->height);
68788+ /* ext2 returns -ENOSPC when it runs out of free inodes with a
68789+ following comment (fs/ext2/ialloc.c:441): Is it really
68790+ ENOSPC?
68791+
68792+ -EXFULL? -EINVAL?
68793+ */
68794+ result = RETERR(-ENOSPC);
68795+ } else {
68796+ /* Allocate a block for the new root. It's not that
68797+ important where it will be allocated, as the root is
68798+ almost always in memory. Moreover, allocate-on-flush
68799+ may be going on here.
68800+ */
68801+ assert("nikita-1448", znode_is_root(old_root));
68802+ new_root = reiser4_new_node(fake, tree->height + 1);
68803+ if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
68804+ lock_handle rlh;
68805+
68806+ init_lh(&rlh);
68807+ result =
68808+ longterm_lock_znode(&rlh, new_root,
68809+ ZNODE_WRITE_LOCK,
68810+ ZNODE_LOCK_LOPRI);
68811+ if (result == 0) {
68812+ parent_coord_t *in_parent;
68813+
68814+ znode_make_dirty(fake);
68815+
68816+ /* new root is a child of "fake" node */
68817+ write_lock_tree(tree);
68818+
68819+ ++tree->height;
68820+
68821+ /* recalculate max balance overhead */
68822+ tree->estimate_one_insert =
68823+ estimate_one_insert_item(tree);
68824+
68825+ tree->root_block = *znode_get_block(new_root);
68826+ in_parent = &new_root->in_parent;
68827+ init_parent_coord(in_parent, fake);
68828+ /* manually insert new root into sibling
68829+ * list. With this, all nodes involved in
68830+ * balancing are connected after balancing is
68831+ * done---useful invariant to check. */
68832+ sibling_list_insert_nolock(new_root, NULL);
68833+ write_unlock_tree(tree);
68834+
68835+ /* insert into new root pointer to the
68836+ @old_root. */
68837+ assert("nikita-1110",
68838+ WITH_DATA(new_root,
68839+ node_is_empty(new_root)));
68840+ write_lock_dk(tree);
68841+ znode_set_ld_key(new_root, reiser4_min_key());
68842+ znode_set_rd_key(new_root, reiser4_max_key());
68843+ write_unlock_dk(tree);
68844+ if (REISER4_DEBUG) {
68845+ ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
68846+ ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
68847+ ZF_SET(old_root, JNODE_ORPHAN);
68848+ }
68849+ result = add_child_ptr(new_root, old_root);
68850+ done_lh(&rlh);
68851+ }
68852+ zrelse(new_root);
68853+ }
68854+ }
68855+ if (result != 0)
68856+ new_root = ERR_PTR(result);
68857+ return new_root;
68858+}
68859+
68860+/* build &reiser4_item_data for inserting child pointer
68861+
68862+ Build &reiser4_item_data that can be later used to insert pointer to @child
68863+ in its parent.
68864+
68865+*/
68866+void build_child_ptr_data(znode * child /* node pointer to which will be
68867+ * inserted */ ,
68868+ reiser4_item_data * data /* where to store result */ )
68869+{
68870+ assert("nikita-1116", child != NULL);
68871+ assert("nikita-1117", data != NULL);
68872+
68873+ /*
68874+ * NOTE: use the address of the child's blocknr as the address of the
68875+ * data to be inserted. As a result, the data gets into the on-disk
68876+ * structure in CPU byte order; internal's create_hook converts it to
68877+ * little-endian byte order.
68878+ */
68879+ data->data = (char *)znode_get_block(child);
68880+ /* data -> data is kernel space */
68881+ data->user = 0;
68882+ data->length = sizeof(reiser4_block_nr);
68883+ /* FIXME-VS: hardcoded internal item? */
68884+
68885+ /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
68886+ data->iplug = item_plugin_by_id(NODE_POINTER_ID);
68887+}
68888+
68889+/* add pointer to @child into empty @parent.
68890+
68891+ This is used when pointer to old root is inserted into new root which is
68892+ empty.
68893+*/
68894+static int add_child_ptr(znode * parent, znode * child)
68895+{
68896+ coord_t coord;
68897+ reiser4_item_data data;
68898+ int result;
68899+ reiser4_key key;
68900+
68901+ assert("nikita-1111", parent != NULL);
68902+ assert("nikita-1112", child != NULL);
68903+ assert("nikita-1115",
68904+ znode_get_level(parent) == znode_get_level(child) + 1);
68905+
68906+ result = zload(parent);
68907+ if (result != 0)
68908+ return result;
68909+ assert("nikita-1113", node_is_empty(parent));
68910+ coord_init_first_unit(&coord, parent);
68911+
68912+ build_child_ptr_data(child, &data);
68913+ data.arg = NULL;
68914+
68915+ read_lock_dk(znode_get_tree(parent));
68916+ key = *znode_get_ld_key(child);
68917+ read_unlock_dk(znode_get_tree(parent));
68918+
68919+ result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
68920+ NULL);
68921+ znode_make_dirty(parent);
68922+ zrelse(parent);
68923+ return result;
68924+}
68925+
68926+/* actually remove tree root */
68927+static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is
68928+ * being removed */,
68929+ znode * old_root /* root node that is being
68930+ * removed */ ,
68931+ znode * new_root /* new root---sole child of
68932+ * @old_root */,
68933+ const reiser4_block_nr * new_root_blk /* disk address of
68934+ * @new_root */)
68935+{
68936+ znode *uber;
68937+ int result;
68938+ lock_handle handle_for_uber;
68939+
68940+ assert("umka-265", tree != NULL);
68941+ assert("nikita-1198", new_root != NULL);
68942+ assert("nikita-1199",
68943+ znode_get_level(new_root) + 1 == znode_get_level(old_root));
68944+
68945+ assert("nikita-1201", znode_is_write_locked(old_root));
68946+
68947+ assert("nikita-1203",
68948+ disk_addr_eq(new_root_blk, znode_get_block(new_root)));
68949+
68950+ init_lh(&handle_for_uber);
68951+ /* obtain and lock "fake" znode protecting changes in tree height. */
68952+ result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
68953+ &handle_for_uber);
68954+ if (result == 0) {
68955+ uber = handle_for_uber.node;
68956+
68957+ znode_make_dirty(uber);
68958+
68959+ /* don't take a long-term lock on @new_root. Take a spin lock. */
68960+
68961+ write_lock_tree(tree);
68962+
68963+ tree->root_block = *new_root_blk;
68964+ --tree->height;
68965+
68966+ /* recalculate max balance overhead */
68967+ tree->estimate_one_insert = estimate_one_insert_item(tree);
68968+
68969+ assert("nikita-1202",
68970+ tree->height == znode_get_level(new_root));
68971+
68972+ /* new root is child on "fake" node */
68973+ init_parent_coord(&new_root->in_parent, uber);
68974+ ++uber->c_count;
68975+
68976+ /* sibling_list_insert_nolock(new_root, NULL); */
68977+ write_unlock_tree(tree);
68978+
68979+ /* reinitialise old root. */
68980+ result = node_plugin_by_node(old_root)->init(old_root);
68981+ znode_make_dirty(old_root);
68982+ if (result == 0) {
68983+ assert("nikita-1279", node_is_empty(old_root));
68984+ ZF_SET(old_root, JNODE_HEARD_BANSHEE);
68985+ old_root->c_count = 0;
68986+ }
68987+ }
68988+ done_lh(&handle_for_uber);
68989+
68990+ return result;
68991+}
68992+
68993+/* remove tree root
68994+
68995+ This function removes tree root, decreasing tree height by one. Tree root
68996+ and its only child (that is going to become new tree root) are write locked
68997+ at the entry.
68998+
68999+ To remove tree root we need to take lock on special "fake" znode that
69000+ protects changes of tree height. See comments in reiser4_add_tree_root() for
69001+ more on this.
69002+
69003+ Also, parent pointers have to be updated in the
69004+ old and new root. To simplify the code, the function is split into two
69005+ parts: the outer reiser4_kill_tree_root() collects all necessary
69006+ arguments and calls reiser4_kill_root() to do the actual job.
69007+
69008+*/
69009+int reiser4_kill_tree_root(znode * old_root /* tree root that we are
69010+ removing*/)
69011+{
69012+ int result;
69013+ coord_t down_link;
69014+ znode *new_root;
69015+ reiser4_tree *tree;
69016+
69017+ assert("umka-266", current_tree != NULL);
69018+ assert("nikita-1194", old_root != NULL);
69019+ assert("nikita-1196", znode_is_root(old_root));
69020+ assert("nikita-1200", node_num_items(old_root) == 1);
69021+ assert("nikita-1401", znode_is_write_locked(old_root));
69022+
69023+ coord_init_first_unit(&down_link, old_root);
69024+
69025+ tree = znode_get_tree(old_root);
69026+ new_root = child_znode(&down_link, old_root, 0, 1);
69027+ if (!IS_ERR(new_root)) {
69028+ result =
69029+ reiser4_kill_root(tree, old_root, new_root,
69030+ znode_get_block(new_root));
69031+ zput(new_root);
69032+ } else
69033+ result = PTR_ERR(new_root);
69034+
69035+ return result;
69036+}
69037+
69038+/* Make Linus happy.
69039+ Local variables:
69040+ c-indentation-style: "K&R"
69041+ mode-name: "LC"
69042+ c-basic-offset: 8
69043+ tab-width: 8
69044+ fill-column: 120
69045+ scroll-step: 1
69046+ End:
69047+*/
69048diff -urN linux-2.6.23.orig/fs/reiser4/tree_mod.h linux-2.6.23/fs/reiser4/tree_mod.h
69049--- linux-2.6.23.orig/fs/reiser4/tree_mod.h 1970-01-01 03:00:00.000000000 +0300
69050+++ linux-2.6.23/fs/reiser4/tree_mod.h 2007-12-04 16:49:30.000000000 +0300
69051@@ -0,0 +1,29 @@
69052+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69053+ * reiser4/README */
69054+
69055+/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
69056+ * comments. */
69057+
69058+#if !defined( __REISER4_TREE_MOD_H__ )
69059+#define __REISER4_TREE_MOD_H__
69060+
69061+#include "forward.h"
69062+
69063+znode *reiser4_new_node(znode * brother, tree_level level);
69064+znode *reiser4_add_tree_root(znode * old_root, znode * fake);
69065+int reiser4_kill_tree_root(znode * old_root);
69066+void build_child_ptr_data(znode * child, reiser4_item_data * data);
69067+
69068+/* __REISER4_TREE_MOD_H__ */
69069+#endif
69070+
69071+/* Make Linus happy.
69072+ Local variables:
69073+ c-indentation-style: "K&R"
69074+ mode-name: "LC"
69075+ c-basic-offset: 8
69076+ tab-width: 8
69077+ fill-column: 120
69078+ scroll-step: 1
69079+ End:
69080+*/
69081diff -urN linux-2.6.23.orig/fs/reiser4/tree_walk.c linux-2.6.23/fs/reiser4/tree_walk.c
69082--- linux-2.6.23.orig/fs/reiser4/tree_walk.c 1970-01-01 03:00:00.000000000 +0300
69083+++ linux-2.6.23/fs/reiser4/tree_walk.c 2007-12-04 16:49:30.000000000 +0300
69084@@ -0,0 +1,927 @@
69085+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69086+ * reiser4/README */
69087+
69088+/* Routines and macros to:
69089+
69090+ get_left_neighbor()
69091+
69092+ get_right_neighbor()
69093+
69094+ get_parent()
69095+
69096+ get_first_child()
69097+
69098+ get_last_child()
69099+
69100+ various routines to walk the whole tree and do things to it like
69101+ repack it, or move it to tertiary storage. Please make them as
69102+ generic as is reasonable.
69103+
69104+*/
69105+
69106+#include "forward.h"
69107+#include "debug.h"
69108+#include "dformat.h"
69109+#include "coord.h"
69110+#include "plugin/item/item.h"
69111+#include "jnode.h"
69112+#include "znode.h"
69113+#include "tree_walk.h"
69114+#include "tree.h"
69115+#include "super.h"
69116+
69117+/* These macros are used internally in tree_walk.c in an attempt to make
69118+ the lock_neighbor() code usable for building lock_parent(),
69119+ lock_right_neighbor() and lock_left_neighbor(). */
69120+#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
69121+#define FIELD_OFFSET(name) offsetof(znode, name)
69122+#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
69123+#define LEFT_PTR_OFFSET FIELD_OFFSET(left)
69124+#define RIGHT_PTR_OFFSET FIELD_OFFSET(right)
69125+
69126+/* This is the generic procedure to get and lock a `generic' neighbor (left
69127+ or right neighbor, or parent). It implements the common algorithm for all
69128+ cases of taking a lock on a neighbor node; only the znode structure field
69129+ differs in each case. It is parameterized by the ptr_offset argument,
69130+ which is the byte offset of the pointer to the desired neighbor within
69131+ the current node's znode structure. Call this with the tree lock held. */
69132+static int lock_neighbor(
69133+ /* resulting lock handle */
69134+ lock_handle * result,
69135+ /* znode to lock */
69136+ znode * node,
69137+ /* pointer to neighbor (or parent) znode field offset, in bytes from
69138+ the base address of znode structure */
69139+ int ptr_offset,
69140+ /* lock mode for longterm_lock_znode call */
69141+ znode_lock_mode mode,
69142+ /* lock request for longterm_lock_znode call */
69143+ znode_lock_request req,
69144+ /* GN_* flags */
69145+ int flags, int rlocked)
69146+{
69147+ reiser4_tree *tree = znode_get_tree(node);
69148+ znode *neighbor;
69149+ int ret;
69150+
69151+ assert("umka-236", node != NULL);
69152+ assert("umka-237", tree != NULL);
69153+ assert_rw_locked(&(tree->tree_lock));
69154+
69155+ if (flags & GN_TRY_LOCK)
69156+ req |= ZNODE_LOCK_NONBLOCK;
69157+ if (flags & GN_SAME_ATOM)
69158+ req |= ZNODE_LOCK_DONT_FUSE;
69159+
69160+ /* get the neighbor's address by using the sibling link; quit the
69161+ while loop (and return) if the link is not available. */
69162+ while (1) {
69163+ neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
69164+
69165+ /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
69166+ * node pointed by it is not connected.
69167+ *
69168+ * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
69169+ * check and allows passing reference to not connected znode to
69170+ * subsequent longterm_lock_znode() call. This kills possible
69171+ * busy loop if we are trying to get longterm lock on locked but
69172+ * not yet connected parent node. */
69173+ if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
69174+ || znode_is_connected(neighbor))) {
69175+ return RETERR(-E_NO_NEIGHBOR);
69176+ }
69177+
69178+ /* protect it from deletion. */
69179+ zref(neighbor);
69180+
69181+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
69182+
69183+ ret = longterm_lock_znode(result, neighbor, mode, req);
69184+
69185+ /* The lock handle obtains its own reference, release the one from above. */
69186+ zput(neighbor);
69187+
69188+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
69189+
69190+ /* restart if the node we got a reference to is being
69191+ invalidated; we should not take a reference to this
69192+ node again. */
69193+ if (ret == -EINVAL)
69194+ continue;
69195+ if (ret)
69196+ return ret;
69197+
69198+ /* check if neighbor link still points to just locked znode;
69199+ the link could have been changed while the process slept. */
69200+ if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
69201+ return 0;
69202+
69203+ /* znode was locked by mistake; unlock it and restart locking
69204+ process from beginning. */
69205+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
69206+ longterm_unlock_znode(result);
69207+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
69208+ }
69209+}
69210+
69211+/* get parent node with longterm lock, accepts GN* flags. */
69212+int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
69213+ znode * node /* child node */ ,
69214+ znode_lock_mode mode
69215+ /* type of lock: read or write */ ,
69216+ int flags /* GN_* flags */ )
69217+{
69218+ int result;
69219+
69220+ read_lock_tree(znode_get_tree(node));
69221+ result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
69222+ ZNODE_LOCK_HIPRI, flags, 1);
69223+ read_unlock_tree(znode_get_tree(node));
69224+ return result;
69225+}
69226+
69227+/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT
69228+ bit in @flags parameter */
69229+/* Audited by: umka (2002.06.14) */
69230+static inline int
69231+lock_side_neighbor(lock_handle * result,
69232+ znode * node, znode_lock_mode mode, int flags, int rlocked)
69233+{
69234+ int ret;
69235+ int ptr_offset;
69236+ znode_lock_request req;
69237+
69238+ if (flags & GN_GO_LEFT) {
69239+ ptr_offset = LEFT_PTR_OFFSET;
69240+ req = ZNODE_LOCK_LOPRI;
69241+ } else {
69242+ ptr_offset = RIGHT_PTR_OFFSET;
69243+ req = ZNODE_LOCK_HIPRI;
69244+ }
69245+
69246+ ret =
69247+ lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
69248+
69249+ if (ret == -E_NO_NEIGHBOR) /* when walking left or right, -E_NO_NEIGHBOR
69250+ * does not guarantee that the neighbor is
69251+ * absent from the tree; in this case we return
69252+ * -ENOENT, meaning the neighbor was at least
69253+ * not found in the cache */
69254+ return RETERR(-ENOENT);
69255+
69256+ return ret;
69257+}
69258+
69259+#if REISER4_DEBUG
69260+
69261+int check_sibling_list(znode * node)
69262+{
69263+ znode *scan;
69264+ znode *next;
69265+
69266+ assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
69267+
69268+ if (node == NULL)
69269+ return 1;
69270+
69271+ if (ZF_ISSET(node, JNODE_RIP))
69272+ return 1;
69273+
69274+ assert("nikita-3270", node != NULL);
69275+ assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
69276+
69277+ for (scan = node; znode_is_left_connected(scan); scan = next) {
69278+ next = scan->left;
69279+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
69280+ assert("nikita-3271", znode_is_right_connected(next));
69281+ assert("nikita-3272", next->right == scan);
69282+ } else
69283+ break;
69284+ }
69285+ for (scan = node; znode_is_right_connected(scan); scan = next) {
69286+ next = scan->right;
69287+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
69288+ assert("nikita-3273", znode_is_left_connected(next));
69289+ assert("nikita-3274", next->left == scan);
69290+ } else
69291+ break;
69292+ }
69293+ return 1;
69294+}
69295+
69296+#endif
69297+
69298+/* Znode sibling pointer maintenance. */
69299+
69300+/* Znode sibling pointers are established between any neighboring nodes that
69301+ are in cache. There are two znode state bits (JNODE_LEFT_CONNECTED,
69302+ JNODE_RIGHT_CONNECTED); if the left or right sibling pointer contains an
69303+ actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
69304+
69305+ Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
69306+ take care of searching for znode neighbors (a hash table lookup may be
69307+ required), establishing sibling pointers between them and setting the
69308+ JNODE_*_CONNECTED state bits. */
69309+
69310+/* adjusting of sibling pointers and `connected' states for two
69311+ neighbors; works if one neighbor is NULL (was not found). */
69312+
69313+/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
69314+void link_left_and_right(znode * left, znode * right)
69315+{
69316+ assert("nikita-3275", check_sibling_list(left));
69317+ assert("nikita-3275", check_sibling_list(right));
69318+
69319+ if (left != NULL) {
69320+ if (left->right == NULL) {
69321+ left->right = right;
69322+ ZF_SET(left, JNODE_RIGHT_CONNECTED);
69323+
69324+ ON_DEBUG(left->right_version =
69325+ atomic_inc_return(&delim_key_version);
69326+ );
69327+
69328+ } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
69329+ && left->right != right) {
69330+
69331+ ON_DEBUG(left->right->left_version =
69332+ atomic_inc_return(&delim_key_version);
69333+ left->right_version =
69334+ atomic_inc_return(&delim_key_version););
69335+
69336+ left->right->left = NULL;
69337+ left->right = right;
69338+ ZF_SET(left, JNODE_RIGHT_CONNECTED);
69339+ } else
69340+ /*
69341+ * there is a race condition in renew_sibling_link()
69342+ * and the assertions below check that it is the only
69343+ * one there. Thread T1 calls renew_sibling_link() without
69344+ * the GN_NO_ALLOC flag. zlook() doesn't find the neighbor
69345+ * node, but before T1 gets to
69346+ * link_left_and_right(), another thread T2 creates the
69347+ * neighbor node and connects it. The check for
69348+ * left->right == NULL above protects T1 from
69349+ * overwriting the correct left->right pointer installed
69350+ * by T2.
69351+ */
69352+ assert("nikita-3302",
69353+ right == NULL || left->right == right);
69354+ }
69355+ if (right != NULL) {
69356+ if (right->left == NULL) {
69357+ right->left = left;
69358+ ZF_SET(right, JNODE_LEFT_CONNECTED);
69359+
69360+ ON_DEBUG(right->left_version =
69361+ atomic_inc_return(&delim_key_version);
69362+ );
69363+
69364+ } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
69365+ && right->left != left) {
69366+
69367+ ON_DEBUG(right->left->right_version =
69368+ atomic_inc_return(&delim_key_version);
69369+ right->left_version =
69370+ atomic_inc_return(&delim_key_version););
69371+
69372+ right->left->right = NULL;
69373+ right->left = left;
69374+ ZF_SET(right, JNODE_LEFT_CONNECTED);
69375+
69376+ } else
69377+ assert("nikita-3303",
69378+ left == NULL || right->left == left);
69379+ }
69380+ assert("nikita-3275", check_sibling_list(left));
69381+ assert("nikita-3275", check_sibling_list(right));
69382+}
69383+
69384+/* Audited by: umka (2002.06.14) */
69385+static void link_znodes(znode * first, znode * second, int to_left)
69386+{
69387+ if (to_left)
69388+ link_left_and_right(second, first);
69389+ else
69390+ link_left_and_right(first, second);
69391+}
69392+
69393+/* get the next unit position of a coord (to the left or to the right,
69394+ depending on the GN_GO_LEFT bit in flags) in the horizontal direction,
69395+ even across a node boundary. Should be called under the tree lock, which
69396+ protects the nonexistence of a sibling link on the parent level when
69397+ lock_side_neighbor() fails with -ENOENT. */
69398+static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
69399+{
69400+ int ret;
69401+ znode *node;
69402+ reiser4_tree *tree;
69403+
69404+ assert("umka-243", coord != NULL);
69405+ assert("umka-244", handle != NULL);
69406+ assert("zam-1069", handle->node == NULL);
69407+
69408+ ret =
69409+ (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
69410+ coord_next_unit(coord);
69411+ if (!ret)
69412+ return 0;
69413+
69414+ ret =
69415+ lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
69416+ if (ret)
69417+ return ret;
69418+
69419+ node = handle->node;
69420+ tree = znode_get_tree(node);
69421+ write_unlock_tree(tree);
69422+
69423+ coord_init_zero(coord);
69424+
69425+ /* We avoid a synchronous read here if the GN_ASYNC flag is set. */
69426+ if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
69427+ ret = jstartio(ZJNODE(handle->node));
69428+ if (!ret)
69429+ ret = -E_REPEAT;
69430+ goto error_locked;
69431+ }
69432+
69433+ /* the corresponding zrelse() should be called by the clients of
69434+ far_next_coord() at the place where this node gets unlocked. */
69435+ ret = zload(handle->node);
69436+ if (ret)
69437+ goto error_locked;
69438+
69439+ if (flags & GN_GO_LEFT)
69440+ coord_init_last_unit(coord, node);
69441+ else
69442+ coord_init_first_unit(coord, node);
69443+
69444+ if (0) {
69445+ error_locked:
69446+ longterm_unlock_znode(handle);
69447+ }
69448+ write_lock_tree(tree);
69449+ return ret;
69450+}
69451+
69452+/* Very significant function which performs a step in the horizontal
69453+ direction when a sibling pointer is not available. Actually, it is the
69454+ only function which does this.
69455+ Note: this function does not restore locking status at exit; the
69456+ caller must take care of proper unlocking and zrelse-ing. */
69457+static int
69458+renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
69459+ tree_level level, int flags, int *nr_locked)
69460+{
69461+ int ret;
69462+ int to_left = flags & GN_GO_LEFT;
69463+ reiser4_block_nr da;
69464+ /* parent of the neighbor node; it is set to the child's parent until
69465+ we detect that child and neighbor do not share one parent */
69466+ znode *side_parent = coord->node;
69467+ reiser4_tree *tree = znode_get_tree(child);
69468+ znode *neighbor = NULL;
69469+
69470+ assert("umka-245", coord != NULL);
69471+ assert("umka-246", handle != NULL);
69472+ assert("umka-247", child != NULL);
69473+ assert("umka-303", tree != NULL);
69474+
69475+ init_lh(handle);
69476+ write_lock_tree(tree);
69477+ ret = far_next_coord(coord, handle, flags);
69478+
69479+ if (ret) {
69480+ if (ret != -ENOENT) {
69481+ write_unlock_tree(tree);
69482+ return ret;
69483+ }
69484+ } else {
69485+ item_plugin *iplug;
69486+
69487+ if (handle->node != NULL) {
69488+ (*nr_locked)++;
69489+ side_parent = handle->node;
69490+ }
69491+
69492+ /* does the coord object point to an internal item? We do not
69493+ support sibling pointers between znodes for formatted and
69494+ unformatted nodes and return -E_NO_NEIGHBOR in that case. */
69495+ iplug = item_plugin_by_coord(coord);
69496+ if (!item_is_internal(coord)) {
69497+ link_znodes(child, NULL, to_left);
69498+ write_unlock_tree(tree);
69499+ /* we know there can't be formatted neighbor */
69500+ return RETERR(-E_NO_NEIGHBOR);
69501+ }
69502+ write_unlock_tree(tree);
69503+
69504+ iplug->s.internal.down_link(coord, NULL, &da);
69505+
69506+ if (flags & GN_NO_ALLOC) {
69507+ neighbor = zlook(tree, &da);
69508+ } else {
69509+ neighbor =
69510+ zget(tree, &da, side_parent, level,
69511+ reiser4_ctx_gfp_mask_get());
69512+ }
69513+
69514+ if (IS_ERR(neighbor)) {
69515+ ret = PTR_ERR(neighbor);
69516+ return ret;
69517+ }
69518+
69519+ if (neighbor)
69520+ /* update delimiting keys */
69521+ set_child_delimiting_keys(coord->node, coord, neighbor);
69522+
69523+ write_lock_tree(tree);
69524+ }
69525+
69526+ if (likely(neighbor == NULL ||
69527+ (znode_get_level(child) == znode_get_level(neighbor)
69528+ && child != neighbor)))
69529+ link_znodes(child, neighbor, to_left);
69530+ else {
69531+ warning("nikita-3532",
69532+ "Sibling nodes on the different levels: %i != %i\n",
69533+ znode_get_level(child), znode_get_level(neighbor));
69534+ ret = RETERR(-EIO);
69535+ }
69536+
69537+ write_unlock_tree(tree);
69538+
69539+ /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */
69540+ if (neighbor != NULL && (flags & GN_NO_ALLOC))
69541+ /* atomic_dec(&ZJNODE(neighbor)->x_count); */
69542+ zput(neighbor);
69543+
69544+ return ret;
69545+}
69546+
69547+/* This function is for establishing of one side relation. */
69548+/* Audited by: umka (2002.06.14) */
69549+static int connect_one_side(coord_t * coord, znode * node, int flags)
69550+{
69551+ coord_t local;
69552+ lock_handle handle;
69553+ int nr_locked;
69554+ int ret;
69555+
69556+ assert("umka-248", coord != NULL);
69557+ assert("umka-249", node != NULL);
69558+
69559+ coord_dup_nocheck(&local, coord);
69560+
69561+ init_lh(&handle);
69562+
69563+ ret =
69564+ renew_sibling_link(&local, &handle, node, znode_get_level(node),
69565+ flags | GN_NO_ALLOC, &nr_locked);
69566+
69567+ if (handle.node != NULL) {
69568+ /* complementary operations for zload() and lock() in far_next_coord() */
69569+ zrelse(handle.node);
69570+ longterm_unlock_znode(&handle);
69571+ }
69572+
69573+ /* we catch error codes which are not interesting for us because we
69574+ run renew_sibling_link() only for znode connection. */
69575+ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
69576+ return 0;
69577+
69578+ return ret;
69579+}
69580+
69581+/* if @child is not in `connected' state, performs hash searches for left and
69582+ right neighbor nodes and establishes horizontal sibling links */
69583+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69584+int connect_znode(coord_t * parent_coord, znode * child)
69585+{
69586+ reiser4_tree *tree = znode_get_tree(child);
69587+ int ret = 0;
69588+
69589+ assert("zam-330", parent_coord != NULL);
69590+ assert("zam-331", child != NULL);
69591+ assert("zam-332", parent_coord->node != NULL);
69592+ assert("umka-305", tree != NULL);
69593+
69594+ /* it is trivial to `connect' root znode because it can't have
69595+ neighbors */
69596+ if (znode_above_root(parent_coord->node)) {
69597+ child->left = NULL;
69598+ child->right = NULL;
69599+ ZF_SET(child, JNODE_LEFT_CONNECTED);
69600+ ZF_SET(child, JNODE_RIGHT_CONNECTED);
69601+
69602+ ON_DEBUG(child->left_version =
69603+ atomic_inc_return(&delim_key_version);
69604+ child->right_version =
69605+ atomic_inc_return(&delim_key_version););
69606+
69607+ return 0;
69608+ }
69609+
69610+ /* load parent node */
69611+ coord_clear_iplug(parent_coord);
69612+ ret = zload(parent_coord->node);
69613+
69614+ if (ret != 0)
69615+ return ret;
69616+
69617+ /* protect `connected' state check by tree_lock */
69618+ read_lock_tree(tree);
69619+
69620+ if (!znode_is_right_connected(child)) {
69621+ read_unlock_tree(tree);
69622+ /* connect right (default is right) */
69623+ ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
69624+ if (ret)
69625+ goto zrelse_and_ret;
69626+
69627+ read_lock_tree(tree);
69628+ }
69629+
69630+ ret = znode_is_left_connected(child);
69631+
69632+ read_unlock_tree(tree);
69633+
69634+ if (!ret) {
69635+ ret =
69636+ connect_one_side(parent_coord, child,
69637+ GN_NO_ALLOC | GN_GO_LEFT);
69638+ } else
69639+ ret = 0;
69640+
69641+ zrelse_and_ret:
69642+ zrelse(parent_coord->node);
69643+
69644+ return ret;
69645+}
69646+
69647+/* this function is like renew_sibling_link() but allocates the neighbor node
69648+ if it doesn't exist and `connects' it. It may require making two steps in
69649+ the horizontal direction: the first to find/allocate the neighbor node,
69650+ the second to find the neighbor's neighbor in order to connect the freshly
69651+ allocated znode. */
69652+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69653+static int
69654+renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
69655+{
69656+ coord_t local;
69657+ lock_handle empty[2];
69658+ reiser4_tree *tree = znode_get_tree(node);
69659+ znode *neighbor = NULL;
69660+ int nr_locked = 0;
69661+ int ret;
69662+
69663+ assert("umka-250", coord != NULL);
69664+ assert("umka-251", node != NULL);
69665+ assert("umka-307", tree != NULL);
69666+ assert("umka-308", level <= tree->height);
69667+
69668+ /* umka (2002.06.14)
69669+ Here probably should be a check for given "level" validness.
69670+ Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
69671+ */
69672+
69673+ coord_dup(&local, coord);
69674+
69675+ ret =
69676+ renew_sibling_link(&local, &empty[0], node, level,
69677+ flags & ~GN_NO_ALLOC, &nr_locked);
69678+ if (ret)
69679+ goto out;
69680+
69681+ /* tree lock is not needed here because we keep parent node(s) locked
69682+ and reference to neighbor znode incremented */
69683+ neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
69684+
69685+ read_lock_tree(tree);
69686+ ret = znode_is_connected(neighbor);
69687+ read_unlock_tree(tree);
69688+ if (ret) {
69689+ ret = 0;
69690+ goto out;
69691+ }
69692+
69693+ ret =
69694+ renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
69695+ flags | GN_NO_ALLOC, &nr_locked);
69696+ /* second renew_sibling_link() call is used for znode connection only,
69697+ so we can live with these errors */
69698+ if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
69699+ ret = 0;
69700+
69701+ out:
69702+
69703+ for (--nr_locked; nr_locked >= 0; --nr_locked) {
69704+ zrelse(empty[nr_locked].node);
69705+ longterm_unlock_znode(&empty[nr_locked]);
69706+ }
69707+
69708+ if (neighbor != NULL)
69709+ /* decrement znode reference counter without actually
69710+ releasing it. */
69711+ atomic_dec(&ZJNODE(neighbor)->x_count);
69712+
69713+ return ret;
69714+}
69715+
69716+/*
69717+ reiser4_get_neighbor() -- lock node's neighbor.
69718+
69719+ reiser4_get_neighbor() locks the node's neighbor (left or right one,
69720+ depending on the given parameter) using the sibling link to it. If the
69721+ sibling link is not available (i.e. the neighbor znode is not in cache)
69722+ and the flags allow reading blocks, we go one level up for information
69723+ about the neighbor's disk address. We lock the node's parent; if it is
69724+ the common parent of both 'node' and its neighbor, the neighbor's disk
69725+ address is in the next (to the left or right) down link from the link
69726+ pointing to the original node. If not, we need to lock the parent's
69727+ neighbor, read its content and take the first (last) downlink with the
69728+ neighbor's disk address. Locking the parent's neighbor can be done via
69729+ its sibling link and lock_neighbor() if that link exists; otherwise we
69730+ go a level up again until we find a common parent or a valid sibling
69731+ link, then go down allocating/connecting/locking/reading nodes until the neighbor of the first one is locked.
69732+
69733+ @neighbor: result lock handle,
69734+ @node: a node which we lock neighbor of,
69735+ @lock_mode: lock mode {LM_READ, LM_WRITE},
69736+ @flags: logical OR of {GN_*} (see description above) subset.
69737+
69738+ @return: 0 on success, a negative value if the lock was impossible due to
69739+ an error or the lack of a neighbor node.
69740+*/
69741+
69742+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69743+int
69744+reiser4_get_neighbor(lock_handle * neighbor, znode * node,
69745+ znode_lock_mode lock_mode, int flags)
69746+{
69747+ reiser4_tree *tree = znode_get_tree(node);
69748+ lock_handle path[REAL_MAX_ZTREE_HEIGHT];
69749+
69750+ coord_t coord;
69751+
69752+ tree_level base_level;
69753+ tree_level h = 0;
69754+ int ret;
69755+
69756+ assert("umka-252", tree != NULL);
69757+ assert("umka-253", neighbor != NULL);
69758+ assert("umka-254", node != NULL);
69759+
69760+ base_level = znode_get_level(node);
69761+
69762+ assert("umka-310", base_level <= tree->height);
69763+
69764+ coord_init_zero(&coord);
69765+
69766+ again:
69767+	/* first, we try the simple lock_side_neighbor(), which requires an
69768+	   existing sibling link */
69769+ read_lock_tree(tree);
69770+ ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
69771+ read_unlock_tree(tree);
69772+ if (!ret) {
69773+ /* load znode content if it was specified */
69774+ if (flags & GN_LOAD_NEIGHBOR) {
69775+ ret = zload(node);
69776+ if (ret)
69777+ longterm_unlock_znode(neighbor);
69778+ }
69779+ return ret;
69780+ }
69781+
69782+ /* only -ENOENT means we may look upward and try to connect
69783+ @node with its neighbor (if @flags allow us to do it) */
69784+ if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
69785+ return ret;
69786+
69787+	/* before establishing a sibling link we lock the parent node; this is
69788+	   required for renew_neighbor() to work. */
69789+ init_lh(&path[0]);
69790+ ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
69791+ if (ret)
69792+ return ret;
69793+ if (znode_above_root(path[0].node)) {
69794+ longterm_unlock_znode(&path[0]);
69795+ return RETERR(-E_NO_NEIGHBOR);
69796+ }
69797+
69798+ while (1) {
69799+ znode *child = (h == 0) ? node : path[h - 1].node;
69800+ znode *parent = path[h].node;
69801+
69802+ ret = zload(parent);
69803+ if (ret)
69804+ break;
69805+
69806+ ret = find_child_ptr(parent, child, &coord);
69807+
69808+ if (ret) {
69809+ zrelse(parent);
69810+ break;
69811+ }
69812+
69813+ /* try to establish missing sibling link */
69814+ ret = renew_neighbor(&coord, child, h + base_level, flags);
69815+
69816+ zrelse(parent);
69817+
69818+ switch (ret) {
69819+ case 0:
69820+			/* unlocking the parent znode prevents a simple
69821+			   deadlock situation */
69822+ done_lh(&path[h]);
69823+
69824+			/* depending on the tree level we are on, we repeat
69825+			   the first locking attempt ... */
69826+ if (h == 0)
69827+ goto again;
69828+
69829+			/* ... or retry establishing the sibling link one
69830+			   level below. */
69831+ --h;
69832+ break;
69833+
69834+ case -ENOENT:
69835+ /* sibling link is not available -- we go
69836+ upward. */
69837+ init_lh(&path[h + 1]);
69838+ ret =
69839+ reiser4_get_parent(&path[h + 1], parent,
69840+ ZNODE_READ_LOCK);
69841+ if (ret)
69842+ goto fail;
69843+ ++h;
69844+ if (znode_above_root(path[h].node)) {
69845+ ret = RETERR(-E_NO_NEIGHBOR);
69846+ goto fail;
69847+ }
69848+ break;
69849+
69850+ case -E_DEADLOCK:
69851+			/* there was a lock request from a high-priority
69852+			   locker. If possible, we unlock the last parent
69853+			   node and re-lock it. */
69854+ for (; reiser4_check_deadlock(); h--) {
69855+ done_lh(&path[h]);
69856+ if (h == 0)
69857+ goto fail;
69858+ }
69859+
69860+ break;
69861+
69862+ default: /* other errors. */
69863+ goto fail;
69864+ }
69865+ }
69866+ fail:
69867+ ON_DEBUG(check_lock_node_data(node));
69868+ ON_DEBUG(check_lock_data());
69869+
69870+ /* unlock path */
69871+ do {
69872+ /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
69873+ fail; path[0] is already done_lh-ed, therefore
69874+ longterm_unlock_znode(&path[h]); is not applicable */
69875+ done_lh(&path[h]);
69876+ --h;
69877+ } while (h + 1 != 0);
69878+
69879+ return ret;
69880+}
69881+
69882+/* remove node from sibling list */
69883+/* Audited by: umka (2002.06.14) */
69884+void sibling_list_remove(znode * node)
69885+{
69886+ reiser4_tree *tree;
69887+
69888+	assert("umka-255", node != NULL);
69889+	tree = znode_get_tree(node);
69890+ assert_rw_write_locked(&(tree->tree_lock));
69891+ assert("nikita-3275", check_sibling_list(node));
69892+
69893+ write_lock_dk(tree);
69894+ if (znode_is_right_connected(node) && node->right != NULL &&
69895+ znode_is_left_connected(node) && node->left != NULL) {
69896+ assert("zam-32245",
69897+ keyeq(znode_get_rd_key(node),
69898+ znode_get_ld_key(node->right)));
69899+ znode_set_rd_key(node->left, znode_get_ld_key(node->right));
69900+ }
69901+ write_unlock_dk(tree);
69902+
69903+ if (znode_is_right_connected(node) && node->right != NULL) {
69904+ assert("zam-322", znode_is_left_connected(node->right));
69905+ node->right->left = node->left;
69906+ ON_DEBUG(node->right->left_version =
69907+ atomic_inc_return(&delim_key_version);
69908+ );
69909+ }
69910+ if (znode_is_left_connected(node) && node->left != NULL) {
69911+ assert("zam-323", znode_is_right_connected(node->left));
69912+ node->left->right = node->right;
69913+ ON_DEBUG(node->left->right_version =
69914+ atomic_inc_return(&delim_key_version);
69915+ );
69916+ }
69917+
69918+ ZF_CLR(node, JNODE_LEFT_CONNECTED);
69919+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
69920+ ON_DEBUG(node->left = node->right = NULL;
69921+ node->left_version = atomic_inc_return(&delim_key_version);
69922+ node->right_version = atomic_inc_return(&delim_key_version););
69923+ assert("nikita-3276", check_sibling_list(node));
69924+}
69925+
69926+/* disconnect node from sibling list */
69927+void sibling_list_drop(znode * node)
69928+{
69929+ znode *right;
69930+ znode *left;
69931+
69932+ assert("nikita-2464", node != NULL);
69933+ assert("nikita-3277", check_sibling_list(node));
69934+
69935+ right = node->right;
69936+ if (right != NULL) {
69937+ assert("nikita-2465", znode_is_left_connected(right));
69938+ right->left = NULL;
69939+ ON_DEBUG(right->left_version =
69940+ atomic_inc_return(&delim_key_version);
69941+ );
69942+ }
69943+ left = node->left;
69944+ if (left != NULL) {
69945+ assert("zam-323", znode_is_right_connected(left));
69946+ left->right = NULL;
69947+ ON_DEBUG(left->right_version =
69948+ atomic_inc_return(&delim_key_version);
69949+ );
69950+ }
69951+ ZF_CLR(node, JNODE_LEFT_CONNECTED);
69952+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
69953+ ON_DEBUG(node->left = node->right = NULL;
69954+ node->left_version = atomic_inc_return(&delim_key_version);
69955+ node->right_version = atomic_inc_return(&delim_key_version););
69956+}
69957+
69958+/* Insert a new node into the sibling list. Regular balancing inserts the new
69959+   node after (to the right of) an existing, locked node (@before). The one
69960+   exception is adding a new tree root node; @before should be NULL then. */
69961+void sibling_list_insert_nolock(znode * new, znode * before)
69962+{
69963+ assert("zam-334", new != NULL);
69964+ assert("nikita-3298", !znode_is_left_connected(new));
69965+ assert("nikita-3299", !znode_is_right_connected(new));
69966+ assert("nikita-3300", new->left == NULL);
69967+ assert("nikita-3301", new->right == NULL);
69968+ assert("nikita-3278", check_sibling_list(new));
69969+ assert("nikita-3279", check_sibling_list(before));
69970+
69971+ if (before != NULL) {
69972+ assert("zam-333", znode_is_connected(before));
69973+ new->right = before->right;
69974+ new->left = before;
69975+ ON_DEBUG(new->right_version =
69976+ atomic_inc_return(&delim_key_version);
69977+ new->left_version =
69978+ atomic_inc_return(&delim_key_version););
69979+ if (before->right != NULL) {
69980+ before->right->left = new;
69981+ ON_DEBUG(before->right->left_version =
69982+ atomic_inc_return(&delim_key_version);
69983+ );
69984+ }
69985+ before->right = new;
69986+ ON_DEBUG(before->right_version =
69987+ atomic_inc_return(&delim_key_version);
69988+ );
69989+ } else {
69990+ new->right = NULL;
69991+ new->left = NULL;
69992+ ON_DEBUG(new->right_version =
69993+ atomic_inc_return(&delim_key_version);
69994+ new->left_version =
69995+ atomic_inc_return(&delim_key_version););
69996+ }
69997+ ZF_SET(new, JNODE_LEFT_CONNECTED);
69998+ ZF_SET(new, JNODE_RIGHT_CONNECTED);
69999+ assert("nikita-3280", check_sibling_list(new));
70000+ assert("nikita-3281", check_sibling_list(before));
70001+}
70002+
70003+/*
70004+ Local variables:
70005+ c-indentation-style: "K&R"
70006+ mode-name: "LC"
70007+ c-basic-offset: 8
70008+ tab-width: 8
70009+ fill-column: 80
70010+ End:
70011+*/
70012diff -urN linux-2.6.23.orig/fs/reiser4/tree_walk.h linux-2.6.23/fs/reiser4/tree_walk.h
70013--- linux-2.6.23.orig/fs/reiser4/tree_walk.h 1970-01-01 03:00:00.000000000 +0300
70014+++ linux-2.6.23/fs/reiser4/tree_walk.h 2007-12-04 16:49:30.000000000 +0300
70015@@ -0,0 +1,125 @@
70016+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
70017+
70018+/* definitions of reiser4 tree walk functions */
70019+
70020+#ifndef __FS_REISER4_TREE_WALK_H__
70021+#define __FS_REISER4_TREE_WALK_H__
70022+
70023+#include "debug.h"
70024+#include "forward.h"
70025+
70026+/* establishes horizontal links between cached znodes */
70027+int connect_znode(coord_t * coord, znode * node);
70028+
70029+/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
70030+ have the following common arguments:
70031+
70032+ return codes:
70033+
70034+ @return : 0 - OK,
70035+
70036+ZAM-FIXME-HANS: wrong return code name. Change them all.
70037+ -ENOENT - the neighbor is not in the cache, which is detected by the
70038+            absence of a sibling link.
70039+
70040+ -E_NO_NEIGHBOR - we are sure that the neighbor (or parent) node cannot
70041+            be found (because we are the left-/right-most node of the
70042+            tree, for example). This return code is also used by
70043+            reiser4_get_parent() when we see no parent link -- it
70044+            means that our node is the root node.
70045+
70046+ -E_DEADLOCK - deadlock detected (a request from a high-priority process
70047+            was received); other error codes conform to
70048+            /usr/include/asm/errno.h .
70049+*/
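
A minimal caller sketch illustrating this return-code contract. Only
reiser4_get_neighbor(), init_lh(), ZNODE_READ_LOCK and the error codes above
come from this patch; the wrapper function itself is hypothetical.

static int example_try_lock_right_neighbor(lock_handle *result, znode *node)
{
	int ret;

	init_lh(result);
	/* flags == 0: do not walk upper levels, so -ENOENT can surface */
	ret = reiser4_get_neighbor(result, node, ZNODE_READ_LOCK, 0);
	switch (ret) {
	case 0:
		/* neighbor is locked; caller must eventually done_lh() */
		return 0;
	case -ENOENT:
		/* neighbor not in cache; a retry with
		 * GN_CAN_USE_UPPER_LEVELS would try to connect it */
		return ret;
	case -E_NO_NEIGHBOR:
		/* no neighbor exists: we are at the edge of the tree */
		return ret;
	default:
		/* -E_DEADLOCK or an ordinary errno value */
		return ret;
	}
}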
70050+
70051+int
70052+reiser4_get_parent_flags(lock_handle * result, znode * node,
70053+ znode_lock_mode mode, int flags);
70054+
70055+/* bits definition for reiser4_get_neighbor function `flags' arg. */
70056+typedef enum {
70057+	/* If the sibling pointer is NULL, this flag allows get_neighbor() to
70058+	 * try to find a not-allocated, not-connected neighbor by going
70059+	 * through the upper levels */
70060+ GN_CAN_USE_UPPER_LEVELS = 0x1,
70061+ /* locking left neighbor instead of right one */
70062+ GN_GO_LEFT = 0x2,
70063+ /* automatically load neighbor node content */
70064+ GN_LOAD_NEIGHBOR = 0x4,
70065+ /* return -E_REPEAT if can't lock */
70066+ GN_TRY_LOCK = 0x8,
70067+	/* used internally in tree_walk.c; causes renew_sibling to not
70068+	   allocate a neighbor znode, but only search the znode cache for it */
70069+ GN_NO_ALLOC = 0x10,
70070+ /* do not go across atom boundaries */
70071+ GN_SAME_ATOM = 0x20,
70072+	/* allow locking of not-connected nodes */
70073+ GN_ALLOW_NOT_CONNECTED = 0x40,
70074+ /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */
70075+ GN_ASYNC = 0x80
70076+} znode_get_neigbor_flags;
70077+
70078+/* A commonly used wrapper for reiser4_get_parent_flags(). */
70079+static inline int reiser4_get_parent(lock_handle * result, znode * node,
70080+ znode_lock_mode mode)
70081+{
70082+ return reiser4_get_parent_flags(result, node, mode,
70083+ GN_ALLOW_NOT_CONNECTED);
70084+}
70085+
70086+int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
70087+ znode_lock_mode lock_mode, int flags);
70088+
70089+/* wrappers for the most common usages of reiser4_get_neighbor() */
70090+static inline int
70091+reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
70092+ int flags)
70093+{
70094+ return reiser4_get_neighbor(result, node, lock_mode,
70095+ flags | GN_GO_LEFT);
70096+}
70097+
70098+static inline int
70099+reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
70100+ int flags)
70101+{
70102+ ON_DEBUG(check_lock_node_data(node));
70103+ ON_DEBUG(check_lock_data());
70104+ return reiser4_get_neighbor(result, node, lock_mode,
70105+ flags & (~GN_GO_LEFT));
70106+}
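
For illustration, a sketch of the usual calling discipline with these
wrappers. The caller function is hypothetical; init_lh(), zload(), zrelse()
and done_lh() are the same helpers used elsewhere in this patch.

static int example_visit_left_neighbor(znode *node)
{
	lock_handle lh;
	int ret;

	init_lh(&lh);
	ret = reiser4_get_left_neighbor(&lh, node, ZNODE_READ_LOCK,
					GN_CAN_USE_UPPER_LEVELS);
	if (ret)
		return ret;	/* -E_NO_NEIGHBOR, -E_DEADLOCK, ... */

	ret = zload(lh.node);	/* pin node content before reading it */
	if (ret == 0) {
		/* ... inspect lh.node here: it is locked and loaded ... */
		zrelse(lh.node);
	}
	done_lh(&lh);		/* always drop the long-term lock */
	return ret;
}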
70107+
70108+extern void sibling_list_remove(znode * node);
70109+extern void sibling_list_drop(znode * node);
70110+extern void sibling_list_insert_nolock(znode * new, znode * before);
70111+extern void link_left_and_right(znode * left, znode * right);
70112+
70113+/* Functions called by tree_walk() when tree_walk() ... */
70114+struct tree_walk_actor {
70115+ /* ... meets a formatted node, */
70116+ int (*process_znode) (tap_t *, void *);
70117+ /* ... meets an extent, */
70118+ int (*process_extent) (tap_t *, void *);
70119+ /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
70120+ * node or extent processing functions. */
70121+ int (*before) (void *);
70122+};
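
A sketch of how the actor might be instantiated. The callback bodies are
hypothetical; only the struct layout above is given by this header.

static int example_before(void *opaque)
{
	/* reset per-traversal state; called at the start of the walk and
	 * again after a callback returned -E_REPEAT */
	return 0;
}

static int example_process_znode(tap_t *tap, void *opaque)
{
	/* inspect the formatted node under @tap; return -E_REPEAT to
	 * restart the traversal, or a negative errno to abort */
	return 0;
}

static int example_process_extent(tap_t *tap, void *opaque)
{
	/* inspect the extent item under @tap */
	return 0;
}

static struct tree_walk_actor example_actor = {
	.process_znode  = example_process_znode,
	.process_extent = example_process_extent,
	.before         = example_before,
};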
70123+
70124+#if REISER4_DEBUG
70125+int check_sibling_list(znode * node);
70126+#else
70127+#define check_sibling_list(n) (1)
70128+#endif
70129+
70130+#endif /* __FS_REISER4_TREE_WALK_H__ */
70131+
70132+/*
70133+ Local variables:
70134+ c-indentation-style: "K&R"
70135+ mode-name: "LC"
70136+ c-basic-offset: 8
70137+ tab-width: 8
70138+ fill-column: 120
70139+ End:
70140+*/
70141diff -urN linux-2.6.23.orig/fs/reiser4/txnmgr.c linux-2.6.23/fs/reiser4/txnmgr.c
70142--- linux-2.6.23.orig/fs/reiser4/txnmgr.c 1970-01-01 03:00:00.000000000 +0300
70143+++ linux-2.6.23/fs/reiser4/txnmgr.c 2007-12-04 16:49:30.000000000 +0300
70144@@ -0,0 +1,3164 @@
70145+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70146+ * reiser4/README */
70147+
70148+/* Joshua MacDonald wrote the first draft of this code. */
70149+
70150+/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
70151+filesystem scales only as well as its worst locking design. You need to
70152+substantially restructure this code. Josh was not as experienced a programmer
70153+as you. Particularly review how the locking style differs from what you did
70154+for znodes using hi-lo priority locking, and present to me an opinion on
70155+whether the differences are well founded. */
70156+
70157+/* I cannot help but to disagree with the sentiment above. Locking of
70158+ * transaction manager is _not_ badly designed, and, at the very least, is not
70159+ * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
70160+ * locking on znodes, especially on the root node of the tree. --nikita,
70161+ * 2003.10.13 */
70162+
70163+/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The
70164+ txnmgr processes capture_block requests and manages the relationship between jnodes and
70165+ atoms through the various stages of a transcrash, and it also oversees the fusion and
70166+ capture-on-copy processes. The main difficulty with this task is maintaining a
70167+ deadlock-free lock ordering between atoms and jnodes/handles. The reason for the
70168+ difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
70169+ must be broken. The main requirement is that atom-fusion be deadlock free, so once you
70170+ hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies
70171+ that any time you check the atom-pointer of a jnode or handle and then try to lock that
70172+ atom, you must use trylock() and possibly reverse the order.
70173+
70174+ This code implements the design documented at:
70175+
70176+ http://namesys.com/txn-doc.html
70177+
70178+ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
70179+above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this
70180+topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12
70181+year old --- define all technical terms used.
70182+
70183+*/
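
The trylock-and-reverse rule described above is easiest to see as a loop.
The sketch below is a condensed restatement of jnode_get_atom(), defined
later in this file; it shows only the locking skeleton.

/* goal: hold both node->guard and atom->alock, where the legal
 * blocking order is atom first, jnode second */
static txn_atom *example_lock_jnode_and_atom(jnode *node)
{
	txn_atom *atom;

	spin_lock_jnode(node);
	while (1) {
		atom = node->atom;
		if (atom == NULL || spin_trylock_atom(atom))
			break;		/* nothing to lock, or got it */

		atomic_inc(&atom->refcount); /* keep atom alive unlocked */
		spin_unlock_jnode(node);
		spin_lock_atom(atom);	/* re-take locks in legal order */
		spin_lock_jnode(node);

		if (node->atom == atom) {
			atomic_dec(&atom->refcount);
			break;		/* pointer stable: both locks held */
		}
		/* lost a race with fusion: drop everything and retry */
		spin_unlock_jnode(node);
		atom_dec_and_unlock(atom);
		spin_lock_jnode(node);
	}
	return atom;	/* node->guard held; atom->alock held if non-NULL */
}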
70184+
70185+/* Thoughts on the external transaction interface:
70186+
70187+ In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which
70188+ creates state that lasts for the duration of a system call and is called at the start
70189+ of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
70190+ occupying the scope of a single system call. We wish to give certain applications an
70191+ interface to begin and close (commit) transactions. Since our implementation of
70192+ transactions does not yet support isolation, allowing an application to open a
70193+ transaction implies trusting it to later close the transaction. Part of the
70194+ transaction interface will be aimed at enabling that trust, but the interface for
70195+ actually using transactions is fairly narrow.
70196+
70197+ BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate
70198+ this identifier into a string that a shell-script could use, allowing you to start a
70199+ transaction by issuing a command. Once open, the transcrash should be set in the task
70200+ structure, and there should be options (I suppose) to allow it to be carried across
70201+ fork/exec. A transcrash has several options:
70202+
70203+ - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
70204+ on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to
70205+ capture on reads as well, it should set READ_FUSING.
70206+
70207+ - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
70208+ eventually close (or else the machine must crash). If the application dies an
70209+ unexpected death with an open transcrash, for example, or if it hangs for a long
70210+ duration, one solution (to avoid crashing the machine) is to simply close it anyway.
70211+ This is a dangerous option, but it is one way to solve the problem until isolated
70212+ transcrashes are available for untrusted applications.
70213+
70214+ It seems to be what databases do, though it is unclear how one avoids a DoS attack
70215+ creating a vulnerability based on resource starvation. Guaranteeing that some
70216+ minimum amount of computational resources are made available would seem more correct
70217+ than guaranteeing some amount of time. When we again have someone to code the work,
70218+ this issue should be considered carefully. -Hans
70219+
70220+ RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
70221+ many dirty blocks it expects. The reserve_blocks interface should be called at a point
70222+ where it is safe for the application to fail, because the system may not be able to
70223+ grant the allocation and the application must be able to back-out. For this reason,
70224+ the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
70225+ the application may also wish to extend the allocation after beginning its transcrash.
70226+
70227+ CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
70228+ modifications that require transaction protection. When isolated transactions are
70229+ supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a
70230+ RESERVE_BLOCKS call fails for the application, it should "abort" by calling
70231+ CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
70232+ why, for safety, the application should call RESERVE_BLOCKS before making any changes).
70233+
70234+ For actually implementing these out-of-system-call-scopped transcrashes, the
70235+ reiser4_context has a "txn_handle *trans" pointer that may be set to an open
70236+ transcrash. Currently there are no dynamically-allocated transcrashes, but there is a
70237+ "struct kmem_cache *_txnh_slab" created for that purpose in this file.
70238+*/
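
As a sketch only, the proposed interface would be used roughly as follows.
None of these calls exist; the names come straight from the prose above.

	/* hypothetical application-level usage of the proposed interface */
	txn = BEGIN_TRANSCRASH(WRITE_FUSING, /* timeout in seconds */ 600);
	if (RESERVE_BLOCKS(txn, 128) < 0) {
		/* safe failure point: nothing has been modified yet */
		CLOSE_TRANSCRASH(txn);
		return -ENOSPC;
	}
	/* ... make filesystem modifications ... */
	CLOSE_TRANSCRASH(txn);	/* becomes COMMIT/ABORT once isolation exists */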
70239+
70240+/* Extending the other system call interfaces for future transaction features:
70241+
70242+ Specialized applications may benefit from passing flags to the ordinary system call
70243+ interface such as read(), write(), or stat(). For example, the application specifies
70244+ WRITE_FUSING by default but wishes to add that a certain read() command should be
70245+ treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data
70246+ read, or the file-data read? These issues are straight-forward, but there are a lot of
70247+ them and adding the necessary flags-passing code will be tedious.
70248+
70249+ When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
70250+ flag, which specifies that although it is a read operation being requested, a
70251+ write-lock should be taken. The reason is that read-locks are shared while write-locks
70252+ are exclusive, so taking a read-lock when a later-write is known in advance will often
70253+ lead to deadlock. If a reader knows it will write later, it should issue read
70254+ requests with the RMW flag set.
70255+*/
70256+
70257+/*
70258+ The znode/atom deadlock avoidance.
70259+
70260+ FIXME(Zam): writing of this comment is in progress.
70261+
70262+ The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of long-term
70263+ atom locking, which makes the reiser4 locking scheme more complex. It had
70264+ deadlocks until we implemented deadlock avoidance algorithms. Those
70265+ deadlocks looked like this: one stopped thread waits for a long-term lock
70266+ on a znode, while the thread that owns that lock waits until fusion with
70267+ another atom is allowed.
70268+
70269+ The source of the deadlocks is an optimization: index nodes are not
70270+ captured for read. Let's prove it. Suppose we had a dumb node capturing
70271+ scheme which unconditionally captured each block before locking it.
70272+
70273+ That scheme has no deadlocks. Let's begin with a thread whose atom's stage
70274+ is ASTAGE_CAPTURE_WAIT and which waits for a znode lock. The thread cannot
70275+ be waiting for a capture, because its stage allows fusion with any atom
70276+ except those currently being committed. Atom commit cannot deadlock,
70277+ because the commit procedure does not acquire locks and does not fuse with
70278+ other atoms. Reiser4 captures right before going to sleep inside the
70279+ longterm_lock_znode() function, which means the znode we want to lock is
70280+ already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage. If we
70281+ continue the analysis, we see that no process in the sequence can be
70282+ waiting for atom fusion. Thereby there are no deadlocks of the described kind.
70283+
70284+ The capturing optimization makes the deadlocks possible. A thread can wait
70285+ for a lock whose owner did not capture that node. The lock owner's current
70286+ atom is not fused with the first atom, and it does not reach the
70287+ ASTAGE_CAPTURE_WAIT state. A deadlock is possible when that atom meets
70288+ another one which is already in ASTAGE_CAPTURE_WAIT.
70289+
70290+ The deadlock avoidance scheme includes two algorithms:
70291+
70292+ The first algorithm is used when a thread captures a node which is locked
70293+ but not captured by another thread. Such nodes are marked MISSED_IN_CAPTURE
70294+ at the moment we skip their capturing. If such a node is later captured by
70295+ a thread whose current atom is in ASTAGE_CAPTURE_WAIT, a routine which
70296+ forces all lock owners to join the current atom is executed.
70297+
70298+ The second algorithm does not allow skipping the capture of already
70299+ captured nodes.
70300+
70301+ Together, both algorithms prevent waiting for a long-term lock without
70302+ fusing with the atoms of all lock owners; such waiting is what makes
70303+ atom/znode locking deadlocks possible.
70303+*/
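
In pseudo-C, the first algorithm amounts to the following sketch.
fuse_not_fused_lock_owners() is real (declared below in this file); the
predicate and flag names marked hypothetical are illustrative only.

/* sketch only: not real code from this patch */
static void example_capture_missed(jnode *node, txn_handle *txnh)
{
	/* capture path: the node is locked by someone else and we decide
	 * to skip capturing it */
	if (node_is_locked_but_not_captured(node))	/* hypothetical */
		JF_SET(node, JNODE_MISSED_IN_CAPTURE);	/* hypothetical flag name */

	/* later, when a thread whose atom reached ASTAGE_CAPTURE_WAIT
	 * captures the same node: force every lock owner of @node to
	 * fuse with our atom (fuse_not_fused_lock_owners() is real) */
	if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE))
		fuse_not_fused_lock_owners(txnh, JZNODE(node));
}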
70304+
70305+/*
70306+ * Transactions and mmap(2).
70307+ *
70308+ * 1. Transactions are not supported for accesses through mmap(2), because
70309+ * this would effectively amount to user-level transactions whose duration
70310+ * is beyond control of the kernel.
70311+ *
70312+ * 2. That said, we still want to preserve some decency with regard to
70313+ * mmap(2). During normal write(2) call, following sequence of events
70314+ * happens:
70315+ *
70316+ * 1. page is created;
70317+ *
70318+ * 2. jnode is created, dirtied and captured into current atom.
70319+ *
70320+ * 3. extent is inserted and modified.
70321+ *
70322+ * Steps (2) and (3) take place under long term lock on the twig node.
70323+ *
70324+ * When file is accessed through mmap(2) page is always created during
70325+ * page fault.
70326+ * After this (in reiser4_readpage()->reiser4_readpage_extent()):
70327+ *
70328+ * 1. if access is made to a non-hole page, a new jnode is created (if
70329+ * necessary);
70330+ *
70331+ * 2. if access is made to a hole page, a jnode is not created (XXX
70332+ * not clear why).
70333+ *
70334+ * Also, even if page is created by write page fault it is not marked
70335+ * dirty immediately by handle_mm_fault(). Probably this is to avoid races
70336+ * with page write-out.
70337+ *
70338+ * Dirty bit installed by hardware is only transferred to the struct page
70339+ * later, when page is unmapped (in zap_pte_range(), or
70340+ * try_to_unmap_one()).
70341+ *
70342+ * So, with mmap(2) we have to handle the following irksome situations:
70343+ *
70344+ * 1. there exists modified page (clean or dirty) without jnode
70345+ *
70346+ * 2. there exists modified page (clean or dirty) with clean jnode
70347+ *
70348+ * 3. clean page which is a part of atom can be transparently modified
70349+ * at any moment through mapping without becoming dirty.
70350+ *
70351+ * (1) and (2) can lead to the out-of-memory situation: ->writepage()
70352+ * doesn't know what to do with such pages and ->sync_sb()/->writepages()
70353+ * don't see them, because these methods operate on atoms.
70354+ *
70355+ * (3) can lead to loss of data: suppose we have a dirty page whose dirty
70356+ * jnode is captured by some atom. As part of early flush (for
70357+ * example) the page was written out. The dirty bit was cleared on both
70358+ * the page and the jnode. After this the page is modified through the
70359+ * mapping, but the kernel doesn't notice and just discards the page and
70360+ * jnode as part of commit. (XXX actually it doesn't, because to reclaim
70361+ * the page ->releasepage() has to be called, and before that the dirty
70362+ * bit will be transferred to the struct page).
70363+ *
70364+ */
70365+
70366+#include "debug.h"
70367+#include "txnmgr.h"
70368+#include "jnode.h"
70369+#include "znode.h"
70370+#include "block_alloc.h"
70371+#include "tree.h"
70372+#include "wander.h"
70373+#include "ktxnmgrd.h"
70374+#include "super.h"
70375+#include "page_cache.h"
70376+#include "reiser4.h"
70377+#include "vfs_ops.h"
70378+#include "inode.h"
70379+#include "flush.h"
70380+
70381+#include <asm/atomic.h>
70382+#include <linux/types.h>
70383+#include <linux/fs.h>
70384+#include <linux/mm.h>
70385+#include <linux/slab.h>
70386+#include <linux/pagemap.h>
70387+#include <linux/writeback.h>
70388+#include <linux/swap.h> /* for totalram_pages */
70389+
70390+static void atom_free(txn_atom * atom);
70391+
70392+static int commit_txnh(txn_handle * txnh);
70393+
70394+static void wakeup_atom_waitfor_list(txn_atom * atom);
70395+static void wakeup_atom_waiting_list(txn_atom * atom);
70396+
70397+static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
70398+
70399+static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
70400+
70401+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
70402+
70403+static int capture_init_fusion(jnode * node, txn_handle * txnh,
70404+ txn_capture mode);
70405+
70406+static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
70407+
70408+static void capture_fuse_into(txn_atom * small, txn_atom * large);
70409+
70410+void reiser4_invalidate_list(struct list_head *);
70411+
70412+/* GENERIC STRUCTURES */
70413+
70414+typedef struct _txn_wait_links txn_wait_links;
70415+
70416+struct _txn_wait_links {
70417+ lock_stack *_lock_stack;
70418+ struct list_head _fwaitfor_link;
70419+ struct list_head _fwaiting_link;
70420+ int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
70421+ int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
70422+};
70423+
70424+/* FIXME: In theory, we should be using the slab cache init & destructor
70425+ methods instead of, e.g., jnode_init, etc. */
70426+static struct kmem_cache *_atom_slab = NULL;
70427+/* this is for user-visible, cross system-call transactions. */
70428+static struct kmem_cache *_txnh_slab = NULL;
70429+
70430+/**
70431+ * init_txnmgr_static - create transaction manager slab caches
70432+ *
70433+ * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module
70434+ * initialization.
70435+ */
70436+int init_txnmgr_static(void)
70437+{
70438+ assert("jmacd-600", _atom_slab == NULL);
70439+ assert("jmacd-601", _txnh_slab == NULL);
70440+
70441+ ON_DEBUG(atomic_set(&flush_cnt, 0));
70442+
70443+ _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
70444+ SLAB_HWCACHE_ALIGN |
70445+ SLAB_RECLAIM_ACCOUNT, NULL);
70446+ if (_atom_slab == NULL)
70447+ return RETERR(-ENOMEM);
70448+
70449+ _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
70450+ SLAB_HWCACHE_ALIGN, NULL);
70451+ if (_txnh_slab == NULL) {
70452+ kmem_cache_destroy(_atom_slab);
70453+ _atom_slab = NULL;
70454+ return RETERR(-ENOMEM);
70455+ }
70456+
70457+ return 0;
70458+}
70459+
70460+/**
70461+ * done_txnmgr_static - delete txn_atom and txn_handle caches
70462+ *
70463+ * This is called on reiser4 module unloading or system shutdown.
70464+ */
70465+void done_txnmgr_static(void)
70466+{
70467+ destroy_reiser4_cache(&_atom_slab);
70468+ destroy_reiser4_cache(&_txnh_slab);
70469+}
70470+
70471+/**
70472+ * init_txnmgr - initialize a new transaction manager
70473+ * @mgr: pointer to transaction manager embedded in reiser4 super block
70474+ *
70475+ * This is called on mount. Makes necessary initializations.
70476+ */
70477+void reiser4_init_txnmgr(txn_mgr *mgr)
70478+{
70479+ assert("umka-169", mgr != NULL);
70480+
70481+ mgr->atom_count = 0;
70482+ mgr->id_count = 1;
70483+ INIT_LIST_HEAD(&mgr->atoms_list);
70484+ spin_lock_init(&mgr->tmgr_lock);
70485+ mutex_init(&mgr->commit_mutex);
70486+}
70487+
70488+/**
70489+ * reiser4_done_txnmgr - stop transaction manager
70490+ * @mgr: pointer to transaction manager embedded in reiser4 super block
70491+ *
70492+ * This is called on umount. Does sanity checks.
70493+ */
70494+void reiser4_done_txnmgr(txn_mgr *mgr)
70495+{
70496+ assert("umka-170", mgr != NULL);
70497+ assert("umka-1701", list_empty_careful(&mgr->atoms_list));
70498+ assert("umka-1702", mgr->atom_count == 0);
70499+}
70500+
70501+/* Initialize a transaction handle. */
70502+/* Audited by: umka (2002.06.13) */
70503+static void txnh_init(txn_handle * txnh, txn_mode mode)
70504+{
70505+ assert("umka-171", txnh != NULL);
70506+
70507+ txnh->mode = mode;
70508+ txnh->atom = NULL;
70509+ reiser4_ctx_gfp_mask_set();
70510+ txnh->flags = 0;
70511+ spin_lock_init(&txnh->hlock);
70512+ INIT_LIST_HEAD(&txnh->txnh_link);
70513+}
70514+
70515+#if REISER4_DEBUG
70516+/* Check if a transaction handle is clean. */
70517+static int txnh_isclean(txn_handle * txnh)
70518+{
70519+ assert("umka-172", txnh != NULL);
70520+ return txnh->atom == NULL &&
70521+ LOCK_CNT_NIL(spin_locked_txnh);
70522+}
70523+#endif
70524+
70525+/* Initialize an atom. */
70526+static void atom_init(txn_atom * atom)
70527+{
70528+ int level;
70529+
70530+ assert("umka-173", atom != NULL);
70531+
70532+ memset(atom, 0, sizeof(txn_atom));
70533+
70534+ atom->stage = ASTAGE_FREE;
70535+ atom->start_time = jiffies;
70536+
70537+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
70538+ INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
70539+
70540+ INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
70541+ INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
70542+ INIT_LIST_HEAD(ATOM_WB_LIST(atom));
70543+ INIT_LIST_HEAD(&atom->inodes);
70544+ spin_lock_init(&(atom->alock));
70545+ /* list of transaction handles */
70546+ INIT_LIST_HEAD(&atom->txnh_list);
70547+ /* link to transaction manager's list of atoms */
70548+ INIT_LIST_HEAD(&atom->atom_link);
70549+ INIT_LIST_HEAD(&atom->fwaitfor_list);
70550+ INIT_LIST_HEAD(&atom->fwaiting_list);
70551+ blocknr_set_init(&atom->delete_set);
70552+ blocknr_set_init(&atom->wandered_map);
70553+
70554+ init_atom_fq_parts(atom);
70555+}
70556+
70557+#if REISER4_DEBUG
70558+/* Check if an atom is clean. */
70559+static int atom_isclean(txn_atom * atom)
70560+{
70561+ int level;
70562+
70563+ assert("umka-174", atom != NULL);
70564+
70565+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
70566+ if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
70567+ return 0;
70568+ }
70569+ }
70570+
70571+ return atom->stage == ASTAGE_FREE &&
70572+ atom->txnh_count == 0 &&
70573+ atom->capture_count == 0 &&
70574+ atomic_read(&atom->refcount) == 0 &&
70575+ (&atom->atom_link == atom->atom_link.next &&
70576+ &atom->atom_link == atom->atom_link.prev) &&
70577+ list_empty_careful(&atom->txnh_list) &&
70578+ list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
70579+ list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
70580+ list_empty_careful(ATOM_WB_LIST(atom)) &&
70581+ list_empty_careful(&atom->fwaitfor_list) &&
70582+ list_empty_careful(&atom->fwaiting_list) &&
70583+ atom_fq_parts_are_clean(atom);
70584+}
70585+#endif
70586+
70587+/* Begin a transaction in this context. Currently this uses the reiser4_context's
70588+ trans_in_ctx, which means that transaction handles are stack-allocated. Eventually
70589+ this will be extended to allow transaction handles to span several contexts. */
70590+/* Audited by: umka (2002.06.13) */
70591+void reiser4_txn_begin(reiser4_context * context)
70592+{
70593+ assert("jmacd-544", context->trans == NULL);
70594+
70595+ context->trans = &context->trans_in_ctx;
70596+
70597+ /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
70598+ transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is
70599+ stack allocated right now, but we would like to allow for dynamically allocated
70600+ transcrashes that span multiple system calls.
70601+ */
70602+ txnh_init(context->trans, TXN_WRITE_FUSING);
70603+}
70604+
70605+/* Finish a transaction handle context. */
70606+int reiser4_txn_end(reiser4_context * context)
70607+{
70608+ long ret = 0;
70609+ txn_handle *txnh;
70610+
70611+ assert("umka-283", context != NULL);
70612+ assert("nikita-3012", reiser4_schedulable());
70613+ assert("vs-24", context == get_current_context());
70614+ assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
70615+
70616+ txnh = context->trans;
70617+ if (txnh != NULL) {
70618+ if (txnh->atom != NULL)
70619+ ret = commit_txnh(txnh);
70620+ assert("jmacd-633", txnh_isclean(txnh));
70621+ context->trans = NULL;
70622+ }
70623+ return ret;
70624+}
70625+
70626+void reiser4_txn_restart(reiser4_context * context)
70627+{
70628+ reiser4_txn_end(context);
70629+ reiser4_preempt_point();
70630+ reiser4_txn_begin(context);
70631+}
70632+
70633+void reiser4_txn_restart_current(void)
70634+{
70635+ reiser4_txn_restart(get_current_context());
70636+}
70637+
70638+/* TXN_ATOM */
70639+
70640+/* Get the atom belonging to a txnh, which is not locked. Return with the txnh
70641+   locked. Locks the atom if it is not NULL. This performs the necessary
70642+   spin_trylock to break the lock-ordering cycle. May return NULL. */
70643+static txn_atom *txnh_get_atom(txn_handle * txnh)
70644+{
70645+ txn_atom *atom;
70646+
70647+ assert("umka-180", txnh != NULL);
70648+ assert_spin_not_locked(&(txnh->hlock));
70649+
70650+ while (1) {
70651+ spin_lock_txnh(txnh);
70652+ atom = txnh->atom;
70653+
70654+ if (atom == NULL)
70655+ break;
70656+
70657+ if (spin_trylock_atom(atom))
70658+ break;
70659+
70660+ atomic_inc(&atom->refcount);
70661+
70662+ spin_unlock_txnh(txnh);
70663+ spin_lock_atom(atom);
70664+ spin_lock_txnh(txnh);
70665+
70666+ if (txnh->atom == atom) {
70667+ atomic_dec(&atom->refcount);
70668+ break;
70669+ }
70670+
70671+ spin_unlock_txnh(txnh);
70672+ atom_dec_and_unlock(atom);
70673+ }
70674+
70675+ return atom;
70676+}
70677+
70678+/* Get the current atom and spinlock it if the current atom is present. May return NULL */
70679+txn_atom *get_current_atom_locked_nocheck(void)
70680+{
70681+ reiser4_context *cx;
70682+ txn_atom *atom;
70683+ txn_handle *txnh;
70684+
70685+ cx = get_current_context();
70686+ assert("zam-437", cx != NULL);
70687+
70688+ txnh = cx->trans;
70689+ assert("zam-435", txnh != NULL);
70690+
70691+ atom = txnh_get_atom(txnh);
70692+
70693+ spin_unlock_txnh(txnh);
70694+ return atom;
70695+}
70696+
70697+/* Get the atom belonging to a jnode, which is initially locked. Return with
70698+ both jnode and atom locked. This performs the necessary spin_trylock to
70699+ break the lock-ordering cycle. Assumes the jnode is already locked, and
70700+ returns NULL if atom is not set. */
70701+txn_atom *jnode_get_atom(jnode * node)
70702+{
70703+ txn_atom *atom;
70704+
70705+ assert("umka-181", node != NULL);
70706+
70707+ while (1) {
70708+ assert_spin_locked(&(node->guard));
70709+
70710+ atom = node->atom;
70711+ /* node is not in any atom */
70712+ if (atom == NULL)
70713+ break;
70714+
70715+ /* If atom is not locked, grab the lock and return */
70716+ if (spin_trylock_atom(atom))
70717+ break;
70718+
70719+		/* At least one jnode belongs to this atom; that guarantees
70720+		 * atom->refcount > 0, so we can safely increment the refcount. */
70721+ atomic_inc(&atom->refcount);
70722+ spin_unlock_jnode(node);
70723+
70724+ /* re-acquire spin locks in the right order */
70725+ spin_lock_atom(atom);
70726+ spin_lock_jnode(node);
70727+
70728+ /* check if node still points to the same atom. */
70729+ if (node->atom == atom) {
70730+ atomic_dec(&atom->refcount);
70731+ break;
70732+ }
70733+
70734+ /* releasing of atom lock and reference requires not holding
70735+ * locks on jnodes. */
70736+ spin_unlock_jnode(node);
70737+
70738+		/* We are not sure that this atom has any references other
70739+		 * than ours, so we must call the proper function, which may
70740+		 * free the atom if the last reference is released. */
70741+ atom_dec_and_unlock(atom);
70742+
70743+ /* lock jnode again for getting valid node->atom pointer
70744+ * value. */
70745+ spin_lock_jnode(node);
70746+ }
70747+
70748+ return atom;
70749+}
70750+
70751+/* Returns true if @node is dirty and part of the same atom as one of its neighbors. Used
70752+ by flush code to indicate whether the next node (in some direction) is suitable for
70753+ flushing. */
70754+int
70755+same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
70756+{
70757+ int compat;
70758+ txn_atom *atom;
70759+
70760+ assert("umka-182", node != NULL);
70761+ assert("umka-183", check != NULL);
70762+
70763+ /* Not sure what this function is supposed to do if supplied with @check that is
70764+ neither formatted nor unformatted (bitmap or so). */
70765+ assert("nikita-2373", jnode_is_znode(check)
70766+ || jnode_is_unformatted(check));
70767+
70768+ /* Need a lock on CHECK to get its atom and to check various state bits.
70769+ Don't need a lock on NODE once we get the atom lock. */
70770+	/* It is not enough to lock the two nodes and check (node->atom ==
70771+	   check->atom), because the atom could be locked and being fused at
70772+	   that moment; jnodes of an atom in that state (being fused) can
70773+	   point to different objects, but the atom is the same. */
70774+ spin_lock_jnode(check);
70775+
70776+ atom = jnode_get_atom(check);
70777+
70778+ if (atom == NULL) {
70779+ compat = 0;
70780+ } else {
70781+ compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
70782+
70783+ if (compat && jnode_is_znode(check)) {
70784+ compat &= znode_is_connected(JZNODE(check));
70785+ }
70786+
70787+ if (compat && alloc_check) {
70788+ compat &= (alloc_value == jnode_is_flushprepped(check));
70789+ }
70790+
70791+ spin_unlock_atom(atom);
70792+ }
70793+
70794+ spin_unlock_jnode(check);
70795+
70796+ return compat;
70797+}
70798+
70799+/* Decrement the atom's reference count and if it falls to zero, free it. */
70800+void atom_dec_and_unlock(txn_atom * atom)
70801+{
70802+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
70803+
70804+ assert("umka-186", atom != NULL);
70805+ assert_spin_locked(&(atom->alock));
70806+ assert("zam-1039", atomic_read(&atom->refcount) > 0);
70807+
70808+ if (atomic_dec_and_test(&atom->refcount)) {
70809+ /* take txnmgr lock and atom lock in proper order. */
70810+ if (!spin_trylock_txnmgr(mgr)) {
70811+ /* This atom should exist after we re-acquire its
70812+ * spinlock, so we increment its reference counter. */
70813+ atomic_inc(&atom->refcount);
70814+ spin_unlock_atom(atom);
70815+ spin_lock_txnmgr(mgr);
70816+ spin_lock_atom(atom);
70817+
70818+ if (!atomic_dec_and_test(&atom->refcount)) {
70819+ spin_unlock_atom(atom);
70820+ spin_unlock_txnmgr(mgr);
70821+ return;
70822+ }
70823+ }
70824+ assert_spin_locked(&(mgr->tmgr_lock));
70825+ atom_free(atom);
70826+ spin_unlock_txnmgr(mgr);
70827+ } else
70828+ spin_unlock_atom(atom);
70829+}
70830+
70831+/* Create new atom and connect it to given transaction handle. This adds the
70832+ atom to the transaction manager's list and sets its reference count to 1, an
70833+ artificial reference which is kept until it commits. We play strange games
70834+ to avoid allocation under jnode & txnh spinlocks.*/
70835+
70836+static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
70837+{
70838+ txn_atom *atom;
70839+ txn_mgr *mgr;
70840+
70841+ if (REISER4_DEBUG && rofs_tree(current_tree)) {
70842+ warning("nikita-3366", "Creating atom on rofs");
70843+ dump_stack();
70844+ }
70845+
70846+ if (*atom_alloc == NULL) {
70847+ (*atom_alloc) = kmem_cache_alloc(_atom_slab,
70848+ reiser4_ctx_gfp_mask_get());
70849+
70850+ if (*atom_alloc == NULL)
70851+ return RETERR(-ENOMEM);
70852+ }
70853+
70854+ /* and, also, txnmgr spin lock should be taken before jnode and txnh
70855+ locks. */
70856+ mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
70857+ spin_lock_txnmgr(mgr);
70858+ spin_lock_txnh(txnh);
70859+
70860+ /* Check whether new atom still needed */
70861+ if (txnh->atom != NULL) {
70862+ /* NOTE-NIKITA probably it is rather better to free
70863+ * atom_alloc here than thread it up to reiser4_try_capture() */
70864+
70865+ spin_unlock_txnh(txnh);
70866+ spin_unlock_txnmgr(mgr);
70867+
70868+ return -E_REPEAT;
70869+ }
70870+
70871+ atom = *atom_alloc;
70872+ *atom_alloc = NULL;
70873+
70874+ atom_init(atom);
70875+
70876+ assert("jmacd-17", atom_isclean(atom));
70877+
70878+ /*
70879+ * lock ordering is broken here. It is ok, as long as @atom is new
70880+ * and inaccessible for others. We can't use spin_lock_atom or
70881+ * spin_lock(&atom->alock) because they care about locking
70882+	 * dependencies. spin_trylock doesn't.
70883+ */
70884+ check_me("", spin_trylock_atom(atom));
70885+
70886+ /* add atom to the end of transaction manager's list of atoms */
70887+ list_add_tail(&atom->atom_link, &mgr->atoms_list);
70888+ atom->atom_id = mgr->id_count++;
70889+ mgr->atom_count += 1;
70890+
70891+ /* Release txnmgr lock */
70892+ spin_unlock_txnmgr(mgr);
70893+
70894+ /* One reference until it commits. */
70895+ atomic_inc(&atom->refcount);
70896+ atom->stage = ASTAGE_CAPTURE_FUSE;
70897+ atom->super = reiser4_get_current_sb();
70898+ capture_assign_txnh_nolock(atom, txnh);
70899+
70900+ spin_unlock_atom(atom);
70901+ spin_unlock_txnh(txnh);
70902+
70903+ return -E_REPEAT;
70904+}
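
A sketch of the retry pattern this function imposes on its caller. The
wrapper below is hypothetical; in this patch the real caller is
reiser4_try_capture(), which retries the whole capture on -E_REPEAT.

static int example_obtain_atom(txn_handle *txnh)
{
	txn_atom *atom_alloc = NULL;
	int ret;

	do {
		spin_lock_txnh(txnh);
		if (txnh->atom != NULL) {
			spin_unlock_txnh(txnh);
			ret = 0;	/* an atom is assigned, done */
			break;
		}
		spin_unlock_txnh(txnh);
		/* allocates outside of spinlocks; returns -E_REPEAT both
		 * on success and when somebody else installed an atom */
		ret = atom_begin_and_assign_to_txnh(&atom_alloc, txnh);
	} while (ret == -E_REPEAT);

	if (atom_alloc != NULL)
		/* the preallocated atom was never consumed */
		kmem_cache_free(_atom_slab, atom_alloc);
	return ret;
}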
70905+
70906+/* Return true if an atom is currently "open". */
70907+static int atom_isopen(const txn_atom * atom)
70908+{
70909+ assert("umka-185", atom != NULL);
70910+
70911+ return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
70912+}
70913+
70914+/* Return the number of pointers to this atom that must be updated during fusion. This
70915+ approximates the amount of work to be done. Fusion chooses the atom with fewer
70916+ pointers to fuse into the atom with more pointers. */
70917+static int atom_pointer_count(const txn_atom * atom)
70918+{
70919+ assert("umka-187", atom != NULL);
70920+
70921+ /* This is a measure of the amount of work needed to fuse this atom
70922+ * into another. */
70923+ return atom->txnh_count + atom->capture_count;
70924+}
70925+
70926+/* Called holding the atom lock, this removes the atom from the transaction manager list
70927+ and frees it. */
70928+static void atom_free(txn_atom * atom)
70929+{
70930+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
70931+
70932+ assert("umka-188", atom != NULL);
70933+ assert_spin_locked(&(atom->alock));
70934+
70935+ /* Remove from the txn_mgr's atom list */
70936+ assert_spin_locked(&(mgr->tmgr_lock));
70937+ mgr->atom_count -= 1;
70938+ list_del_init(&atom->atom_link);
70939+
70940+ /* Clean the atom */
70941+ assert("jmacd-16",
70942+ (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
70943+ atom->stage = ASTAGE_FREE;
70944+
70945+ blocknr_set_destroy(&atom->delete_set);
70946+ blocknr_set_destroy(&atom->wandered_map);
70947+
70948+ assert("jmacd-16", atom_isclean(atom));
70949+
70950+ spin_unlock_atom(atom);
70951+
70952+ kmem_cache_free(_atom_slab, atom);
70953+}
70954+
70955+static int atom_is_dotard(const txn_atom * atom)
70956+{
70957+ return time_after(jiffies, atom->start_time +
70958+ get_current_super_private()->tmgr.atom_max_age);
70959+}
70960+
70961+static int atom_can_be_committed(txn_atom * atom)
70962+{
70963+ assert_spin_locked(&(atom->alock));
70964+ assert("zam-885", atom->txnh_count > atom->nr_waiters);
70965+ return atom->txnh_count == atom->nr_waiters + 1;
70966+}
70967+
70968+/* Return true if an atom should commit now. This is determined by aging, atom
70969+ size or atom flags. */
70970+static int atom_should_commit(const txn_atom * atom)
70971+{
70972+ assert("umka-189", atom != NULL);
70973+ return
70974+ (atom->flags & ATOM_FORCE_COMMIT) ||
70975+ ((unsigned)atom_pointer_count(atom) >
70976+ get_current_super_private()->tmgr.atom_max_size)
70977+ || atom_is_dotard(atom);
70978+}
70979+
70980+/* return 1 if current atom exists and requires commit. */
70981+int current_atom_should_commit(void)
70982+{
70983+ txn_atom *atom;
70984+ int result = 0;
70985+
70986+ atom = get_current_atom_locked_nocheck();
70987+ if (atom) {
70988+ result = atom_should_commit(atom);
70989+ spin_unlock_atom(atom);
70990+ }
70991+ return result;
70992+}
70993+
70994+static int atom_should_commit_asap(const txn_atom * atom)
70995+{
70996+ unsigned int captured;
70997+ unsigned int pinnedpages;
70998+
70999+ assert("nikita-3309", atom != NULL);
71000+
71001+ captured = (unsigned)atom->capture_count;
71002+ pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
71003+
71004+ return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100);
71005+}
71006+
71007+static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
71008+{
71009+ jnode *first_dirty;
71010+
71011+ list_for_each_entry(first_dirty, head, capture_link) {
71012+ if (!(flags & JNODE_FLUSH_COMMIT)) {
71013+ /*
71014+			 * skip jnodes which "heard banshee" or have active
71015+ * I/O
71016+ */
71017+ if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
71018+ JF_ISSET(first_dirty, JNODE_WRITEBACK))
71019+ continue;
71020+ }
71021+ return first_dirty;
71022+ }
71023+ return NULL;
71024+}
71025+
71026+/* Get first dirty node from the atom's dirty_nodes[n] lists; return NULL if atom has no dirty
71027+ nodes on atom's lists */
71028+jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
71029+{
71030+ jnode *first_dirty;
71031+ tree_level level;
71032+
71033+ assert_spin_locked(&(atom->alock));
71034+
71035+ /* The flush starts from LEAF_LEVEL (=1). */
71036+ for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
71037+ if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
71038+ continue;
71039+
71040+ first_dirty =
71041+ find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
71042+ flags);
71043+ if (first_dirty)
71044+ return first_dirty;
71045+ }
71046+
71047+ /* znode-above-root is on the list #0. */
71048+ return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
71049+}
71050+
71051+static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
71052+{
71053+ jnode *cur;
71054+
71055+ assert("zam-905", atom_is_protected(atom));
71056+
71057+ cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
71058+ while (ATOM_WB_LIST(atom) != &cur->capture_link) {
71059+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
71060+
71061+ spin_lock_jnode(cur);
71062+ if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
71063+ if (JF_ISSET(cur, JNODE_DIRTY)) {
71064+ queue_jnode(fq, cur);
71065+ } else {
71066+ /* move jnode to atom's clean list */
71067+ list_move_tail(&cur->capture_link,
71068+ ATOM_CLEAN_LIST(atom));
71069+ }
71070+ }
71071+ spin_unlock_jnode(cur);
71072+
71073+ cur = next;
71074+ }
71075+}
71076+
71077+/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback
71078+ * jnodes to disk. */
71079+static int submit_wb_list(void)
71080+{
71081+ int ret;
71082+ flush_queue_t *fq;
71083+
71084+ fq = get_fq_for_current_atom();
71085+ if (IS_ERR(fq))
71086+ return PTR_ERR(fq);
71087+
71088+ dispatch_wb_list(fq->atom, fq);
71089+ spin_unlock_atom(fq->atom);
71090+
71091+ ret = reiser4_write_fq(fq, NULL, 1);
71092+ reiser4_fq_put(fq);
71093+
71094+ return ret;
71095+}
71096+
71097+/* Wait completion of all writes, re-submit atom writeback list if needed. */
71098+static int current_atom_complete_writes(void)
71099+{
71100+ int ret;
71101+
71102+	/* Each jnode on that list was modified and dirtied while it already
71103+	 * had an I/O request running. After I/O completion we have to
71104+	 * resubmit them to disk again. */
71105+ ret = submit_wb_list();
71106+ if (ret < 0)
71107+ return ret;
71108+
71109+	/* Wait for all I/O to complete */
71110+ ret = current_atom_finish_all_fq();
71111+ if (ret)
71112+ return ret;
71113+
71114+	/* Scan the wb list again; all I/O should be completed, so we
71115+	 * re-submit dirty nodes to disk */
71116+ ret = submit_wb_list();
71117+ if (ret < 0)
71118+ return ret;
71119+
71120+	/* Wait for all nodes we just submitted */
71121+ return current_atom_finish_all_fq();
71122+}
71123+
71124+#if REISER4_DEBUG
71125+
71126+static void reiser4_info_atom(const char *prefix, const txn_atom * atom)
71127+{
71128+ if (atom == NULL) {
71129+ printk("%s: no atom\n", prefix);
71130+ return;
71131+ }
71132+
71133+ printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
71134+ " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
71135+ atomic_read(&atom->refcount), atom->atom_id, atom->flags,
71136+ atom->txnh_count, atom->capture_count, atom->stage,
71137+ atom->start_time, atom->flushed);
71138+}
71139+
71140+#else /* REISER4_DEBUG */
71141+
71142+static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {}
71143+
71144+#endif /* REISER4_DEBUG */
71145+
71146+#define TOOMANYFLUSHES (1 << 13)
71147+
71148+/* Called with the atom locked and no open "active" transaction handles except
71149+   ours, this function calls flush_current_atom() until all dirty nodes are
71150+   processed. Then it initiates commit processing.
71151+
71152+   Called by the single remaining open "active" txnh, which is closing. Other
71153+   open txnhs belong to processes which wait for atom commit in the
71154+   commit_txnh() routine. They are counted as "waiters" in atom->nr_waiters.
71155+   Therefore, as long as we hold the atom lock, none of the jnodes can be
71156+   captured and/or locked.
71157+
71158+ Return value is an error code if commit fails.
71159+*/
71160+static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
71161+{
71162+ reiser4_super_info_data *sbinfo = get_current_super_private();
71163+ long ret = 0;
71164+ /* how many times jnode_flush() was called as a part of attempt to
71165+ * commit this atom. */
71166+ int flushiters;
71167+
71168+ assert("zam-888", atom != NULL && *atom != NULL);
71169+ assert_spin_locked(&((*atom)->alock));
71170+ assert("zam-887", get_current_context()->trans->atom == *atom);
71171+ assert("jmacd-151", atom_isopen(*atom));
71172+
71173+ assert("nikita-3184",
71174+ get_current_super_private()->delete_mutex_owner != current);
71175+
71176+ for (flushiters = 0;; ++flushiters) {
71177+ ret =
71178+ flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
71179+ JNODE_FLUSH_COMMIT,
71180+ LONG_MAX /* nr_to_write */ ,
71181+ nr_submitted, atom, NULL);
71182+ if (ret != -E_REPEAT)
71183+ break;
71184+
71185+		/* if the atom's dirty list contains one znode which is
71186+		   HEARD_BANSHEE and is locked, we have to allow the lock
71187+		   owner to continue and uncapture that znode */
71188+ reiser4_preempt_point();
71189+
71190+ *atom = get_current_atom_locked();
71191+ if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
71192+ warning("nikita-3176",
71193+ "Flushing like mad: %i", flushiters);
71194+ reiser4_info_atom("atom", *atom);
71195+ DEBUGON(flushiters > (1 << 20));
71196+ }
71197+ }
71198+
71199+ if (ret)
71200+ return ret;
71201+
71202+ assert_spin_locked(&((*atom)->alock));
71203+
71204+ if (!atom_can_be_committed(*atom)) {
71205+ spin_unlock_atom(*atom);
71206+ return RETERR(-E_REPEAT);
71207+ }
71208+
71209+ if ((*atom)->capture_count == 0)
71210+ goto done;
71211+
71212+	/* Up to this point we have been flushing, and after each flush we
71213+	   returned -E_REPEAT. Now we can commit. We cannot return -E_REPEAT
71214+	   at this point; the commit should be successful. */
71215+ reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
71216+ ON_DEBUG(((*atom)->committer = current));
71217+ spin_unlock_atom(*atom);
71218+
71219+ ret = current_atom_complete_writes();
71220+ if (ret)
71221+ return ret;
71222+
71223+ assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
71224+
71225+ /* isolate critical code path which should be executed by only one
71226+ * thread using tmgr mutex */
71227+ mutex_lock(&sbinfo->tmgr.commit_mutex);
71228+
71229+ ret = reiser4_write_logs(nr_submitted);
71230+ if (ret < 0)
71231+ reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
71232+
71233+	/* The atom->ovrwr_nodes list is processed with the commit mutex held
71234+	   because of bitmap nodes, which are captured in a special way in
71235+	   reiser4_pre_commit_hook_bitmap(); that way does not include
71236+	   capture_fuse_wait() as the capturing of other nodes does -- the
71237+	   commit mutex is used for transaction isolation instead. */
71238+ reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom));
71239+ mutex_unlock(&sbinfo->tmgr.commit_mutex);
71240+
71241+ reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom));
71242+ reiser4_invalidate_list(ATOM_WB_LIST(*atom));
71243+ assert("zam-927", list_empty(&(*atom)->inodes));
71244+
71245+ spin_lock_atom(*atom);
71246+ done:
71247+ reiser4_atom_set_stage(*atom, ASTAGE_DONE);
71248+ ON_DEBUG((*atom)->committer = NULL);
71249+
71250+ /* Atom's state changes, so wake up everybody waiting for this
71251+ event. */
71252+ wakeup_atom_waiting_list(*atom);
71253+
71254+	/* Decrement the "until commit" reference; at least one txnh (the
71255+	   caller) is still open. */
71256+ atomic_dec(&(*atom)->refcount);
71257+
71258+ assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
71259+ assert("jmacd-1062", (*atom)->capture_count == 0);
71260+ BUG_ON((*atom)->capture_count != 0);
71261+ assert_spin_locked(&((*atom)->alock));
71262+
71263+ return ret;
71264+}
71265+
71266+/* TXN_TXNH */
71267+
71268+/**
71269+ * force_commit_atom - commit current atom and wait commit completion
71270+ * @txnh:
71271+ *
71272+ * Commits current atom and wait commit completion; current atom and @txnh have
71273+ * to be spinlocked before call, this function unlocks them on exit.
71274+ */
71275+int force_commit_atom(txn_handle *txnh)
71276+{
71277+ txn_atom *atom;
71278+
71279+ assert("zam-837", txnh != NULL);
71280+ assert_spin_locked(&(txnh->hlock));
71281+ assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
71282+
71283+ atom = txnh->atom;
71284+
71285+ assert("zam-834", atom != NULL);
71286+ assert_spin_locked(&(atom->alock));
71287+
71288+ /*
71289+ * Set flags for atom and txnh: forcing atom commit and waiting for
71290+ * commit completion
71291+ */
71292+ txnh->flags |= TXNH_WAIT_COMMIT;
71293+ atom->flags |= ATOM_FORCE_COMMIT;
71294+
71295+ spin_unlock_txnh(txnh);
71296+ spin_unlock_atom(atom);
71297+
71298+ /* commit is here */
71299+ reiser4_txn_restart_current();
71300+ return 0;
71301+}
71302+
71303+/* Called to force commit of any outstanding atoms. @commit_all_atoms controls
71304+ * whether we commit all atoms, including new ones created after this
71305+ * function is called. */
71306+int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
71307+{
71308+ int ret;
71309+ txn_atom *atom;
71310+ txn_mgr *mgr;
71311+ txn_handle *txnh;
71312+ unsigned long start_time = jiffies;
71313+ reiser4_context *ctx = get_current_context();
71314+
71315+ assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
71316+ assert("nikita-3058", reiser4_commit_check_locks());
71317+
71318+ reiser4_txn_restart_current();
71319+
71320+ mgr = &get_super_private(super)->tmgr;
71321+
71322+ txnh = ctx->trans;
71323+
71324+ again:
71325+
71326+ spin_lock_txnmgr(mgr);
71327+
71328+ list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
71329+ spin_lock_atom(atom);
71330+
71331+		/* Commit any atom which can be committed. If @commit_all_atoms
71332+		 * is not set we commit only atoms which were created before
71333+		 * this call was started. */
71334+ if (commit_all_atoms
71335+ || time_before_eq(atom->start_time, start_time)) {
71336+ if (atom->stage <= ASTAGE_POST_COMMIT) {
71337+ spin_unlock_txnmgr(mgr);
71338+
71339+ if (atom->stage < ASTAGE_PRE_COMMIT) {
71340+ spin_lock_txnh(txnh);
71341+ /* Add force-context txnh */
71342+ capture_assign_txnh_nolock(atom, txnh);
71343+ ret = force_commit_atom(txnh);
71344+ if (ret)
71345+ return ret;
71346+ } else
71347+					/* wait for atom commit */
71348+ reiser4_atom_wait_event(atom);
71349+
71350+ goto again;
71351+ }
71352+ }
71353+
71354+ spin_unlock_atom(atom);
71355+ }
71356+
71357+#if REISER4_DEBUG
71358+ if (commit_all_atoms) {
71359+ reiser4_super_info_data *sbinfo = get_super_private(super);
71360+ spin_lock_reiser4_super(sbinfo);
71361+ assert("zam-813",
71362+ sbinfo->blocks_fake_allocated_unformatted == 0);
71363+ assert("zam-812", sbinfo->blocks_fake_allocated == 0);
71364+ spin_unlock_reiser4_super(sbinfo);
71365+ }
71366+#endif
71367+
71368+ spin_unlock_txnmgr(mgr);
71369+
71370+ return 0;
71371+}
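+/* A hedged usage sketch: the @commit_all_atoms flag distinguishes a
+ * sync-style caller, which only forces atoms that existed when the call
+ * started, from a shutdown-style caller, which also commits atoms created
+ * while the call is running. The call sites shown here are hypothetical;
+ * only txnmgr_force_commit_all() itself is real.
+ *
+ *	ret = txnmgr_force_commit_all(super, 0);	(old atoms only)
+ *	ret = txnmgr_force_commit_all(super, 1);	(all atoms, incl. new)
+ */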
71372+
71373+/* check whether commit_some_atoms() can commit @atom. Locking is up to the
71374+ * caller */
71375+static int atom_is_committable(txn_atom * atom)
71376+{
71377+ return
71378+ atom->stage < ASTAGE_PRE_COMMIT &&
71379+ atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
71380+}
71381+
71382+/* called periodically from ktxnmgrd to commit old atoms. Releases ktxnmgrd spin
71383+ * lock at exit */
71384+int commit_some_atoms(txn_mgr * mgr)
71385+{
71386+ int ret = 0;
71387+ txn_atom *atom;
71388+ txn_handle *txnh;
71389+ reiser4_context *ctx;
71390+ struct list_head *pos, *tmp;
71391+
71392+ ctx = get_current_context();
71393+ assert("nikita-2444", ctx != NULL);
71394+
71395+ txnh = ctx->trans;
71396+ spin_lock_txnmgr(mgr);
71397+
71398+ /*
71399+	 * this is to avoid a gcc complaint that atom might be used
71400+ * uninitialized
71401+ */
71402+ atom = NULL;
71403+
71404+ /* look for atom to commit */
71405+ list_for_each_safe(pos, tmp, &mgr->atoms_list) {
71406+ atom = list_entry(pos, txn_atom, atom_link);
71407+ /*
71408+ * first test without taking atom spin lock, whether it is
71409+ * eligible for committing at all
71410+ */
71411+ if (atom_is_committable(atom)) {
71412+ /* now, take spin lock and re-check */
71413+ spin_lock_atom(atom);
71414+ if (atom_is_committable(atom))
71415+ break;
71416+ spin_unlock_atom(atom);
71417+ }
71418+ }
71419+
71420+ ret = (&mgr->atoms_list == pos);
71421+ spin_unlock_txnmgr(mgr);
71422+
71423+ if (ret) {
71424+ /* nothing found */
71425+ spin_unlock(&mgr->daemon->guard);
71426+ return 0;
71427+ }
71428+
71429+ spin_lock_txnh(txnh);
71430+
71431+ BUG_ON(atom == NULL);
71432+ /* Set the atom to force committing */
71433+ atom->flags |= ATOM_FORCE_COMMIT;
71434+
71435+ /* Add force-context txnh */
71436+ capture_assign_txnh_nolock(atom, txnh);
71437+
71438+ spin_unlock_txnh(txnh);
71439+ spin_unlock_atom(atom);
71440+
71441+	/* we are about to release the daemon spin lock; notify the daemon
71442+	   that it has to rescan atoms */
71443+ mgr->daemon->rescan = 1;
71444+ spin_unlock(&mgr->daemon->guard);
71445+ reiser4_txn_restart_current();
71446+ return 0;
71447+}
71448+
71449+static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
71450+{
71451+ int atom_stage;
71452+ txn_atom *atom_2;
71453+ int repeat;
71454+
71455+ assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
71456+
71457+ atom_stage = atom->stage;
71458+ repeat = 0;
71459+
71460+ if (!spin_trylock_txnmgr(tmgr)) {
71461+ atomic_inc(&atom->refcount);
71462+ spin_unlock_atom(atom);
71463+ spin_lock_txnmgr(tmgr);
71464+ spin_lock_atom(atom);
71465+ repeat = 1;
71466+ if (atom->stage != atom_stage) {
71467+ spin_unlock_txnmgr(tmgr);
71468+ atom_dec_and_unlock(atom);
71469+ return -E_REPEAT;
71470+ }
71471+ atomic_dec(&atom->refcount);
71472+ }
71473+
71474+ list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
71475+ if (atom == atom_2)
71476+ continue;
71477+ /*
71478+ * if trylock does not succeed we just do not fuse with that
71479+ * atom.
71480+ */
71481+ if (spin_trylock_atom(atom_2)) {
71482+ if (atom_2->stage < ASTAGE_PRE_COMMIT) {
71483+ spin_unlock_txnmgr(tmgr);
71484+ capture_fuse_into(atom_2, atom);
71485+ /* all locks are lost we can only repeat here */
71486+ return -E_REPEAT;
71487+ }
71488+ spin_unlock_atom(atom_2);
71489+ }
71490+ }
71491+ atom->flags |= ATOM_CANCEL_FUSION;
71492+ spin_unlock_txnmgr(tmgr);
71493+ if (repeat) {
71494+ spin_unlock_atom(atom);
71495+ return -E_REPEAT;
71496+ }
71497+ return 0;
71498+}
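+/* The trylock fallback above is a recurring pattern in this file: pin the
+ * object with a reference, drop its lock, take both locks in the canonical
+ * order (txnmgr before atom), then re-validate any state that could have
+ * changed while nothing was locked. A generic sketch, where "obj" and
+ * obj_changed() are placeholders rather than reiser4 names:
+ *
+ *	if (!spin_trylock_txnmgr(tmgr)) {
+ *		atomic_inc(&obj->refcount);	(pin against freeing)
+ *		spin_unlock_atom(obj);
+ *		spin_lock_txnmgr(tmgr);		(canonical order)
+ *		spin_lock_atom(obj);
+ *		if (obj_changed(obj))
+ *			return -E_REPEAT;	(after dropping locks/ref)
+ *		atomic_dec(&obj->refcount);	(unpin)
+ *	}
+ */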
71499+
71500+/* Calls jnode_flush for the current atom if it exists; if not, just takes
71501+   another atom and calls jnode_flush() for it. If the current transaction
71502+   handle already has an atom assigned (the current atom), we have to close
71503+   the current transaction before switching to another atom or doing anything
71504+   else with the current atom. This code tries to flush the current atom.
71505+
71506+   flush_some_atom() is called as part of the memory reclaim process. It is
71507+   invoked from balance_dirty_pages(), pdflushd, and entd.
71508+
71509+   If we can flush no nodes, the atom is committed, because this frees memory.
71510+
71511+   If the atom is too large or too old, it is committed as well.
71512+*/
71513+int
71514+flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
71515+ int flags)
71516+{
71517+ reiser4_context *ctx = get_current_context();
71518+ txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
71519+ txn_handle *txnh = ctx->trans;
71520+ txn_atom *atom;
71521+ int ret;
71522+
71523+ BUG_ON(wbc->nr_to_write == 0);
71524+ BUG_ON(*nr_submitted != 0);
71525+ assert("zam-1042", txnh != NULL);
71526+ repeat:
71527+ if (txnh->atom == NULL) {
71528+ /* current atom is not available, take first from txnmgr */
71529+ spin_lock_txnmgr(tmgr);
71530+
71531+ /* traverse the list of all atoms */
71532+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
71533+ /* lock atom before checking its state */
71534+ spin_lock_atom(atom);
71535+
71536+ /*
71537+ * we need an atom which is not being committed and
71538+			 * which has no flushers (jnode_flush() adds one flusher
71539+			 * at the beginning and subtracts one at the end).
71540+ */
71541+ if (atom->stage < ASTAGE_PRE_COMMIT &&
71542+ atom->nr_flushers == 0) {
71543+ spin_lock_txnh(txnh);
71544+ capture_assign_txnh_nolock(atom, txnh);
71545+ spin_unlock_txnh(txnh);
71546+
71547+ goto found;
71548+ }
71549+
71550+ spin_unlock_atom(atom);
71551+ }
71552+
71553+ /*
71554+		 * Write throttling is the case when no atom can be
71555+		 * flushed or committed.
71556+ */
71557+ if (!current_is_pdflush() && !wbc->nonblocking) {
71558+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
71559+ spin_lock_atom(atom);
71560+ /* Repeat the check from the above. */
71561+ if (atom->stage < ASTAGE_PRE_COMMIT
71562+ && atom->nr_flushers == 0) {
71563+ spin_lock_txnh(txnh);
71564+ capture_assign_txnh_nolock(atom, txnh);
71565+ spin_unlock_txnh(txnh);
71566+
71567+ goto found;
71568+ }
71569+ if (atom->stage <= ASTAGE_POST_COMMIT) {
71570+ spin_unlock_txnmgr(tmgr);
71571+ /*
71572+ * we just wait until atom's flusher
71573+ * makes a progress in flushing or
71574+ * committing the atom
71575+ */
71576+ reiser4_atom_wait_event(atom);
71577+ goto repeat;
71578+ }
71579+ spin_unlock_atom(atom);
71580+ }
71581+ }
71582+ spin_unlock_txnmgr(tmgr);
71583+ return 0;
71584+ found:
71585+ spin_unlock_txnmgr(tmgr);
71586+ } else
71587+ atom = get_current_atom_locked();
71588+
71589+ BUG_ON(atom->super != ctx->super);
71590+ assert("vs-35", atom->super == ctx->super);
71591+ if (start) {
71592+ spin_lock_jnode(start);
71593+ ret = (atom == start->atom) ? 1 : 0;
71594+ spin_unlock_jnode(start);
71595+ if (ret == 0)
71596+ start = NULL;
71597+ }
71598+ ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
71599+ if (ret == 0) {
71600+		/* flush_current_atom returns 0 only if it submitted nothing
71601+		   for write */
71602+ BUG_ON(*nr_submitted != 0);
71603+ if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
71604+ if (atom->capture_count < tmgr->atom_min_size &&
71605+ !(atom->flags & ATOM_CANCEL_FUSION)) {
71606+ ret = txn_try_to_fuse_small_atom(tmgr, atom);
71607+ if (ret == -E_REPEAT) {
71608+ reiser4_preempt_point();
71609+ goto repeat;
71610+ }
71611+ }
71612+ /* if early flushing could not make more nodes clean,
71613+			 * or the atom is too old or too large,
71614+			 * we force the current atom to commit */
71615+			/* wait for commit completion, but only if this
71616+			 * wouldn't stall pdflushd and the entd thread. */
71617+ if (!wbc->nonblocking && !ctx->entd)
71618+ txnh->flags |= TXNH_WAIT_COMMIT;
71619+ atom->flags |= ATOM_FORCE_COMMIT;
71620+ }
71621+ spin_unlock_atom(atom);
71622+ } else if (ret == -E_REPEAT) {
71623+ if (*nr_submitted == 0) {
71624+			/* let others who hamper flushing (those holding long-term
71625+			   locks, for instance) free the way for flush */
71626+ reiser4_preempt_point();
71627+ goto repeat;
71628+ }
71629+ ret = 0;
71630+ }
71631+/*
71632+ if (*nr_submitted > wbc->nr_to_write)
71633+ warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
71634+*/
71635+ reiser4_txn_restart(ctx);
71636+
71637+ return ret;
71638+}
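+/* The -E_REPEAT convention used above deserves a spelled-out sketch: it
+ * means "all locks were dropped and state may have changed, start over".
+ * Callers in this file retry after yielding the CPU, exactly as
+ * commit_txnh() does below; do_one_pass() is a hypothetical stand-in for
+ * any restartable transaction manager operation:
+ *
+ *	while ((ret = do_one_pass()) == -E_REPEAT)
+ *		reiser4_preempt_point();	(let lock holders progress)
+ */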
71639+
71640+/* Remove processed nodes from the given atom list (thereby removing them from the transaction). */
71641+void reiser4_invalidate_list(struct list_head *head)
71642+{
71643+ while (!list_empty(head)) {
71644+ jnode *node;
71645+
71646+ node = list_entry(head->next, jnode, capture_link);
71647+ spin_lock_jnode(node);
71648+ reiser4_uncapture_block(node);
71649+ jput(node);
71650+ }
71651+}
71652+
71653+static void init_wlinks(txn_wait_links * wlinks)
71654+{
71655+ wlinks->_lock_stack = get_current_lock_stack();
71656+ INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
71657+ INIT_LIST_HEAD(&wlinks->_fwaiting_link);
71658+ wlinks->waitfor_cb = NULL;
71659+ wlinks->waiting_cb = NULL;
71660+}
71661+
71662+/* Add the current thread to the atom's waitfor list and wait for somebody to wake us up. */
71663+void reiser4_atom_wait_event(txn_atom * atom)
71664+{
71665+ txn_wait_links _wlinks;
71666+
71667+ assert_spin_locked(&(atom->alock));
71668+ assert("nikita-3156",
71669+ lock_stack_isclean(get_current_lock_stack()) ||
71670+ atom->nr_running_queues > 0);
71671+
71672+ init_wlinks(&_wlinks);
71673+ list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
71674+ atomic_inc(&atom->refcount);
71675+ spin_unlock_atom(atom);
71676+
71677+ reiser4_prepare_to_sleep(_wlinks._lock_stack);
71678+ reiser4_go_to_sleep(_wlinks._lock_stack);
71679+
71680+ spin_lock_atom(atom);
71681+ list_del(&_wlinks._fwaitfor_link);
71682+ atom_dec_and_unlock(atom);
71683+}
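+/* Sleep/wakeup protocol sketch: a waiter enqueues its txn_wait_links on
+ * atom->fwaitfor_list under the atom lock, sleeps on its lock stack and is
+ * woken by reiser4_atom_send_event() (below). Since the function returns
+ * with the atom unlocked and wakeups only signal "something changed",
+ * callers in this file (e.g. flush_some_atom()) restart their whole lookup
+ * after waking; can_proceed() here is a placeholder condition:
+ *
+ *	repeat:
+ *	spin_lock_atom(atom);
+ *	if (!can_proceed(atom)) {
+ *		reiser4_atom_wait_event(atom);	(returns atom unlocked)
+ *		goto repeat;
+ *	}
+ */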
71684+
71685+void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage)
71686+{
71687+ assert("nikita-3535", atom != NULL);
71688+ assert_spin_locked(&(atom->alock));
71689+ assert("nikita-3536", stage <= ASTAGE_INVALID);
71690+ /* Excelsior! */
71691+ assert("nikita-3537", stage >= atom->stage);
71692+ if (atom->stage != stage) {
71693+ atom->stage = stage;
71694+ reiser4_atom_send_event(atom);
71695+ }
71696+}
71697+
71698+/* wake all threads which wait for an event */
71699+void reiser4_atom_send_event(txn_atom * atom)
71700+{
71701+ assert_spin_locked(&(atom->alock));
71702+ wakeup_atom_waitfor_list(atom);
71703+}
71704+
71705+/* Informs txn manager code that the owner of this txn_handle should wait for atom
71706+   commit completion (for example, because it does fsync(2)) */
71707+static int should_wait_commit(txn_handle * h)
71708+{
71709+ return h->flags & TXNH_WAIT_COMMIT;
71710+}
71711+
71712+typedef struct commit_data {
71713+ txn_atom *atom;
71714+ txn_handle *txnh;
71715+ long nr_written;
71716+	/* as an optimization we start committing the atom by first trying to
71717+	 * flush it a few times without switching into ASTAGE_CAPTURE_WAIT. This
71718+	 * reduces stalls due to other threads waiting for the atom in the
71719+	 * ASTAGE_CAPTURE_WAIT stage. ->preflush is the counter of these
71720+	 * preliminary flushes. */
71721+ int preflush;
71722+	/* have we waited on the atom? */
71723+ int wait;
71724+ int failed;
71725+ int wake_ktxnmgrd_up;
71726+} commit_data;
71727+
71728+/*
71729+ * Called from commit_txnh() repeatedly, until either error happens, or atom
71730+ * commits successfully.
71731+ */
71732+static int try_commit_txnh(commit_data * cd)
71733+{
71734+ int result;
71735+
71736+ assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
71737+
71738+ /* Get the atom and txnh locked. */
71739+ cd->atom = txnh_get_atom(cd->txnh);
71740+ assert("jmacd-309", cd->atom != NULL);
71741+ spin_unlock_txnh(cd->txnh);
71742+
71743+ if (cd->wait) {
71744+ cd->atom->nr_waiters--;
71745+ cd->wait = 0;
71746+ }
71747+
71748+ if (cd->atom->stage == ASTAGE_DONE)
71749+ return 0;
71750+
71751+ if (cd->failed)
71752+ return 0;
71753+
71754+ if (atom_should_commit(cd->atom)) {
71755+ /* if atom is _very_ large schedule it for commit as soon as
71756+ * possible. */
71757+ if (atom_should_commit_asap(cd->atom)) {
71758+ /*
71759+ * When atom is in PRE_COMMIT or later stage following
71760+ * invariant (encoded in atom_can_be_committed())
71761+ * holds: there is exactly one non-waiter transaction
71762+ * handle opened on this atom. When thread wants to
71763+ * wait until atom commits (for example sync()) it
71764+ * waits on atom event after increasing
71765+			 * atom->nr_waiters (see below in this function). It
71766+ * cannot be guaranteed that atom is already committed
71767+ * after receiving event, so loop has to be
71768+ * re-started. But if atom switched into PRE_COMMIT
71769+ * stage and became too large, we cannot change its
71770+ * state back to CAPTURE_WAIT (atom stage can only
71771+ * increase monotonically), hence this check.
71772+ */
71773+ if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
71774+ reiser4_atom_set_stage(cd->atom,
71775+ ASTAGE_CAPTURE_WAIT);
71776+ cd->atom->flags |= ATOM_FORCE_COMMIT;
71777+ }
71778+ if (cd->txnh->flags & TXNH_DONT_COMMIT) {
71779+ /*
71780+ * this thread (transaction handle that is) doesn't
71781+ * want to commit atom. Notify waiters that handle is
71782+ * closed. This can happen, for example, when we are
71783+ * under VFS directory lock and don't want to commit
71784+ * atom right now to avoid stalling other threads
71785+ * working in the same directory.
71786+ */
71787+
71788+ /* Wake the ktxnmgrd up if the ktxnmgrd is needed to
71789+ * commit this atom: no atom waiters and only one
71790+ * (our) open transaction handle. */
71791+ cd->wake_ktxnmgrd_up =
71792+ cd->atom->txnh_count == 1 &&
71793+ cd->atom->nr_waiters == 0;
71794+ reiser4_atom_send_event(cd->atom);
71795+ result = 0;
71796+ } else if (!atom_can_be_committed(cd->atom)) {
71797+ if (should_wait_commit(cd->txnh)) {
71798+ /* sync(): wait for commit */
71799+ cd->atom->nr_waiters++;
71800+ cd->wait = 1;
71801+ reiser4_atom_wait_event(cd->atom);
71802+ result = RETERR(-E_REPEAT);
71803+ } else {
71804+ result = 0;
71805+ }
71806+ } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
71807+ /*
71808+ * optimization: flush atom without switching it into
71809+ * ASTAGE_CAPTURE_WAIT.
71810+ *
71811+ * But don't do this for ktxnmgrd, because ktxnmgrd
71812+ * should never block on atom fusion.
71813+ */
71814+ result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
71815+ LONG_MAX, &cd->nr_written,
71816+ &cd->atom, NULL);
71817+ if (result == 0) {
71818+ spin_unlock_atom(cd->atom);
71819+ cd->preflush = 0;
71820+ result = RETERR(-E_REPEAT);
71821+			} else	/* Atom wasn't flushed
71822+ * completely. Rinse. Repeat. */
71823+ --cd->preflush;
71824+ } else {
71825+ /* We change atom state to ASTAGE_CAPTURE_WAIT to
71826+			   prevent atom fusion and count ourselves as an active
71827+ flusher */
71828+ reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
71829+ cd->atom->flags |= ATOM_FORCE_COMMIT;
71830+
71831+ result =
71832+ commit_current_atom(&cd->nr_written, &cd->atom);
71833+ if (result != 0 && result != -E_REPEAT)
71834+ cd->failed = 1;
71835+ }
71836+ } else
71837+ result = 0;
71838+
71839+#if REISER4_DEBUG
71840+ if (result == 0)
71841+ assert_spin_locked(&(cd->atom->alock));
71842+#endif
71843+
71844+ /* perfectly valid assertion, except that when atom/txnh is not locked
71845+ * fusion can take place, and cd->atom points nowhere. */
71846+ /*
71847+ assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
71848+ */
71849+ return result;
71850+}
71851+
71852+/* Called to commit a transaction handle. This decrements the atom's number of open
71853+   handles and, if it is the last handle to commit and the atom should commit, initiates
71854+   atom commit. If commit does not fail, returns the number of written blocks */
71855+static int commit_txnh(txn_handle * txnh)
71856+{
71857+ commit_data cd;
71858+ assert("umka-192", txnh != NULL);
71859+
71860+ memset(&cd, 0, sizeof cd);
71861+ cd.txnh = txnh;
71862+ cd.preflush = 10;
71863+
71864+ /* calls try_commit_txnh() until either atom commits, or error
71865+ * happens */
71866+ while (try_commit_txnh(&cd) != 0)
71867+ reiser4_preempt_point();
71868+
71869+ spin_lock_txnh(txnh);
71870+
71871+ cd.atom->txnh_count -= 1;
71872+ txnh->atom = NULL;
71873+ /* remove transaction handle from atom's list of transaction handles */
71874+ list_del_init(&txnh->txnh_link);
71875+
71876+ spin_unlock_txnh(txnh);
71877+ atom_dec_and_unlock(cd.atom);
71878+ /* if we don't want to do a commit (TXNH_DONT_COMMIT is set, probably
71879+ * because it takes time) by current thread, we do that work
71880+ * asynchronously by ktxnmgrd daemon. */
71881+ if (cd.wake_ktxnmgrd_up)
71882+ ktxnmgrd_kick(&get_current_super_private()->tmgr);
71883+
71884+ return 0;
71885+}
71886+
71887+/* TRY_CAPTURE */
71888+
71889+/* This routine attempts a single block-capture request. It may return -E_REPEAT if some
71890+ condition indicates that the request should be retried, and it may block if the
71891+ txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
71892+
71893+ This routine encodes the basic logic of block capturing described by:
71894+
71895+ http://namesys.com/v4/v4.html
71896+
71897+ Our goal here is to ensure that any two blocks that contain dependent modifications
71898+ should commit at the same time. This function enforces this discipline by initiating
71899+ fusion whenever a transaction handle belonging to one atom requests to read or write a
71900+ block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
71901+
71902+ In addition, this routine handles the initial assignment of atoms to blocks and
71903+ transaction handles. These are possible outcomes of this function:
71904+
71905+ 1. The block and handle are already part of the same atom: return immediate success
71906+
71907+ 2. The block is assigned but the handle is not: call capture_assign_txnh to assign
71908+ the handle to the block's atom.
71909+
71910+ 3. The handle is assigned but the block is not: call capture_assign_block to assign
71911+ the block to the handle's atom.
71912+
71913+ 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
71914+ to fuse atoms.
71915+
71916+ 5. Neither block nor handle are assigned: create a new atom and assign them both.
71917+
71918+ 6. A read request for a non-captured block: return immediate success.
71919+
71920+ This function acquires and releases the handle's spinlock. This function is called
71921+ under the jnode lock and if the return value is 0, it returns with the jnode lock still
71922+ held. If the return is -E_REPEAT or some other error condition, the jnode lock is
71923+   released. The external interface (reiser4_try_capture) manages re-acquiring the jnode
71924+ lock in the failure case.
71925+*/
71926+static int try_capture_block(
71927+ txn_handle * txnh, jnode * node, txn_capture mode,
71928+ txn_atom ** atom_alloc)
71929+{
71930+ txn_atom *block_atom;
71931+ txn_atom *txnh_atom;
71932+
71933+ /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */
71934+ assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
71935+
71936+ /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
71937+ * node->tree somewhere. */
71938+ assert("umka-194", txnh != NULL);
71939+ assert("umka-195", node != NULL);
71940+
71941+ /* The jnode is already locked! Being called from reiser4_try_capture(). */
71942+ assert_spin_locked(&(node->guard));
71943+ block_atom = node->atom;
71944+
71945+ /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't
71946+ let us touch the atoms themselves. */
71947+ spin_lock_txnh(txnh);
71948+ txnh_atom = txnh->atom;
71949+	/* The capture process continues into one of four branches depending on
71950+	   which of the atoms (block atom (node->atom), current atom (txnh->atom))
71951+	   exist. */
71952+ if (txnh_atom == NULL) {
71953+ if (block_atom == NULL) {
71954+ spin_unlock_txnh(txnh);
71955+ spin_unlock_jnode(node);
71956+ /* assign empty atom to the txnh and repeat */
71957+ return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
71958+ } else {
71959+ atomic_inc(&block_atom->refcount);
71960+ /* node spin-lock isn't needed anymore */
71961+ spin_unlock_jnode(node);
71962+ if (!spin_trylock_atom(block_atom)) {
71963+ spin_unlock_txnh(txnh);
71964+ spin_lock_atom(block_atom);
71965+ spin_lock_txnh(txnh);
71966+ }
71967+ /* re-check state after getting txnh and the node
71968+ * atom spin-locked */
71969+ if (node->atom != block_atom || txnh->atom != NULL) {
71970+ spin_unlock_txnh(txnh);
71971+ atom_dec_and_unlock(block_atom);
71972+ return RETERR(-E_REPEAT);
71973+ }
71974+ atomic_dec(&block_atom->refcount);
71975+ if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
71976+ (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
71977+ block_atom->txnh_count != 0))
71978+ return capture_fuse_wait(txnh, block_atom, NULL, mode);
71979+ capture_assign_txnh_nolock(block_atom, txnh);
71980+ spin_unlock_txnh(txnh);
71981+ spin_unlock_atom(block_atom);
71982+ return RETERR(-E_REPEAT);
71983+ }
71984+ } else {
71985+		/* It is time to perform a deadlock prevention check over the
71986+		   node we want to capture. It is possible this node was locked
71987+		   for read without capturing it. The optimization which allows
71988+		   this helps us keep atoms independent as long as
71989+		   possible, but it may cause lock/fuse deadlock problems.
71990+
71991+		   A number of similar deadlock situations with locked but not
71992+		   captured nodes were found. In each situation there are two
71993+		   or more threads: one of them does flushing while another one
71994+		   does routine balancing or tree lookup. The flushing thread
71995+		   (F) sleeps in a long term locking request for node (N); another
71996+		   thread (A) sleeps trying to capture some node already
71997+		   belonging to the atom of F, and F is in a state which prevents
71998+		   immediate fusion.
71999+
72000+		   Deadlocks of this kind cannot happen if node N was properly
72001+		   captured by thread A. The F thread fuses atoms before locking;
72002+		   therefore the current atom of thread F and the current atom of
72003+		   thread A become the same atom and thread A may proceed. This
72004+		   does not work if node N was not captured, because the fusion
72005+		   of atoms does not happen.
72006+
72007+		   The following scheme solves the deadlock: if
72008+		   longterm_lock_znode locks and does not capture a znode, that
72009+		   znode is marked as MISSED_IN_CAPTURE. A node marked this way
72010+		   is processed by the code below, which restores the missed
72011+		   capture and fuses the current atoms of all the node lock owners
72012+		   by calling the fuse_not_fused_lock_owners() function. */
72013+ if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
72014+ JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
72015+ if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
72016+ spin_unlock_txnh(txnh);
72017+ spin_unlock_jnode(node);
72018+ fuse_not_fused_lock_owners(txnh, JZNODE(node));
72019+ return RETERR(-E_REPEAT);
72020+ }
72021+ }
72022+ if (block_atom == NULL) {
72023+ atomic_inc(&txnh_atom->refcount);
72024+ spin_unlock_txnh(txnh);
72025+ if (!spin_trylock_atom(txnh_atom)) {
72026+ spin_unlock_jnode(node);
72027+ spin_lock_atom(txnh_atom);
72028+ spin_lock_jnode(node);
72029+ }
72030+ if (txnh->atom != txnh_atom || node->atom != NULL
72031+ || JF_ISSET(node, JNODE_IS_DYING)) {
72032+ spin_unlock_jnode(node);
72033+ atom_dec_and_unlock(txnh_atom);
72034+ return RETERR(-E_REPEAT);
72035+ }
72036+ atomic_dec(&txnh_atom->refcount);
72037+ capture_assign_block_nolock(txnh_atom, node);
72038+ spin_unlock_atom(txnh_atom);
72039+ } else {
72040+ if (txnh_atom != block_atom) {
72041+ if (mode & TXN_CAPTURE_DONT_FUSE) {
72042+ spin_unlock_txnh(txnh);
72043+ spin_unlock_jnode(node);
72044+ /* we are in a "no-fusion" mode and @node is
72045+					 * already part of another transaction. */
72046+ return RETERR(-E_NO_NEIGHBOR);
72047+ }
72048+ return capture_init_fusion(node, txnh, mode);
72049+ }
72050+ spin_unlock_txnh(txnh);
72051+ }
72052+ }
72053+ return 0;
72054+}
72055+
72056+static txn_capture
72057+build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
72058+{
72059+ txn_capture cap_mode;
72060+
72061+ assert_spin_locked(&(node->guard));
72062+
72063+ /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
72064+
72065+ if (lock_mode == ZNODE_WRITE_LOCK) {
72066+ cap_mode = TXN_CAPTURE_WRITE;
72067+ } else if (node->atom != NULL) {
72068+ cap_mode = TXN_CAPTURE_WRITE;
72069+ } else if (0 && /* txnh->mode == TXN_READ_FUSING && */
72070+ jnode_get_level(node) == LEAF_LEVEL) {
72071+ /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
72072+ /* We only need a READ_FUSING capture at the leaf level. This
72073+ is because the internal levels of the tree (twigs included)
72074+ are redundant from the point of the user that asked for a
72075+ read-fusing transcrash. The user only wants to read-fuse
72076+ atoms due to reading uncommitted data that another user has
72077+ written. It is the file system that reads/writes the
72078+ internal tree levels, the user only reads/writes leaves. */
72079+ cap_mode = TXN_CAPTURE_READ_ATOMIC;
72080+ } else {
72081+ /* In this case (read lock at a non-leaf) there's no reason to
72082+ * capture. */
72083+ /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
72084+ return 0;
72085+ }
72086+
72087+ cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
72088+ assert("nikita-3186", cap_mode != 0);
72089+ return cap_mode;
72090+}
72091+
72092+/* This is an external interface to try_capture_block(), it calls
72093+ try_capture_block() repeatedly as long as -E_REPEAT is returned.
72094+
72095+ @node: node to capture,
72096+ @lock_mode: read or write lock is used in capture mode calculation,
72097+ @flags: see txn_capture flags enumeration,
72099+
72100+ @return: 0 - node was successfully captured, -E_REPEAT - capture request
72101+ cannot be processed immediately as it was requested in flags,
72102+ < 0 - other errors.
72103+*/
72104+int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode,
72105+ txn_capture flags)
72106+{
72107+ txn_atom *atom_alloc = NULL;
72108+ txn_capture cap_mode;
72109+ txn_handle *txnh = get_current_context()->trans;
72110+ int ret;
72111+
72112+ assert_spin_locked(&(node->guard));
72113+
72114+ repeat:
72115+ if (JF_ISSET(node, JNODE_IS_DYING))
72116+ return RETERR(-EINVAL);
72117+ if (node->atom != NULL && txnh->atom == node->atom)
72118+ return 0;
72119+ cap_mode = build_capture_mode(node, lock_mode, flags);
72120+ if (cap_mode == 0 ||
72121+ (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
72122+ /* Mark this node as "MISSED". It helps in further deadlock
72123+ * analysis */
72124+ if (jnode_is_znode(node))
72125+ JF_SET(node, JNODE_MISSED_IN_CAPTURE);
72126+ return 0;
72127+ }
72128+ /* Repeat try_capture as long as -E_REPEAT is returned. */
72129+ ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
72130+ /* Regardless of non_blocking:
72131+
72132+ If ret == 0 then jnode is still locked.
72133+ If ret != 0 then jnode is unlocked.
72134+ */
72135+#if REISER4_DEBUG
72136+ if (ret == 0)
72137+ assert_spin_locked(&(node->guard));
72138+ else
72139+ assert_spin_not_locked(&(node->guard));
72140+#endif
72141+ assert_spin_not_locked(&(txnh->guard));
72142+
72143+ if (ret == -E_REPEAT) {
72144+ /* E_REPEAT implies all locks were released, therefore we need
72145+ to take the jnode's lock again. */
72146+ spin_lock_jnode(node);
72147+
72148+ /* Although this may appear to be a busy loop, it is not.
72149+ There are several conditions that cause E_REPEAT to be
72150+ returned by the call to try_capture_block, all cases
72151+ indicating some kind of state change that means you should
72152+ retry the request and will get a different result. In some
72153+ cases this could be avoided with some extra code, but
72154+ generally it is done because the necessary locks were
72155+ released as a result of the operation and repeating is the
72156+ simplest thing to do (less bug potential). The cases are:
72157+ atom fusion returns E_REPEAT after it completes (jnode and
72158+ txnh were unlocked); race conditions in assign_block,
72159+ assign_txnh, and init_fusion return E_REPEAT (trylock
72160+ failure); after going to sleep in capture_fuse_wait
72161+ (request was blocked but may now succeed). I'm not quite
72162+ sure how capture_copy works yet, but it may also return
72163+ E_REPEAT. When the request is legitimately blocked, the
72164+ requestor goes to sleep in fuse_wait, so this is not a busy
72165+ loop. */
72166+ /* NOTE-NIKITA: still don't understand:
72167+
72168+ try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
72169+
72170+ looks like busy loop?
72171+ */
72172+ goto repeat;
72173+ }
72174+
72175+ /* free extra atom object that was possibly allocated by
72176+ try_capture_block().
72177+
72178+ Do this before acquiring jnode spin lock to
72179+ minimize time spent under lock. --nikita */
72180+ if (atom_alloc != NULL) {
72181+ kmem_cache_free(_atom_slab, atom_alloc);
72182+ }
72183+
72184+ if (ret != 0) {
72185+ if (ret == -E_BLOCK) {
72186+ assert("nikita-3360",
72187+ cap_mode & TXN_CAPTURE_NONBLOCKING);
72188+ ret = -E_REPEAT;
72189+ }
72190+
72191+ /* Failure means jnode is not locked. FIXME_LATER_JMACD May
72192+ want to fix the above code to avoid releasing the lock and
72193+		   re-acquiring it, but there are cases where failure occurs
72194+ when the lock is not held, and those cases would need to be
72195+ modified to re-take the lock. */
72196+ spin_lock_jnode(node);
72197+ }
72198+
72199+ /* Jnode is still locked. */
72200+ assert_spin_locked(&(node->guard));
72201+ return ret;
72202+}
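+/* Caller-side sketch of the reiser4_try_capture() locking contract: the
+ * jnode spin lock is held on entry and is held again on exit regardless of
+ * the result, so the caller's unlock is unconditional.
+ * try_capture_page_to_invalidate() below is a real instance of this
+ * pattern; the snippet is only an illustration:
+ *
+ *	spin_lock_jnode(node);
+ *	ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
+ *	spin_unlock_jnode(node);	(valid whether ret is 0 or not)
+ */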
72203+
72204+static void release_two_atoms(txn_atom *one, txn_atom *two)
72205+{
72206+ spin_unlock_atom(one);
72207+ atom_dec_and_unlock(two);
72208+ spin_lock_atom(one);
72209+ atom_dec_and_unlock(one);
72210+}
72211+
72212+/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is
72213+ returned by that routine. The txn_capture request mode is computed here depending on
72214+ the transaction handle's type and the lock request. This is called from the depths of
72215+ the lock manager with the jnode lock held and it always returns with the jnode lock
72216+ held.
72217+*/
72218+
72219+/* fuse all 'active' atoms of lock owners of given node. */
72220+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
72221+{
72222+ lock_handle *lh;
72223+ int repeat;
72224+ txn_atom *atomh, *atomf;
72225+ reiser4_context *me = get_current_context();
72226+ reiser4_context *ctx = NULL;
72227+
72228+ assert_spin_not_locked(&(ZJNODE(node)->guard));
72229+ assert_spin_not_locked(&(txnh->hlock));
72230+
72231+ repeat:
72232+ repeat = 0;
72233+ atomh = txnh_get_atom(txnh);
72234+ spin_unlock_txnh(txnh);
72235+ assert("zam-692", atomh != NULL);
72236+
72237+ spin_lock_zlock(&node->lock);
72238+ /* inspect list of lock owners */
72239+ list_for_each_entry(lh, &node->lock.owners, owners_link) {
72240+ ctx = get_context_by_lock_stack(lh->owner);
72241+ if (ctx == me)
72242+ continue;
72243+		/* below we use two assumptions to avoid additional spin-locks
72244+		   for checking the condition:
72245+
72246+		   1) if the lock stack holds a lock, the transaction should be
72247+		   open, i.e. ctx->trans != NULL;
72248+
72249+		   2) reading the well-aligned ctx->trans->atom is atomic; if it
72250+		   equals the address of the spin-locked atomh, we conclude
72251+		   that the atoms are the same and nothing has to be captured. */
72252+ if (atomh != ctx->trans->atom) {
72253+ reiser4_wake_up(lh->owner);
72254+ repeat = 1;
72255+ break;
72256+ }
72257+ }
72258+ if (repeat) {
72259+ if (!spin_trylock_txnh(ctx->trans)) {
72260+ spin_unlock_zlock(&node->lock);
72261+ spin_unlock_atom(atomh);
72262+ goto repeat;
72263+ }
72264+ atomf = ctx->trans->atom;
72265+ if (atomf == NULL) {
72266+ capture_assign_txnh_nolock(atomh, ctx->trans);
72267+ /* release zlock lock _after_ assigning the atom to the
72268+ * transaction handle, otherwise the lock owner thread
72269+ * may unlock all znodes, exit kernel context and here
72270+ * we would access an invalid transaction handle. */
72271+ spin_unlock_zlock(&node->lock);
72272+ spin_unlock_atom(atomh);
72273+ spin_unlock_txnh(ctx->trans);
72274+ goto repeat;
72275+ }
72276+ assert("zam-1059", atomf != atomh);
72277+ spin_unlock_zlock(&node->lock);
72278+ atomic_inc(&atomh->refcount);
72279+ atomic_inc(&atomf->refcount);
72280+ spin_unlock_txnh(ctx->trans);
72281+ if (atomf > atomh) {
72282+ spin_lock_atom_nested(atomf);
72283+ } else {
72284+ spin_unlock_atom(atomh);
72285+ spin_lock_atom(atomf);
72286+ spin_lock_atom_nested(atomh);
72287+ }
72288+ if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
72289+ release_two_atoms(atomf, atomh);
72290+ goto repeat;
72291+ }
72292+ atomic_dec(&atomh->refcount);
72293+ atomic_dec(&atomf->refcount);
72294+ capture_fuse_into(atomf, atomh);
72295+ goto repeat;
72296+ }
72297+ spin_unlock_zlock(&node->lock);
72298+ spin_unlock_atom(atomh);
72299+}
72300+
72301+/* This is the interface to capture unformatted nodes via their struct page
72302+ reference. Currently it is only used in reiser4_invalidatepage */
72303+int try_capture_page_to_invalidate(struct page *pg)
72304+{
72305+ int ret;
72306+ jnode *node;
72307+
72308+ assert("umka-292", pg != NULL);
72309+ assert("nikita-2597", PageLocked(pg));
72310+
72311+ if (IS_ERR(node = jnode_of_page(pg))) {
72312+ return PTR_ERR(node);
72313+ }
72314+
72315+ spin_lock_jnode(node);
72316+ unlock_page(pg);
72317+
72318+ ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
72319+ spin_unlock_jnode(node);
72320+ jput(node);
72321+ lock_page(pg);
72322+ return ret;
72323+}
72324+
72325+/* This informs the transaction manager when a node is deleted. Add the block to the
72326+ atom's delete set and uncapture the block.
72327+
72328+VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
72329+explanations. find all the functions that use it, and unless there is some very
72330+good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
72331+move the loop to inside the function.
72332+
72333+VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times?
72334+ */
72335+void reiser4_uncapture_page(struct page *pg)
72336+{
72337+ jnode *node;
72338+ txn_atom *atom;
72339+
72340+ assert("umka-199", pg != NULL);
72341+ assert("nikita-3155", PageLocked(pg));
72342+
72343+ clear_page_dirty_for_io(pg);
72344+
72345+ reiser4_wait_page_writeback(pg);
72346+
72347+ node = jprivate(pg);
72348+ BUG_ON(node == NULL);
72349+
72350+ spin_lock_jnode(node);
72351+
72352+ atom = jnode_get_atom(node);
72353+ if (atom == NULL) {
72354+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
72355+ spin_unlock_jnode(node);
72356+ return;
72357+ }
72358+
72359+	/* We can remove a jnode from the transaction even if it is on the flush
72360+	 * queue prepped list; we only need to be sure that the flush queue is not
72361+	 * being written by reiser4_write_fq(). reiser4_write_fq() does not use the
72362+	 * atom spin lock for protection of the prepped nodes list; instead,
72363+	 * it increments the atom's nr_running_queues counter for the time
72364+	 * when the prepped list is not protected by the spin lock. Here we check
72365+	 * this counter if we want to remove the jnode from the flush queue and, if
72366+	 * the counter is not zero, wait for all reiser4_write_fq() calls for this
72367+	 * atom to complete. This is not a significant overhead. */
72368+ while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
72369+ spin_unlock_jnode(node);
72370+ /*
72371+ * at this moment we want to wait for "atom event", viz. wait
72372+ * until @node can be removed from flush queue. But
72373+ * reiser4_atom_wait_event() cannot be called with page locked,
72374+ * because it deadlocks with jnode_extent_write(). Unlock page,
72375+ * after making sure (through page_cache_get()) that it cannot
72376+ * be released from memory.
72377+ */
72378+ page_cache_get(pg);
72379+ unlock_page(pg);
72380+ reiser4_atom_wait_event(atom);
72381+ lock_page(pg);
72382+ /*
72383+		 * the page may have been detached by ->writepage()->releasepage().
72384+ */
72385+ reiser4_wait_page_writeback(pg);
72386+ spin_lock_jnode(node);
72387+ page_cache_release(pg);
72388+ atom = jnode_get_atom(node);
72389+/* VS-FIXME-HANS: improve the commenting in this function */
72390+ if (atom == NULL) {
72391+ spin_unlock_jnode(node);
72392+ return;
72393+ }
72394+ }
72395+ reiser4_uncapture_block(node);
72396+ spin_unlock_atom(atom);
72397+ jput(node);
72398+}
72399+
72400+/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
72401+ * inode's tree of jnodes */
72402+void reiser4_uncapture_jnode(jnode * node)
72403+{
72404+ txn_atom *atom;
72405+
72406+ assert_spin_locked(&(node->guard));
72407+ assert("", node->pg == 0);
72408+
72409+ atom = jnode_get_atom(node);
72410+ if (atom == NULL) {
72411+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
72412+ spin_unlock_jnode(node);
72413+ return;
72414+ }
72415+
72416+ reiser4_uncapture_block(node);
72417+ spin_unlock_atom(atom);
72418+ jput(node);
72419+}
72420+
72421+/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer,
72422+ increases atom refcount and txnh_count, adds to txnh_list. */
72423+static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
72424+{
72425+ assert("umka-200", atom != NULL);
72426+ assert("umka-201", txnh != NULL);
72427+
72428+ assert_spin_locked(&(txnh->hlock));
72429+ assert_spin_locked(&(atom->alock));
72430+ assert("jmacd-824", txnh->atom == NULL);
72431+ assert("nikita-3540", atom_isopen(atom));
72432+ BUG_ON(txnh->atom != NULL);
72433+
72434+ atomic_inc(&atom->refcount);
72435+ txnh->atom = atom;
72436+ reiser4_ctx_gfp_mask_set();
72437+ list_add_tail(&txnh->txnh_link, &atom->txnh_list);
72438+ atom->txnh_count += 1;
72439+}
72440+
72441+/* No-locking version of assign_block. Sets the block's atom pointer, references the
72442+ block, adds it to the clean or dirty capture_jnode list, increments capture_count. */
72443+static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
72444+{
72445+ assert("umka-202", atom != NULL);
72446+ assert("umka-203", node != NULL);
72447+ assert_spin_locked(&(node->guard));
72448+ assert_spin_locked(&(atom->alock));
72449+ assert("jmacd-323", node->atom == NULL);
72450+ BUG_ON(!list_empty_careful(&node->capture_link));
72451+ assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
72452+
72453+ /* Pointer from jnode to atom is not counted in atom->refcount. */
72454+ node->atom = atom;
72455+
72456+ list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
72457+ atom->capture_count += 1;
72458+ /* reference to jnode is acquired by atom. */
72459+ jref(node);
72460+
72461+ ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
72462+
72463+ LOCK_CNT_INC(t_refs);
72464+}
72465+
72466+/* common code for dirtying both unformatted jnodes and formatted znodes. */
72467+static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
72468+{
72469+ assert_spin_locked(&(node->guard));
72470+ assert_spin_locked(&(atom->alock));
72471+ assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
72472+
72473+ JF_SET(node, JNODE_DIRTY);
72474+
72475+ get_current_context()->nr_marked_dirty++;
72476+
72477+	/* We grab2flush_reserve one additional block only if the node was
72478+	   not CREATED and jnode_flush did not sort it into either the
72479+	   relocate set or the overwrite set. If the node is in the overwrite
72480+	   or relocate set, we assume that the atom's flush reserved counter
72481+	   was already adjusted. */
72482+ if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
72483+ && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
72484+ && !jnode_is_cluster_page(node)) {
72485+ assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr));
72486+ assert("vs-1506", *jnode_get_block(node) != 0);
72487+ grabbed2flush_reserved_nolock(atom, (__u64) 1);
72488+ JF_SET(node, JNODE_FLUSH_RESERVED);
72489+ }
72490+
72491+ if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
72492+		/* Sometimes a node is set dirty before being captured -- the case for new
72493+		   jnodes; in that case the jnode will be added to the appropriate list in
72494+		   capture_assign_block_nolock. Another reason not to re-link the jnode is
72495+		   that the jnode is on a flush queue (see flush.c for details). */
72498+
72499+ int level = jnode_get_level(node);
72500+
72501+ assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
72502+ assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
72503+ assert("nikita-2607", 0 <= level);
72504+ assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
72505+
72506+ /* move node to atom's dirty list */
72507+ list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
72508+ ON_DEBUG(count_jnode
72509+ (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
72510+ }
72511+}
72512+
72513+/* Set the dirty status for this (spin locked) jnode. */
72514+void jnode_make_dirty_locked(jnode * node)
72515+{
72516+ assert("umka-204", node != NULL);
72517+ assert_spin_locked(&(node->guard));
72518+
72519+ if (REISER4_DEBUG && rofs_jnode(node)) {
72520+ warning("nikita-3365", "Dirtying jnode on rofs");
72521+ dump_stack();
72522+ }
72523+
72524+ /* Fast check for already dirty node */
72525+ if (!JF_ISSET(node, JNODE_DIRTY)) {
72526+ txn_atom *atom;
72527+
72528+ atom = jnode_get_atom(node);
72529+ assert("vs-1094", atom);
72530+ /* Check jnode dirty status again because node spin lock might
72531+ * be released inside jnode_get_atom(). */
72532+ if (likely(!JF_ISSET(node, JNODE_DIRTY)))
72533+ do_jnode_make_dirty(node, atom);
72534+ spin_unlock_atom(atom);
72535+ }
72536+}
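+/* The function above is an instance of double-checked locking: a cheap
+ * flag test first, then the atom lock is taken (jnode_get_atom() may
+ * transiently drop and retake node->guard to get it), and the flag is
+ * tested again before acting. Reduced to its skeleton:
+ *
+ *	if (!JF_ISSET(node, JNODE_DIRTY)) {	(fast, unreliable check)
+ *		atom = jnode_get_atom(node);	(may drop node->guard)
+ *		if (!JF_ISSET(node, JNODE_DIRTY))	(authoritative)
+ *			do_jnode_make_dirty(node, atom);
+ *		spin_unlock_atom(atom);
+ *	}
+ */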
72537+
72538+/* Set the dirty status for this znode. */
72539+void znode_make_dirty(znode * z)
72540+{
72541+ jnode *node;
72542+ struct page *page;
72543+
72544+ assert("umka-204", z != NULL);
72545+ assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
72546+ assert("nikita-3560", znode_is_write_locked(z));
72547+
72548+ node = ZJNODE(z);
72549+ /* znode is longterm locked, we can check dirty bit without spinlock */
72550+ if (JF_ISSET(node, JNODE_DIRTY)) {
72551+ /* znode is dirty already. All we have to do is to change znode version */
72552+ z->version = znode_build_version(jnode_get_tree(node));
72553+ return;
72554+ }
72555+
72556+ spin_lock_jnode(node);
72557+ jnode_make_dirty_locked(node);
72558+ page = jnode_page(node);
72559+ if (page != NULL) {
72560+		/* this is a useful assertion (allows one to check that no
72561+		 * modifications are lost due to an update of an in-flight page),
72562+		 * but it requires locking the page to check the PG_writeback
72563+		 * bit. */
72564+ /* assert("nikita-3292",
72565+ !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
72566+ page_cache_get(page);
72567+
72568+ /* jnode lock is not needed for the rest of
72569+ * znode_set_dirty(). */
72570+ spin_unlock_jnode(node);
72571+ /* reiser4 file write code calls set_page_dirty for
72572+ * unformatted nodes, for formatted nodes we do it here. */
72573+ reiser4_set_page_dirty_internal(page);
72574+ page_cache_release(page);
72575+ /* bump version counter in znode */
72576+ z->version = znode_build_version(jnode_get_tree(node));
72577+ } else {
72578+ assert("zam-596", znode_above_root(JZNODE(node)));
72579+ spin_unlock_jnode(node);
72580+ }
72581+
72582+ assert("nikita-1900", znode_is_write_locked(z));
72583+ assert("jmacd-9777", node->atom != NULL);
72584+}
72585+
72586+int reiser4_sync_atom(txn_atom * atom)
72587+{
72588+ int result;
72589+ txn_handle *txnh;
72590+
72591+ txnh = get_current_context()->trans;
72592+
72593+ result = 0;
72594+ if (atom != NULL) {
72595+ if (atom->stage < ASTAGE_PRE_COMMIT) {
72596+ spin_lock_txnh(txnh);
72597+ capture_assign_txnh_nolock(atom, txnh);
72598+ result = force_commit_atom(txnh);
72599+ } else if (atom->stage < ASTAGE_POST_COMMIT) {
72600+ /* wait atom commit */
72601+ reiser4_atom_wait_event(atom);
72602+ /* try once more */
72603+ result = RETERR(-E_REPEAT);
72604+ } else
72605+ spin_unlock_atom(atom);
72606+ }
72607+ return result;
72608+}
72609+
72610+#if REISER4_DEBUG
72611+
72612+/* move a jnode from one list to another;
72613+   call this after atom->capture_count is updated */
72614+void
72615+count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
72616+ atom_list new_list, int check_lists)
72617+{
72618+ struct list_head *pos;
72619+
72620+ assert("zam-1018", atom_is_protected(atom));
72621+ assert_spin_locked(&(node->guard));
72622+ assert("", NODE_LIST(node) == old_list);
72623+
72624+ switch (NODE_LIST(node)) {
72625+ case NOT_CAPTURED:
72626+ break;
72627+ case DIRTY_LIST:
72628+ assert("", atom->dirty > 0);
72629+ atom->dirty--;
72630+ break;
72631+ case CLEAN_LIST:
72632+ assert("", atom->clean > 0);
72633+ atom->clean--;
72634+ break;
72635+ case FQ_LIST:
72636+ assert("", atom->fq > 0);
72637+ atom->fq--;
72638+ break;
72639+ case WB_LIST:
72640+ assert("", atom->wb > 0);
72641+ atom->wb--;
72642+ break;
72643+ case OVRWR_LIST:
72644+ assert("", atom->ovrwr > 0);
72645+ atom->ovrwr--;
72646+ break;
72647+ default:
72648+ impossible("", "");
72649+ }
72650+
72651+ switch (new_list) {
72652+ case NOT_CAPTURED:
72653+ break;
72654+ case DIRTY_LIST:
72655+ atom->dirty++;
72656+ break;
72657+ case CLEAN_LIST:
72658+ atom->clean++;
72659+ break;
72660+ case FQ_LIST:
72661+ atom->fq++;
72662+ break;
72663+ case WB_LIST:
72664+ atom->wb++;
72665+ break;
72666+ case OVRWR_LIST:
72667+ atom->ovrwr++;
72668+ break;
72669+ default:
72670+ impossible("", "");
72671+ }
72672+ ASSIGN_NODE_LIST(node, new_list);
72673+ if (0 && check_lists) {
72674+ int count;
72675+ tree_level level;
72676+
72677+ count = 0;
72678+
72679+ /* flush queue list */
72680+ /* reiser4_check_fq(atom); */
72681+
72682+ /* dirty list */
72683+ count = 0;
72684+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
72685+ list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
72686+ count++;
72687+ }
72688+ if (count != atom->dirty)
72689+ warning("", "dirty counter %d, real %d\n", atom->dirty,
72690+ count);
72691+
72692+ /* clean list */
72693+ count = 0;
72694+ list_for_each(pos, ATOM_CLEAN_LIST(atom))
72695+ count++;
72696+ if (count != atom->clean)
72697+ warning("", "clean counter %d, real %d\n", atom->clean,
72698+ count);
72699+
72700+ /* wb list */
72701+ count = 0;
72702+ list_for_each(pos, ATOM_WB_LIST(atom))
72703+ count++;
72704+ if (count != atom->wb)
72705+ warning("", "wb counter %d, real %d\n", atom->wb,
72706+ count);
72707+
72708+ /* overwrite list */
72709+ count = 0;
72710+ list_for_each(pos, ATOM_OVRWR_LIST(atom))
72711+ count++;
72712+
72713+ if (count != atom->ovrwr)
72714+ warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
72715+ count);
72716+ }
72717+ assert("vs-1624", atom->num_queued == atom->fq);
72718+ if (atom->capture_count !=
72719+ atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
72720+ printk
72721+ ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
72722+ atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
72723+ atom->wb, atom->fq);
72724+ assert("vs-1622",
72725+ atom->capture_count ==
72726+ atom->dirty + atom->clean + atom->ovrwr + atom->wb +
72727+ atom->fq);
72728+ }
72729+}
72730+
72731+#endif
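+/* The debug accounting above maintains one invariant worth stating
+ * directly: a captured jnode lives on exactly one per-atom list, so the
+ * per-list counters must add up to capture_count. A hypothetical helper
+ * (not present in the code) checking just that:
+ *
+ *	static inline int atom_counters_consistent(txn_atom * atom)
+ *	{
+ *		return atom->capture_count ==
+ *			atom->dirty + atom->clean + atom->ovrwr +
+ *			atom->wb + atom->fq;
+ *	}
+ */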
72732+
72733+/* Make node OVRWR and put it on the atom->overwrite_nodes list; the atom lock and
72734+ * jnode lock should be taken before calling this function. */
72735+void jnode_make_wander_nolock(jnode * node)
72736+{
72737+ txn_atom *atom;
72738+
72739+ assert("nikita-2431", node != NULL);
72740+ assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
72741+ assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
72742+ assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
72743+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
72744+
72745+ atom = node->atom;
72746+
72747+ assert("zam-895", atom != NULL);
72748+ assert("zam-894", atom_is_protected(atom));
72749+
72750+ JF_SET(node, JNODE_OVRWR);
72751+ /* move node to atom's overwrite list */
72752+ list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
72753+ ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
72754+}
72755+
72756+/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
72757+ * this function. */
72758+void jnode_make_wander(jnode * node)
72759+{
72760+ txn_atom *atom;
72761+
72762+ spin_lock_jnode(node);
72763+ atom = jnode_get_atom(node);
72764+ assert("zam-913", atom != NULL);
72765+ assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
72766+
72767+ jnode_make_wander_nolock(node);
72768+ spin_unlock_atom(atom);
72769+ spin_unlock_jnode(node);
72770+}
72771+
72772+/* this just sets RELOC bit */
72773+static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
72774+{
72775+ assert_spin_locked(&(node->guard));
72776+ assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
72777+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
72778+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
72779+ assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
72780+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
72781+ jnode_set_reloc(node);
72782+}
72783+
72784+/* Make znode RELOC and put it on flush queue */
72785+void znode_make_reloc(znode * z, flush_queue_t * fq)
72786+{
72787+ jnode *node;
72788+ txn_atom *atom;
72789+
72790+ node = ZJNODE(z);
72791+ spin_lock_jnode(node);
72792+
72793+ atom = jnode_get_atom(node);
72794+ assert("zam-919", atom != NULL);
72795+
72796+ jnode_make_reloc_nolock(fq, node);
72797+ queue_jnode(fq, node);
72798+
72799+ spin_unlock_atom(atom);
72800+ spin_unlock_jnode(node);
72801+
72802+}
72803+
72804+/* Make unformatted node RELOC and put it on flush queue */
72805+void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
72806+{
72807+ assert("vs-1479", jnode_is_unformatted(node));
72808+
72809+ jnode_make_reloc_nolock(fq, node);
72810+ queue_jnode(fq, node);
72811+}
72812+
72813+int reiser4_capture_super_block(struct super_block *s)
72814+{
72815+ int result;
72816+ znode *uber;
72817+ lock_handle lh;
72818+
72819+ init_lh(&lh);
72820+ result = get_uber_znode(reiser4_get_tree(s),
72821+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
72822+ if (result)
72823+ return result;
72824+
72825+ uber = lh.node;
72826+ /* Grabbing one block for superblock */
72827+ result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
72828+ if (result != 0)
72829+ return result;
72830+
72831+ znode_make_dirty(uber);
72832+
72833+ done_lh(&lh);
72834+ return 0;
72835+}
72836+
72837+/* Wakeup every handle on the atom's WAITFOR list */
72838+static void wakeup_atom_waitfor_list(txn_atom * atom)
72839+{
72840+ txn_wait_links *wlinks;
72841+
72842+ assert("umka-210", atom != NULL);
72843+
72844+ /* atom is locked */
72845+ list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
72846+ if (wlinks->waitfor_cb == NULL ||
72847+ wlinks->waitfor_cb(atom, wlinks))
72848+ /* Wake up. */
72849+ reiser4_wake_up(wlinks->_lock_stack);
72850+ }
72851+}
72852+
72853+/* Wakeup every handle on the atom's WAITING list */
72854+static void wakeup_atom_waiting_list(txn_atom * atom)
72855+{
72856+ txn_wait_links *wlinks;
72857+
72858+ assert("umka-211", atom != NULL);
72859+
72860+ /* atom is locked */
72861+ list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
72862+ if (wlinks->waiting_cb == NULL ||
72863+ wlinks->waiting_cb(atom, wlinks))
72864+ /* Wake up. */
72865+ reiser4_wake_up(wlinks->_lock_stack);
72866+ }
72867+}
72868+
72869+/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */
72870+static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
72871+{
72872+ assert("nikita-3330", atom != NULL);
72873+ assert_spin_locked(&(atom->alock));
72874+
72875+ /* atom->txnh_count == 1 is for waking waiters up if we are releasing
72876+ * last transaction handle. */
72877+ return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
72878+}
72879+
72880+/* The general purpose of this function is to wait on the first of two possible events.
72881+ The situation is that a handle (and its atom atomh) is blocked trying to capture a
72882+ block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The
72883+ handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with
72884+ another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
72885+ needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will
72886+ proceed and fuse the two atoms in the CAPTURE_WAIT state.
72887+
72888+   In other words, if either atomh or atomf changes state, the handle will be awakened;
72889+   thus there are two lists per atom: WAITING and WAITFOR.
72890+
72891+   This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to
72892+   close when the handle is not assigned to an atom of its own.
72893+
72894+ Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK,
72895+ BOTH_ATOM_LOCKS. Result: all four locks are released.
72896+*/
72897+static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
72898+ txn_atom * atomh, txn_capture mode)
72899+{
72900+ int ret;
72901+ txn_wait_links wlinks;
72902+
72903+ assert("umka-213", txnh != NULL);
72904+ assert("umka-214", atomf != NULL);
72905+
72906+ if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
72907+ spin_unlock_txnh(txnh);
72908+ spin_unlock_atom(atomf);
72909+
72910+ if (atomh) {
72911+ spin_unlock_atom(atomh);
72912+ }
72913+
72914+ return RETERR(-E_BLOCK);
72915+ }
72916+
72917+ /* Initialize the waiting list links. */
72918+ init_wlinks(&wlinks);
72919+
72920+ /* Add txnh to atomf's waitfor list, unlock atomf. */
72921+ list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
72922+ wlinks.waitfor_cb = wait_for_fusion;
72923+ atomic_inc(&atomf->refcount);
72924+ spin_unlock_atom(atomf);
72925+
72926+ if (atomh) {
72927+ /* Add txnh to atomh's waiting list, unlock atomh. */
72928+ list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
72929+ atomic_inc(&atomh->refcount);
72930+ spin_unlock_atom(atomh);
72931+ }
72932+
72933+ /* Go to sleep. */
72934+ spin_unlock_txnh(txnh);
72935+
72936+ ret = reiser4_prepare_to_sleep(wlinks._lock_stack);
72937+ if (ret == 0) {
72938+ reiser4_go_to_sleep(wlinks._lock_stack);
72939+ ret = RETERR(-E_REPEAT);
72940+ }
72941+
72942+ /* Remove from the waitfor list. */
72943+ spin_lock_atom(atomf);
72944+
72945+ list_del(&wlinks._fwaitfor_link);
72946+ atom_dec_and_unlock(atomf);
72947+
72948+ if (atomh) {
72949+ /* Remove from the waiting list. */
72950+ spin_lock_atom(atomh);
72951+ list_del(&wlinks._fwaiting_link);
72952+ atom_dec_and_unlock(atomh);
72953+ }
72954+ return ret;
72955+}
72956+
72957+static void lock_two_atoms(txn_atom * one, txn_atom * two)
72958+{
72959+ assert("zam-1067", one != two);
72960+
72961+ /* lock the atom with lesser address first */
72962+ if (one < two) {
72963+ spin_lock_atom(one);
72964+ spin_lock_atom_nested(two);
72965+ } else {
72966+ spin_lock_atom(two);
72967+ spin_lock_atom_nested(one);
72968+ }
72969+}
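+/* Address ordering is the standard way to take two locks of the same class
+ * without deadlocking: if every thread locks the lower address first, no
+ * cycle of waiters can form. fuse_not_fused_lock_owners() above open-codes
+ * the same rule. A generic sketch in terms of the stock kernel spinlock
+ * API, independent of the atom wrappers:
+ *
+ *	static void lock_pair(spinlock_t *a, spinlock_t *b)
+ *	{
+ *		if (a < b) {
+ *			spin_lock(a);
+ *			spin_lock_nested(b, SINGLE_DEPTH_NESTING);
+ *		} else {
+ *			spin_lock(b);
+ *			spin_lock_nested(a, SINGLE_DEPTH_NESTING);
+ *		}
+ *	}
+ */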
72970+
72971+/* Perform the necessary work to prepare for fusing two atoms, which involves
72972+ * acquiring two atom locks in the proper order. If the node's atom is
72973+ * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's
72974+ * atom is not, then the handle's request is put to sleep. If the node's atom
72975+ * is committing, then the node can be copy-on-captured. Otherwise, pick the
72976+ * atom with fewer pointers to be fused into the atom with more pointers and
72977+ * call capture_fuse_into.
72978+ */
72979+static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
72980+{
72981+ txn_atom * txnh_atom = txnh->atom;
72982+ txn_atom * block_atom = node->atom;
72983+
72984+ atomic_inc(&txnh_atom->refcount);
72985+ atomic_inc(&block_atom->refcount);
72986+
72987+ spin_unlock_txnh(txnh);
72988+ spin_unlock_jnode(node);
72989+
72990+ lock_two_atoms(txnh_atom, block_atom);
72991+
72992+	if (txnh->atom != txnh_atom || node->atom != block_atom) {
72993+ release_two_atoms(txnh_atom, block_atom);
72994+ return RETERR(-E_REPEAT);
72995+ }
72996+
72997+ atomic_dec(&txnh_atom->refcount);
72998+ atomic_dec(&block_atom->refcount);
72999+
73000+ assert ("zam-1066", atom_isopen(txnh_atom));
73001+
73002+ if (txnh_atom->stage >= block_atom->stage ||
73003+ (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
73004+ capture_fuse_into(txnh_atom, block_atom);
73005+ return RETERR(-E_REPEAT);
73006+ }
73007+ spin_lock_txnh(txnh);
73008+ return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
73009+}
73010+
73011+/* This function splices together two jnode lists (small and large) and sets all jnodes in
73012+   the small list to point to the large atom. Returns the length of the small list. */
73013+static int
73014+capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
73015+ struct list_head *small_head)
73016+{
73017+ int count = 0;
73018+ jnode *node;
73019+
73020+ assert("umka-218", large != NULL);
73021+ assert("umka-219", large_head != NULL);
73022+ assert("umka-220", small_head != NULL);
73023+ /* small atom should be locked also. */
73024+ assert_spin_locked(&(large->alock));
73025+
73026+ /* For every jnode on small's capture list... */
73027+ list_for_each_entry(node, small_head, capture_link) {
73028+ count += 1;
73029+
73030+ /* With the jnode lock held, update atom pointer. */
73031+ spin_lock_jnode(node);
73032+ node->atom = large;
73033+ spin_unlock_jnode(node);
73034+ }
73035+
73036+ /* Splice the lists. */
73037+ list_splice_init(small_head, large_head->prev);
73038+
73039+ return count;
73040+}
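
   The splice at the end of capture_fuse_jnode_lists() relies on a detail of
   the list API: list_splice_init(list, head) inserts the entries of @list
   right after @head and reinitializes @list to empty, so passing
   large_head->prev (the current last element) appends the small list at the
   tail of the large one. A small illustrative helper, not part of the patch
   (later kernels added list_splice_tail_init() as a one-call spelling of the
   same operation, but it is not assumed available in 2.6.23):

	#include <linux/list.h>

	/* Append all entries of @src to the tail of @dst, leaving @src empty. */
	static inline void append_list_tail(struct list_head *src,
					    struct list_head *dst)
	{
		list_splice_init(src, dst->prev);
	}
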
73041+
73042+/* This function splices together two txnh lists (small and large) and sets all txn handles
73043+   in the small list to point to the large atom. Returns the length of the small list. */
73044+static int
73045+capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
73046+ struct list_head *small_head)
73047+{
73048+ int count = 0;
73049+ txn_handle *txnh;
73050+
73051+ assert("umka-221", large != NULL);
73052+ assert("umka-222", large_head != NULL);
73053+ assert("umka-223", small_head != NULL);
73054+
73055+ /* Adjust every txnh to the new atom. */
73056+ list_for_each_entry(txnh, small_head, txnh_link) {
73057+ count += 1;
73058+
73059+ /* With the txnh lock held, update atom pointer. */
73060+ spin_lock_txnh(txnh);
73061+ txnh->atom = large;
73062+ spin_unlock_txnh(txnh);
73063+ }
73064+
73065+ /* Splice the txn_handle list. */
73066+ list_splice_init(small_head, large_head->prev);
73067+
73068+ return count;
73069+}
73070+
73071+/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are
73072+ added to LARGE and their ->atom pointers are all updated. The associated counts are
73073+ updated as well, and any waiting handles belonging to either are awakened. Finally the
73074+ smaller atom's refcount is decremented.
73075+*/
73076+static void capture_fuse_into(txn_atom * small, txn_atom * large)
73077+{
73078+ int level;
73079+ unsigned zcount = 0;
73080+ unsigned tcount = 0;
73081+
73082+ assert("umka-224", small != NULL);
73083+ assert("umka-225", small != NULL);
73084+
73085+ assert_spin_locked(&(large->alock));
73086+ assert_spin_locked(&(small->alock));
73087+
73088+ assert("jmacd-201", atom_isopen(small));
73089+ assert("jmacd-202", atom_isopen(large));
73090+
73091+ /* Splice and update the per-level dirty jnode lists */
73092+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73093+ zcount +=
73094+ capture_fuse_jnode_lists(large,
73095+ ATOM_DIRTY_LIST(large, level),
73096+ ATOM_DIRTY_LIST(small, level));
73097+ }
73098+
73099+	/* Splice and update the clean, overwrite, writeback, inode and txnh lists */
73100+ zcount +=
73101+ capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
73102+ ATOM_CLEAN_LIST(small));
73103+ zcount +=
73104+ capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
73105+ ATOM_OVRWR_LIST(small));
73106+ zcount +=
73107+ capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
73108+ ATOM_WB_LIST(small));
73109+ zcount +=
73110+ capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
73111+ tcount +=
73112+ capture_fuse_txnh_lists(large, &large->txnh_list,
73113+ &small->txnh_list);
73114+
73115+ /* Check our accounting. */
73116+ assert("jmacd-1063",
73117+ zcount + small->num_queued == small->capture_count);
73118+ assert("jmacd-1065", tcount == small->txnh_count);
73119+
73120+	/* sum the numbers of waiting threads */
73121+ large->nr_waiters += small->nr_waiters;
73122+ small->nr_waiters = 0;
73123+
73124+ /* splice flush queues */
73125+ reiser4_fuse_fq(large, small);
73126+
73127+	/* update the jnode counters for each of the atom's lists */
73128+ ON_DEBUG(large->dirty += small->dirty;
73129+ small->dirty = 0;
73130+ large->clean += small->clean;
73131+ small->clean = 0;
73132+ large->ovrwr += small->ovrwr;
73133+ small->ovrwr = 0;
73134+ large->wb += small->wb;
73135+ small->wb = 0;
73136+ large->fq += small->fq;
73137+ small->fq = 0;);
73138+
73139+ /* count flushers in result atom */
73140+ large->nr_flushers += small->nr_flushers;
73141+ small->nr_flushers = 0;
73142+
73143+ /* update counts of flushed nodes */
73144+ large->flushed += small->flushed;
73145+ small->flushed = 0;
73146+
73147+ /* Transfer list counts to large. */
73148+ large->txnh_count += small->txnh_count;
73149+ large->capture_count += small->capture_count;
73150+
73151+ /* Add all txnh references to large. */
73152+ atomic_add(small->txnh_count, &large->refcount);
73153+ atomic_sub(small->txnh_count, &small->refcount);
73154+
73155+ /* Reset small counts */
73156+ small->txnh_count = 0;
73157+ small->capture_count = 0;
73158+
73159+ /* Assign the oldest start_time, merge flags. */
73160+ large->start_time = min(large->start_time, small->start_time);
73161+ large->flags |= small->flags;
73162+
73163+ /* Merge blocknr sets. */
73164+ blocknr_set_merge(&small->delete_set, &large->delete_set);
73165+ blocknr_set_merge(&small->wandered_map, &large->wandered_map);
73166+
73167+ /* Merge allocated/deleted file counts */
73168+ large->nr_objects_deleted += small->nr_objects_deleted;
73169+ large->nr_objects_created += small->nr_objects_created;
73170+
73171+ small->nr_objects_deleted = 0;
73172+ small->nr_objects_created = 0;
73173+
73174+ /* Merge allocated blocks counts */
73175+ large->nr_blocks_allocated += small->nr_blocks_allocated;
73176+
73177+ large->nr_running_queues += small->nr_running_queues;
73178+ small->nr_running_queues = 0;
73179+
73180+ /* Merge blocks reserved for overwrite set. */
73181+ large->flush_reserved += small->flush_reserved;
73182+ small->flush_reserved = 0;
73183+
73184+ if (large->stage < small->stage) {
73185+ /* Large only needs to notify if it has changed state. */
73186+ reiser4_atom_set_stage(large, small->stage);
73187+ wakeup_atom_waiting_list(large);
73188+ }
73189+
73190+ reiser4_atom_set_stage(small, ASTAGE_INVALID);
73191+
73192+ /* Notify any waiters--small needs to unload its wait lists. Waiters
73193+ actually remove themselves from the list before returning from the
73194+ fuse_wait function. */
73195+ wakeup_atom_waiting_list(small);
73196+
73197+ /* Unlock atoms */
73198+ spin_unlock_atom(large);
73199+ atom_dec_and_unlock(small);
73200+}
73201+
73202+/* TXNMGR STUFF */
73203+
73204+/* Release a block from the atom, reversing the effects of being captured.
73205+   The atom's reference to the jnode is not released here, because spin-locks
73206+   are held. Currently this is only called when the atom commits.
73207+
73208+ NOTE: this function does not release a (journal) reference to jnode
73209+ due to locking optimizations, you should call jput() somewhere after
73210+ calling reiser4_uncapture_block(). */
73211+void reiser4_uncapture_block(jnode * node)
73212+{
73213+ txn_atom *atom;
73214+
73215+ assert("umka-226", node != NULL);
73216+ atom = node->atom;
73217+ assert("umka-228", atom != NULL);
73218+
73219+ assert("jmacd-1021", node->atom == atom);
73220+ assert_spin_locked(&(node->guard));
73221+ assert("jmacd-1023", atom_is_protected(atom));
73222+
73223+ JF_CLR(node, JNODE_DIRTY);
73224+ JF_CLR(node, JNODE_RELOC);
73225+ JF_CLR(node, JNODE_OVRWR);
73226+ JF_CLR(node, JNODE_CREATED);
73227+ JF_CLR(node, JNODE_WRITEBACK);
73228+ JF_CLR(node, JNODE_REPACK);
73229+
73230+ list_del_init(&node->capture_link);
73231+ if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
73232+ assert("zam-925", atom_isopen(atom));
73233+ assert("vs-1623", NODE_LIST(node) == FQ_LIST);
73234+ ON_DEBUG(atom->num_queued--);
73235+ JF_CLR(node, JNODE_FLUSH_QUEUED);
73236+ }
73237+ atom->capture_count -= 1;
73238+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
73239+ node->atom = NULL;
73240+
73241+ spin_unlock_jnode(node);
73242+ LOCK_CNT_DEC(t_refs);
73243+}
73244+
73245+/* Unconditional insert of jnode into atom's overwrite list. Currently used in
73246+   bitmap-based allocator code for adding modified bitmap blocks to the
73247+   transaction. @atom and @node are spin-locked. */
73248+void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
73249+{
73250+ assert("zam-538", atom_is_protected(atom));
73251+ assert_spin_locked(&(node->guard));
73252+ assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
73253+ assert("zam-543", node->atom == NULL);
73254+ assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
73255+
73256+ list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
73257+ jref(node);
73258+ node->atom = atom;
73259+ atom->capture_count++;
73260+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
73261+}
73262+
73263+static int count_deleted_blocks_actor(txn_atom * atom,
73264+ const reiser4_block_nr * a,
73265+ const reiser4_block_nr * b, void *data)
73266+{
73267+ reiser4_block_nr *counter = data;
73268+
73269+ assert("zam-995", data != NULL);
73270+ assert("zam-996", a != NULL);
73271+ if (b == NULL)
73272+ *counter += 1;
73273+ else
73274+ *counter += *b;
73275+ return 0;
73276+}
73277+
73278+reiser4_block_nr txnmgr_count_deleted_blocks(void)
73279+{
73280+ reiser4_block_nr result;
73281+ txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73282+ txn_atom *atom;
73283+
73284+ result = 0;
73285+
73286+ spin_lock_txnmgr(tmgr);
73287+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
73288+ spin_lock_atom(atom);
73289+ if (atom_isopen(atom))
73290+ blocknr_set_iterator(
73291+ atom, &atom->delete_set,
73292+ count_deleted_blocks_actor, &result, 0);
73293+ spin_unlock_atom(atom);
73294+ }
73295+ spin_unlock_txnmgr(tmgr);
73296+
73297+ return result;
73298+}
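
   The actor protocol visible in count_deleted_blocks_actor() generalizes: an
   actor passed to blocknr_set_iterator() is called once per entry, with
   @b == NULL for a single-block entry at @a, and otherwise with @b holding
   the extent length. A hypothetical actor that counts entries rather than
   blocks, shown only to illustrate the callback shape:

	/* Hypothetical: count entries (extents or single blocks), not blocks. */
	static int count_entries_actor(txn_atom *atom,
				       const reiser4_block_nr *a,
				       const reiser4_block_nr *b, void *data)
	{
		reiser4_block_nr *counter = data;

		(void)atom;
		(void)a;
		(void)b;
		*counter += 1;	/* one entry, regardless of its length */
		return 0;	/* 0 continues the walk, as in the actor above */
	}
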
73299+
73300+/*
73301+ * Local variables:
73302+ * c-indentation-style: "K&R"
73303+ * mode-name: "LC"
73304+ * c-basic-offset: 8
73305+ * tab-width: 8
73306+ * fill-column: 79
73307+ * End:
73308+ */
73309diff -urN linux-2.6.23.orig/fs/reiser4/txnmgr.h linux-2.6.23/fs/reiser4/txnmgr.h
73310--- linux-2.6.23.orig/fs/reiser4/txnmgr.h 1970-01-01 03:00:00.000000000 +0300
73311+++ linux-2.6.23/fs/reiser4/txnmgr.h 2007-12-04 16:49:30.000000000 +0300
73312@@ -0,0 +1,701 @@
73313+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
73314+ * reiser4/README */
73315+
73316+/* data-types and function declarations for transaction manager. See txnmgr.c
73317+ * for details. */
73318+
73319+#ifndef __REISER4_TXNMGR_H__
73320+#define __REISER4_TXNMGR_H__
73321+
73322+#include "forward.h"
73323+#include "dformat.h"
73324+
73325+#include <linux/fs.h>
73326+#include <linux/mm.h>
73327+#include <linux/types.h>
73328+#include <linux/spinlock.h>
73329+#include <asm/atomic.h>
73330+#include <linux/wait.h>
73331+
73332+/* TYPE DECLARATIONS */
73333+
73334+/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
73335+ A capture request dynamically assigns a block to the calling thread's transaction
73336+ handle. */
73337+typedef enum {
73338+ /* A READ_ATOMIC request indicates that a block will be read and that the caller's
73339+ atom should fuse in order to ensure that the block commits atomically with the
73340+ caller. */
73341+ TXN_CAPTURE_READ_ATOMIC = (1 << 0),
73342+
73343+ /* A READ_NONCOM request indicates that a block will be read and that the caller is
73344+ willing to read a non-committed block without causing atoms to fuse. */
73345+ TXN_CAPTURE_READ_NONCOM = (1 << 1),
73346+
73347+ /* A READ_MODIFY request indicates that a block will be read but that the caller
73348+ wishes for the block to be captured as it will be written. This capture request
73349+ mode is not currently used, but eventually it will be useful for preventing
73350+ deadlock in read-modify-write cycles. */
73351+ TXN_CAPTURE_READ_MODIFY = (1 << 2),
73352+
73353+ /* A WRITE capture request indicates that a block will be modified and that atoms
73354+ should fuse to make the commit atomic. */
73355+ TXN_CAPTURE_WRITE = (1 << 3),
73356+
73357+ /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
73358+ exclusive type designation from extra bits that may be supplied -- see
73359+ below. */
73360+ TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
73361+ TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
73362+ TXN_CAPTURE_WRITE),
73363+
73364+ /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
73365+ indicate modification will occur. */
73366+ TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
73367+
73368+ /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
73369+ prefer not to sleep waiting for an aging atom to commit. */
73370+ TXN_CAPTURE_NONBLOCKING = (1 << 4),
73371+
73372+ /* An option to reiser4_try_capture to prevent atom fusion, just simple
73373+ capturing is allowed */
73374+ TXN_CAPTURE_DONT_FUSE = (1 << 5)
73375+
73376+ /* This macro selects only the exclusive capture request types, stripping out any
73377+ options that were supplied (i.e., NONBLOCKING). */
73378+#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
73379+} txn_capture;
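
   Because the low bits form an exclusive request type and the higher bits are
   options, a capture request is built by OR-ing exactly one type with zero or
   more options, and CAPTURE_TYPE() strips the options back off. A schematic
   illustration (the flag names are from the enum above; the variable is
   hypothetical):

	/* Request write capture, but fail with -E_BLOCK rather than sleep
	 * waiting for an aging atom to commit (see capture_fuse_wait()). */
	txn_capture request = TXN_CAPTURE_WRITE | TXN_CAPTURE_NONBLOCKING;

	/* CAPTURE_TYPE(request) == TXN_CAPTURE_WRITE: options stripped. */
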
73380+
73381+/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING; the only
73382+   difference is in the handling of read requests. A WRITE_FUSING transaction handle
73383+   defaults read capture requests to TXN_CAPTURE_READ_NONCOM, whereas a READ_FUSING
73384+   transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
73385+typedef enum {
73386+ TXN_WRITE_FUSING = (1 << 0),
73387+ TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */
73388+} txn_mode;
73389+
73390+/* Every atom has a stage, which is one of these exclusive values: */
73391+typedef enum {
73392+ /* Initially an atom is free. */
73393+ ASTAGE_FREE = 0,
73394+
73395+ /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
73396+ blocks and fuse with other atoms. */
73397+ ASTAGE_CAPTURE_FUSE = 1,
73398+
73399+	/* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk, where X > 1. */
73400+
73401+ /* When an atom reaches a certain age it must do all it can to commit. An atom in
73402+ the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
73403+ atoms in the CAPTURE_FUSE stage. */
73404+ ASTAGE_CAPTURE_WAIT = 2,
73405+
73406+ /* Waiting for I/O before commit. Copy-on-capture (see
73407+ http://namesys.com/v4/v4.html). */
73408+ ASTAGE_PRE_COMMIT = 3,
73409+
73410+ /* Post-commit overwrite I/O. Steal-on-capture. */
73411+ ASTAGE_POST_COMMIT = 4,
73412+
73413+	/* Atom which waits for the removal of the last reference to it before
73414+	 * being deleted from memory */
73415+ ASTAGE_DONE = 5,
73416+
73417+ /* invalid atom. */
73418+ ASTAGE_INVALID = 6,
73419+
73420+} txn_stage;
73421+
73422+/* Certain flags may be set in the txn_atom->flags field. */
73423+typedef enum {
73424+ /* Indicates that the atom should commit as soon as possible. */
73425+ ATOM_FORCE_COMMIT = (1 << 0),
73426+ /* to avoid endless loop, mark the atom (which was considered as too
73427+ * small) after failed attempt to fuse it. */
73428+ ATOM_CANCEL_FUSION = (1 << 1)
73429+} txn_flags;
73430+
73431+/* Flags for controlling commit_txnh */
73432+typedef enum {
73433+	/* Wait for the atom's commit to complete in commit_txnh */
73434+ TXNH_WAIT_COMMIT = 0x2,
73435+ /* Don't commit atom when this handle is closed */
73436+ TXNH_DONT_COMMIT = 0x4
73437+} txn_handle_flags_t;
73438+
73439+/* TYPE DEFINITIONS */
73440+
73441+/* A note on lock ordering: the handle and jnode spinlocks protect reading of their ->atom
73442+ fields, so typically an operation on the atom through either of these objects must (1)
73443+ lock the object, (2) read the atom pointer, (3) lock the atom.
73444+
73445+ During atom fusion, the process holds locks on both atoms at once. Then, it iterates
73446+ through the list of handles and pages held by the smaller of the two atoms. For each
73447+ handle and page referencing the smaller atom, the fusing process must: (1) lock the
73448+ object, and (2) update the atom pointer.
73449+
73450+ You can see that there is a conflict of lock ordering here, so the more-complex
73451+ procedure should have priority, i.e., the fusing process has priority so that it is
73452+ guaranteed to make progress and to avoid restarts.
73453+
73454+   This decision, however, means additional complexity for acquiring the atom lock in the
73455+ first place.
73456+
73457+   The original procedure followed in the code was:
73458+
73459+ TXN_OBJECT *obj = ...;
73460+ TXN_ATOM *atom;
73461+
73462+ spin_lock (& obj->_lock);
73463+
73464+ atom = obj->_atom;
73465+
73466+ if (! spin_trylock_atom (atom))
73467+ {
73468+ spin_unlock (& obj->_lock);
73469+ RESTART OPERATION, THERE WAS A RACE;
73470+ }
73471+
73472+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
73473+
73474+   It was found, however, that this wastes a lot of CPU in a manner that is
73475+   hard to profile. So proper refcounting was added to atoms, and the new
73476+   standard locking sequence is as follows:
73477+
73478+ TXN_OBJECT *obj = ...;
73479+ TXN_ATOM *atom;
73480+
73481+ spin_lock (& obj->_lock);
73482+
73483+ atom = obj->_atom;
73484+
73485+ if (! spin_trylock_atom (atom))
73486+ {
73487+ atomic_inc (& atom->refcount);
73488+ spin_unlock (& obj->_lock);
73489+ spin_lock (&atom->_lock);
73490+ atomic_dec (& atom->refcount);
73491+ // HERE atom is locked
73492+ spin_unlock (&atom->_lock);
73493+ RESTART OPERATION, THERE WAS A RACE;
73494+ }
73495+
73496+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
73497+
73498+ (core of this is implemented in trylock_throttle() function)
73499+
73500+ See the jnode_get_atom() function for a common case.
73501+
73502+   As an additional (and important) optimization that avoids restarts, it is
73503+   possible to re-check the required pre-conditions at the HERE point in the
73504+   code above and proceed without restarting if they are still satisfied.
73505+*/
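
   The comment above says the core of this sequence lives in
   trylock_throttle(); that function's body does not appear in this patch, so
   the following is only a reconstruction of the described sequence under that
   assumption, with a hypothetical txn_object type standing in for a jnode or
   transaction handle:

	/* Sketch only: returns 1 with both obj->_lock and the atom lock held;
	 * returns 0 after losing a race, in which case the caller must
	 * restart (obj->_atom may have changed while obj->_lock was dropped),
	 * unless the re-checked pre-conditions still hold. */
	static int lock_obj_and_atom(txn_object *obj)
	{
		txn_atom *atom;

		spin_lock(&obj->_lock);
		atom = obj->_atom;
		if (spin_trylock_atom(atom))
			return 1;	/* fast path: no contention */

		/* Pin the atom so it cannot be freed, drop the object lock
		 * to respect lock ordering, wait for the atom lock, then
		 * report the race to the caller. */
		atomic_inc(&atom->refcount);
		spin_unlock(&obj->_lock);
		spin_lock_atom(atom);
		atomic_dec(&atom->refcount);
		spin_unlock_atom(atom);
		return 0;
	}
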
73506+
73507+/* An atomic transaction: this is the underlying system representation
73508+ of a transaction, not the one seen by clients.
73509+
73510+ Invariants involving this data-type:
73511+
73512+ [sb-fake-allocated]
73513+*/
73514+struct txn_atom {
73515+ /* The spinlock protecting the atom, held during fusion and various other state
73516+ changes. */
73517+ spinlock_t alock;
73518+
73519+	/* The atom's reference counter. Incrementing (in the case of
73520+	   duplicating an existing reference, or when we are sure that some
73521+	   other reference exists) may be done without taking the spinlock;
73522+	   decrementing the counter requires the spinlock to be held.
73523+
73524+ Each transaction handle counts in ->refcount. All jnodes count as
73525+ one reference acquired in atom_begin_andlock(), released in
73526+ commit_current_atom().
73527+ */
73528+ atomic_t refcount;
73529+
73530+ /* The atom_id identifies the atom in persistent records such as the log. */
73531+ __u32 atom_id;
73532+
73533+ /* Flags holding any of the txn_flags enumerated values (e.g.,
73534+ ATOM_FORCE_COMMIT). */
73535+ __u32 flags;
73536+
73537+ /* Number of open handles. */
73538+ __u32 txnh_count;
73539+
73540+ /* The number of znodes captured by this atom. Equal to the sum of lengths of the
73541+ dirty_nodes[level] and clean_nodes lists. */
73542+ __u32 capture_count;
73543+
73544+#if REISER4_DEBUG
73545+ int clean;
73546+ int dirty;
73547+ int ovrwr;
73548+ int wb;
73549+ int fq;
73550+#endif
73551+
73552+ __u32 flushed;
73553+
73554+ /* Current transaction stage. */
73555+ txn_stage stage;
73556+
73557+ /* Start time. */
73558+ unsigned long start_time;
73559+
73560+ /* The atom's delete set. It collects block numbers of the nodes
73561+ which were deleted during the transaction. */
73562+ struct list_head delete_set;
73563+
73564+ /* The atom's wandered_block mapping. */
73565+ struct list_head wandered_map;
73566+
73567+	/* The transaction's list of dirty captured nodes, per level. Indexed
73568+	   by level; dirty_nodes[0] is for the znode-above-root. */
73569+ struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
73570+
73571+ /* The transaction's list of clean captured nodes. */
73572+ struct list_head clean_nodes;
73573+
73574+ /* The atom's overwrite set */
73575+ struct list_head ovrwr_nodes;
73576+
73577+ /* nodes which are being written to disk */
73578+ struct list_head writeback_nodes;
73579+
73580+ /* list of inodes */
73581+ struct list_head inodes;
73582+
73583+ /* List of handles associated with this atom. */
73584+ struct list_head txnh_list;
73585+
73586+ /* Transaction list link: list of atoms in the transaction manager. */
73587+ struct list_head atom_link;
73588+
73589+ /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
73590+ struct list_head fwaitfor_list;
73591+
73592+ /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
73593+ struct list_head fwaiting_list;
73594+
73595+	/* Numbers of objects which were deleted/created in this transaction,
73596+	   and thereby the numbers of object IDs which were released/allocated. */
73597+ int nr_objects_deleted;
73598+ int nr_objects_created;
73599+ /* number of blocks allocated during the transaction */
73600+ __u64 nr_blocks_allocated;
73601+ /* All atom's flush queue objects are on this list */
73602+ struct list_head flush_queues;
73603+#if REISER4_DEBUG
73604+ /* number of flush queues for this atom. */
73605+ int nr_flush_queues;
73606+ /* Number of jnodes which were removed from atom's lists and put
73607+ on flush_queue */
73608+ int num_queued;
73609+#endif
73610+ /* number of threads who wait for this atom to complete commit */
73611+ int nr_waiters;
73612+ /* number of threads which do jnode_flush() over this atom */
73613+ int nr_flushers;
73614+ /* number of flush queues which are IN_USE and jnodes from fq->prepped
73615+ are submitted to disk by the reiser4_write_fq() routine. */
73616+ int nr_running_queues;
73617+ /* A counter of grabbed unformatted nodes, see a description of the
73618+ * reiser4 space reservation scheme at block_alloc.c */
73619+ reiser4_block_nr flush_reserved;
73620+#if REISER4_DEBUG
73621+ void *committer;
73622+#endif
73623+ struct super_block *super;
73624+};
73625+
73626+#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
73627+#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
73628+#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
73629+#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
73630+#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
73631+
73632+#define NODE_LIST(node) (node)->list
73633+#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
73634+ON_DEBUG(void
73635+ count_jnode(txn_atom *, jnode *, atom_list old_list,
73636+ atom_list new_list, int check_lists));
73637+
73638+/* A transaction handle: the client obtains and commits this handle which is assigned by
73639+ the system to a txn_atom. */
73640+struct txn_handle {
73641+ /* Spinlock protecting ->atom pointer */
73642+ spinlock_t hlock;
73643+
73644+ /* Flags for controlling commit_txnh() behavior */
73645+ /* from txn_handle_flags_t */
73646+ txn_handle_flags_t flags;
73647+
73648+ /* Whether it is READ_FUSING or WRITE_FUSING. */
73649+ txn_mode mode;
73650+
73651+ /* If assigned, the atom it is part of. */
73652+ txn_atom *atom;
73653+
73654+ /* Transaction list link. Head is in txn_atom. */
73655+ struct list_head txnh_link;
73656+};
73657+
73658+/* The transaction manager: one is contained in the reiser4_super_info_data */
73659+struct txn_mgr {
73660+ /* A spinlock protecting the atom list, id_count, flush_control */
73661+ spinlock_t tmgr_lock;
73662+
73663+ /* List of atoms. */
73664+ struct list_head atoms_list;
73665+
73666+ /* Number of atoms. */
73667+ int atom_count;
73668+
73669+ /* A counter used to assign atom->atom_id values. */
73670+ __u32 id_count;
73671+
73672+ /* a mutex object for commit serialization */
73673+ struct mutex commit_mutex;
73674+
73675+	/* a list of all txnmgrs served by a particular daemon. */
73676+ struct list_head linkage;
73677+
73678+ /* description of daemon for this txnmgr */
73679+ ktxnmgrd_context *daemon;
73680+
73681+ /* parameters. Adjustable through mount options. */
73682+ unsigned int atom_max_size;
73683+ unsigned int atom_max_age;
73684+ unsigned int atom_min_size;
73685+ /* max number of concurrent flushers for one atom, 0 - unlimited. */
73686+ unsigned int atom_max_flushers;
73687+ struct dentry *debugfs_atom_count;
73688+ struct dentry *debugfs_id_count;
73689+};
73690+
73691+/* FUNCTION DECLARATIONS */
73692+
73693+/* These are the externally (within Reiser4) visible transaction functions; therefore they
73694+   are prefixed with "txn_". For comments, see txnmgr.c. */
73695+
73696+extern int init_txnmgr_static(void);
73697+extern void done_txnmgr_static(void);
73698+
73699+extern void reiser4_init_txnmgr(txn_mgr *);
73700+extern void reiser4_done_txnmgr(txn_mgr *);
73701+
73702+extern int reiser4_txn_reserve(int reserved);
73703+
73704+extern void reiser4_txn_begin(reiser4_context * context);
73705+extern int reiser4_txn_end(reiser4_context * context);
73706+
73707+extern void reiser4_txn_restart(reiser4_context * context);
73708+extern void reiser4_txn_restart_current(void);
73709+
73710+extern int txnmgr_force_commit_all(struct super_block *, int);
73711+extern int current_atom_should_commit(void);
73712+
73713+extern jnode *find_first_dirty_jnode(txn_atom *, int);
73714+
73715+extern int commit_some_atoms(txn_mgr *);
73716+extern int force_commit_atom(txn_handle *);
73717+extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
73718+
73719+extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
73720+
73721+extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
73722+
73723+extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
73724+ int alloc_value);
73725+extern void atom_dec_and_unlock(txn_atom * atom);
73726+
73727+extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
73728+extern int try_capture_page_to_invalidate(struct page *pg);
73729+
73730+extern void reiser4_uncapture_page(struct page *pg);
73731+extern void reiser4_uncapture_block(jnode *);
73732+extern void reiser4_uncapture_jnode(jnode *);
73733+
73734+extern int reiser4_capture_inode(struct inode *);
73735+extern int reiser4_uncapture_inode(struct inode *);
73736+
73737+extern txn_atom *get_current_atom_locked_nocheck(void);
73738+
73739+#if REISER4_DEBUG
73740+
73741+/**
73742+ * atom_is_protected - make sure that nobody but us can do anything with atom
73743+ * @atom: atom to be checked
73744+ *
73745+ * This is used to assert that atom either entered commit stages or is spin
73746+ * locked.
73747+ */
73748+static inline int atom_is_protected(txn_atom *atom)
73749+{
73750+ if (atom->stage >= ASTAGE_PRE_COMMIT)
73751+ return 1;
73752+ assert_spin_locked(&(atom->alock));
73753+ return 1;
73754+}
73755+
73756+#endif
73757+
73758+/* Get the current atom and spin-lock it. The current atom must be present, so this never returns NULL. */
73759+static inline txn_atom *get_current_atom_locked(void)
73760+{
73761+ txn_atom *atom;
73762+
73763+ atom = get_current_atom_locked_nocheck();
73764+ assert("zam-761", atom != NULL);
73765+
73766+ return atom;
73767+}
73768+
73769+extern txn_atom *jnode_get_atom(jnode *);
73770+
73771+extern void reiser4_atom_wait_event(txn_atom *);
73772+extern void reiser4_atom_send_event(txn_atom *);
73773+
73774+extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
73775+extern int reiser4_capture_super_block(struct super_block *s);
73776+int capture_bulk(jnode **, int count);
73777+
73778+/* See the comment on the function blocknrset.c:blocknr_set_add for the
73779+ calling convention of these three routines. */
73780+extern void blocknr_set_init(struct list_head * bset);
73781+extern void blocknr_set_destroy(struct list_head * bset);
73782+extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
73783+extern int blocknr_set_add_extent(txn_atom * atom,
73784+ struct list_head * bset,
73785+ blocknr_set_entry ** new_bsep,
73786+ const reiser4_block_nr * start,
73787+ const reiser4_block_nr * len);
73788+extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
73789+ blocknr_set_entry ** new_bsep,
73790+ const reiser4_block_nr * a,
73791+ const reiser4_block_nr * b);
73792+
73793+typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
73794+ const reiser4_block_nr *, void *);
73795+
73796+extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
73797+ blocknr_set_actor_f actor, void *data,
73798+ int delete);
73799+
73800+/* flush code takes care about how to fuse flush queues */
73801+extern void flush_init_atom(txn_atom * atom);
73802+extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
73803+
73804+static inline void spin_lock_atom(txn_atom *atom)
73805+{
73806+ /* check that spinlocks of lower priorities are not held */
73807+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
73808+ LOCK_CNT_NIL(spin_locked_atom) &&
73809+ LOCK_CNT_NIL(spin_locked_jnode) &&
73810+ LOCK_CNT_NIL(spin_locked_zlock) &&
73811+ LOCK_CNT_NIL(rw_locked_dk) &&
73812+ LOCK_CNT_NIL(rw_locked_tree)));
73813+
73814+ spin_lock(&(atom->alock));
73815+
73816+ LOCK_CNT_INC(spin_locked_atom);
73817+ LOCK_CNT_INC(spin_locked);
73818+}
73819+
73820+static inline void spin_lock_atom_nested(txn_atom *atom)
73821+{
73822+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
73823+ LOCK_CNT_NIL(spin_locked_jnode) &&
73824+ LOCK_CNT_NIL(spin_locked_zlock) &&
73825+ LOCK_CNT_NIL(rw_locked_dk) &&
73826+ LOCK_CNT_NIL(rw_locked_tree)));
73827+
73828+ spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);
73829+
73830+ LOCK_CNT_INC(spin_locked_atom);
73831+ LOCK_CNT_INC(spin_locked);
73832+}
73833+
73834+static inline int spin_trylock_atom(txn_atom *atom)
73835+{
73836+ if (spin_trylock(&(atom->alock))) {
73837+ LOCK_CNT_INC(spin_locked_atom);
73838+ LOCK_CNT_INC(spin_locked);
73839+ return 1;
73840+ }
73841+ return 0;
73842+}
73843+
73844+static inline void spin_unlock_atom(txn_atom *atom)
73845+{
73846+ assert_spin_locked(&(atom->alock));
73847+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
73848+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
73849+
73850+ LOCK_CNT_DEC(spin_locked_atom);
73851+ LOCK_CNT_DEC(spin_locked);
73852+
73853+ spin_unlock(&(atom->alock));
73854+}
73855+
73856+static inline void spin_lock_txnh(txn_handle *txnh)
73857+{
73858+ /* check that spinlocks of lower priorities are not held */
73859+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
73860+ LOCK_CNT_NIL(spin_locked_zlock) &&
73861+ LOCK_CNT_NIL(rw_locked_tree)));
73862+
73863+ spin_lock(&(txnh->hlock));
73864+
73865+ LOCK_CNT_INC(spin_locked_txnh);
73866+ LOCK_CNT_INC(spin_locked);
73867+}
73868+
73869+static inline int spin_trylock_txnh(txn_handle *txnh)
73870+{
73871+ if (spin_trylock(&(txnh->hlock))) {
73872+ LOCK_CNT_INC(spin_locked_txnh);
73873+ LOCK_CNT_INC(spin_locked);
73874+ return 1;
73875+ }
73876+ return 0;
73877+}
73878+
73879+static inline void spin_unlock_txnh(txn_handle *txnh)
73880+{
73881+ assert_spin_locked(&(txnh->hlock));
73882+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
73883+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
73884+
73885+ LOCK_CNT_DEC(spin_locked_txnh);
73886+ LOCK_CNT_DEC(spin_locked);
73887+
73888+ spin_unlock(&(txnh->hlock));
73889+}
73890+
73891+#define spin_ordering_pred_txnmgr(tmgr) \
73892+ ( LOCK_CNT_NIL(spin_locked_atom) && \
73893+ LOCK_CNT_NIL(spin_locked_txnh) && \
73894+ LOCK_CNT_NIL(spin_locked_jnode) && \
73895+ LOCK_CNT_NIL(rw_locked_zlock) && \
73896+ LOCK_CNT_NIL(rw_locked_dk) && \
73897+ LOCK_CNT_NIL(rw_locked_tree) )
73898+
73899+static inline void spin_lock_txnmgr(txn_mgr *mgr)
73900+{
73901+ /* check that spinlocks of lower priorities are not held */
73902+ assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
73903+ LOCK_CNT_NIL(spin_locked_txnh) &&
73904+ LOCK_CNT_NIL(spin_locked_jnode) &&
73905+ LOCK_CNT_NIL(spin_locked_zlock) &&
73906+ LOCK_CNT_NIL(rw_locked_dk) &&
73907+ LOCK_CNT_NIL(rw_locked_tree)));
73908+
73909+ spin_lock(&(mgr->tmgr_lock));
73910+
73911+ LOCK_CNT_INC(spin_locked_txnmgr);
73912+ LOCK_CNT_INC(spin_locked);
73913+}
73914+
73915+static inline int spin_trylock_txnmgr(txn_mgr *mgr)
73916+{
73917+ if (spin_trylock(&(mgr->tmgr_lock))) {
73918+ LOCK_CNT_INC(spin_locked_txnmgr);
73919+ LOCK_CNT_INC(spin_locked);
73920+ return 1;
73921+ }
73922+ return 0;
73923+}
73924+
73925+static inline void spin_unlock_txnmgr(txn_mgr *mgr)
73926+{
73927+ assert_spin_locked(&(mgr->tmgr_lock));
73928+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
73929+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
73930+
73931+ LOCK_CNT_DEC(spin_locked_txnmgr);
73932+ LOCK_CNT_DEC(spin_locked);
73933+
73934+ spin_unlock(&(mgr->tmgr_lock));
73935+}
73936+
73937+typedef enum {
73938+ FQ_IN_USE = 0x1
73939+} flush_queue_state_t;
73940+
73941+typedef struct flush_queue flush_queue_t;
73942+
73943+/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
73944+ is filled by the jnode_flush() routine, and written to disk under memory
73945+ pressure or at atom commit time. */
73946+/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued
73947+   field and fq->prepped list can be modified if the atom is spin-locked and the
73948+   fq object is in the "in-use" state. For read-only traversal of the fq->prepped list
73949+ and reading of the fq->nr_queued field it is enough to keep fq "in-use" or
73950+ only have atom spin-locked. */
73951+struct flush_queue {
73952+ /* linkage element is the first in this structure to make debugging
73953+ easier. See field in atom struct for description of list. */
73954+ struct list_head alink;
73955+ /* A spinlock to protect changes of fq state and fq->atom pointer */
73956+ spinlock_t guard;
73957+ /* flush_queue state: [in_use | ready] */
73958+ flush_queue_state_t state;
73959+ /* A list which contains queued nodes, queued nodes are removed from any
73960+ * atom's list and put on this ->prepped one. */
73961+ struct list_head prepped;
73962+ /* number of submitted i/o requests */
73963+ atomic_t nr_submitted;
73964+ /* number of i/o errors */
73965+ atomic_t nr_errors;
73966+ /* An atom this flush queue is attached to */
73967+ txn_atom *atom;
73968+ /* A wait queue head to wait on i/o completion */
73969+ wait_queue_head_t wait;
73970+#if REISER4_DEBUG
73971+ /* A thread which took this fq in exclusive use, NULL if fq is free,
73972+ * used for debugging. */
73973+ struct task_struct *owner;
73974+#endif
73975+};
73976+
73977+extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
73978+extern void reiser4_fq_put_nolock(flush_queue_t *);
73979+extern void reiser4_fq_put(flush_queue_t *);
73980+extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
73981+extern void queue_jnode(flush_queue_t *, jnode *);
73982+
73983+extern int reiser4_write_fq(flush_queue_t *, long *, int);
73984+extern int current_atom_finish_all_fq(void);
73985+extern void init_atom_fq_parts(txn_atom *);
73986+
73987+extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
73988+
73989+extern void znode_make_dirty(znode * node);
73990+extern void jnode_make_dirty_locked(jnode * node);
73991+
73992+extern int reiser4_sync_atom(txn_atom * atom);
73993+
73994+#if REISER4_DEBUG
73995+extern int atom_fq_parts_are_clean(txn_atom *);
73996+#endif
73997+
73998+extern void add_fq_to_bio(flush_queue_t *, struct bio *);
73999+extern flush_queue_t *get_fq_for_current_atom(void);
74000+
74001+void reiser4_invalidate_list(struct list_head * head);
74002+
74003+#endif /* __REISER4_TXNMGR_H__ */
74004+
74005+/* Make Linus happy.
74006+ Local variables:
74007+ c-indentation-style: "K&R"
74008+ mode-name: "LC"
74009+ c-basic-offset: 8
74010+ tab-width: 8
74011+ fill-column: 120
74012+ End:
74013+*/
74014diff -urN linux-2.6.23.orig/fs/reiser4/type_safe_hash.h linux-2.6.23/fs/reiser4/type_safe_hash.h
74015--- linux-2.6.23.orig/fs/reiser4/type_safe_hash.h 1970-01-01 03:00:00.000000000 +0300
74016+++ linux-2.6.23/fs/reiser4/type_safe_hash.h 2007-12-04 16:49:30.000000000 +0300
74017@@ -0,0 +1,320 @@
74018+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74019+ * reiser4/README */
74020+
74021+/* A hash table class that uses hash chains (singly-linked) and is
74022+ parametrized to provide type safety. */
74023+
74024+#ifndef __REISER4_TYPE_SAFE_HASH_H__
74025+#define __REISER4_TYPE_SAFE_HASH_H__
74026+
74027+#include "debug.h"
74028+
74029+#include <asm/errno.h>
74030+/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
74031+ based on the object type. You need to declare the item type before
74032+ this definition, define it after this definition. */
74033+#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \
74034+ \
74035+typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \
74036+typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \
74037+ \
74038+struct PREFIX##_hash_table_ \
74039+{ \
74040+ ITEM_TYPE **_table; \
74041+ __u32 _buckets; \
74042+}; \
74043+ \
74044+struct PREFIX##_hash_link_ \
74045+{ \
74046+ ITEM_TYPE *_next; \
74047+}
74048+
74049+/* Step 2: Define the object type of the hash: give it field of type
74050+ PREFIX_hash_link. */
74051+
74052+/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
74053+   the type and field name used in step 2. The arguments are:
74054+
74055+ ITEM_TYPE The item type being hashed
74056+ KEY_TYPE The type of key being hashed
74057+ KEY_NAME The name of the key field within the item
74058+   LINK_NAME The name of the link field within the item, which you must make of type PREFIX_hash_link
74059+ HASH_FUNC The name of the hash function (or macro, takes const pointer to key)
74060+ EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys)
74061+
74062+ It implements these functions:
74063+
74064+ prefix_hash_init Initialize the table given its size.
74065+ prefix_hash_insert Insert an item
74066+ prefix_hash_insert_index Insert an item w/ precomputed hash_index
74067+ prefix_hash_find Find an item by key
74068+ prefix_hash_find_index Find an item w/ precomputed hash_index
74069+ prefix_hash_remove Remove an item, returns 1 if found, 0 if not found
74070+ prefix_hash_remove_index Remove an item w/ precomputed hash_index
74071+
74072+ If you'd like something to be done differently, feel free to ask me
74073+ for modifications. Additional features that could be added but
74074+ have not been:
74075+
74076+ prefix_hash_remove_key Find and remove an item by key
74077+ prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index
74078+
74079+   The hash function receives the hash table as well as the key as
74080+   arguments, so it can obtain the number of buckets from the
74081+   table.
74082+
74083+   This hash table uses a singly-linked hash chain. This means
74084+ insertion is fast but deletion requires searching the chain.
74085+
74086+ There is also the doubly-linked hash chain approach, under which
74087+ deletion requires no search but the code is longer and it takes two
74088+ pointers per item.
74089+
74090+ The circularly-linked approach has the shortest code but requires
74091+ two pointers per bucket, doubling the size of the bucket array (in
74092+ addition to two pointers per item).
74093+*/
74094+#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \
74095+ \
74096+static __inline__ void \
74097+PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \
74098+ __u32 hash UNUSED_ARG) \
74099+{ \
74100+ assert("nikita-2780", hash < table->_buckets); \
74101+} \
74102+ \
74103+static __inline__ int \
74104+PREFIX##_hash_init (PREFIX##_hash_table *hash, \
74105+ __u32 buckets) \
74106+{ \
74107+ hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \
74108+ hash->_buckets = buckets; \
74109+ if (hash->_table == NULL) \
74110+ { \
74111+ return RETERR(-ENOMEM); \
74112+ } \
74113+ memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \
74114+ ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \
74115+ return 0; \
74116+} \
74117+ \
74118+static __inline__ void \
74119+PREFIX##_hash_done (PREFIX##_hash_table *hash) \
74120+{ \
74121+ if (REISER4_DEBUG && hash->_table != NULL) { \
74122+ __u32 i; \
74123+ for (i = 0 ; i < hash->_buckets ; ++ i) \
74124+ assert("nikita-2905", hash->_table[i] == NULL); \
74125+ } \
74126+ if (hash->_table != NULL) \
74127+ KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \
74128+ hash->_table = NULL; \
74129+} \
74130+ \
74131+static __inline__ void \
74132+PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \
74133+{ \
74134+ prefetch(item->LINK_NAME._next); \
74135+} \
74136+ \
74137+static __inline__ void \
74138+PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \
74139+ __u32 index) \
74140+{ \
74141+ prefetch(hash->_table[index]); \
74142+} \
74143+ \
74144+static __inline__ ITEM_TYPE* \
74145+PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \
74146+ __u32 hash_index, \
74147+ KEY_TYPE const *find_key) \
74148+{ \
74149+ ITEM_TYPE *item; \
74150+ \
74151+ PREFIX##_check_hash(hash, hash_index); \
74152+ \
74153+ for (item = hash->_table[hash_index]; \
74154+ item != NULL; \
74155+ item = item->LINK_NAME._next) \
74156+ { \
74157+ prefetch(item->LINK_NAME._next); \
74158+ prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \
74159+ if (EQ_FUNC (& item->KEY_NAME, find_key)) \
74160+ { \
74161+ return item; \
74162+ } \
74163+ } \
74164+ \
74165+ return NULL; \
74166+} \
74167+ \
74168+static __inline__ ITEM_TYPE* \
74169+PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \
74170+ __u32 hash_index, \
74171+ KEY_TYPE const *find_key) \
74172+{ \
74173+ ITEM_TYPE ** item = &hash->_table[hash_index]; \
74174+ \
74175+ PREFIX##_check_hash(hash, hash_index); \
74176+ \
74177+ while (*item != NULL) { \
74178+ prefetch(&(*item)->LINK_NAME._next); \
74179+ if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \
74180+ ITEM_TYPE *found; \
74181+ \
74182+ found = *item; \
74183+ *item = found->LINK_NAME._next; \
74184+ found->LINK_NAME._next = hash->_table[hash_index]; \
74185+ hash->_table[hash_index] = found; \
74186+ return found; \
74187+ } \
74188+ item = &(*item)->LINK_NAME._next; \
74189+ } \
74190+ return NULL; \
74191+} \
74192+ \
74193+static __inline__ int \
74194+PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \
74195+ __u32 hash_index, \
74196+ ITEM_TYPE *del_item) \
74197+{ \
74198+ ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \
74199+ \
74200+ PREFIX##_check_hash(hash, hash_index); \
74201+ \
74202+ while (*hash_item_p != NULL) { \
74203+ prefetch(&(*hash_item_p)->LINK_NAME._next); \
74204+ if (*hash_item_p == del_item) { \
74205+ *hash_item_p = (*hash_item_p)->LINK_NAME._next; \
74206+ return 1; \
74207+ } \
74208+ hash_item_p = &(*hash_item_p)->LINK_NAME._next; \
74209+ } \
74210+ return 0; \
74211+} \
74212+ \
74213+static __inline__ void \
74214+PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \
74215+ __u32 hash_index, \
74216+ ITEM_TYPE *ins_item) \
74217+{ \
74218+ PREFIX##_check_hash(hash, hash_index); \
74219+ \
74220+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \
74221+ hash->_table[hash_index] = ins_item; \
74222+} \
74223+ \
74224+static __inline__ void \
74225+PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \
74226+ __u32 hash_index, \
74227+ ITEM_TYPE *ins_item) \
74228+{ \
74229+ PREFIX##_check_hash(hash, hash_index); \
74230+ \
74231+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \
74232+ smp_wmb(); \
74233+ hash->_table[hash_index] = ins_item; \
74234+} \
74235+ \
74236+static __inline__ ITEM_TYPE* \
74237+PREFIX##_hash_find (PREFIX##_hash_table *hash, \
74238+ KEY_TYPE const *find_key) \
74239+{ \
74240+ return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \
74241+} \
74242+ \
74243+static __inline__ ITEM_TYPE* \
74244+PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \
74245+ KEY_TYPE const *find_key) \
74246+{ \
74247+ return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \
74248+} \
74249+ \
74250+static __inline__ int \
74251+PREFIX##_hash_remove (PREFIX##_hash_table *hash, \
74252+ ITEM_TYPE *del_item) \
74253+{ \
74254+ return PREFIX##_hash_remove_index (hash, \
74255+ HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \
74256+} \
74257+ \
74258+static __inline__ int \
74259+PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \
74260+ ITEM_TYPE *del_item) \
74261+{ \
74262+ return PREFIX##_hash_remove (hash, del_item); \
74263+} \
74264+ \
74265+static __inline__ void \
74266+PREFIX##_hash_insert (PREFIX##_hash_table *hash, \
74267+ ITEM_TYPE *ins_item) \
74268+{ \
74269+ return PREFIX##_hash_insert_index (hash, \
74270+ HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \
74271+} \
74272+ \
74273+static __inline__ void \
74274+PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \
74275+ ITEM_TYPE *ins_item) \
74276+{ \
74277+ return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \
74278+ ins_item); \
74279+} \
74280+ \
74281+static __inline__ ITEM_TYPE * \
74282+PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \
74283+{ \
74284+ ITEM_TYPE *first; \
74285+ \
74286+ for (first = NULL; ind < hash->_buckets; ++ ind) { \
74287+ first = hash->_table[ind]; \
74288+ if (first != NULL) \
74289+ break; \
74290+ } \
74291+ return first; \
74292+} \
74293+ \
74294+static __inline__ ITEM_TYPE * \
74295+PREFIX##_hash_next (PREFIX##_hash_table *hash, \
74296+ ITEM_TYPE *item) \
74297+{ \
74298+ ITEM_TYPE *next; \
74299+ \
74300+ if (item == NULL) \
74301+ return NULL; \
74302+ next = item->LINK_NAME._next; \
74303+ if (next == NULL) \
74304+ next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \
74305+ return next; \
74306+} \
74307+ \
74308+typedef struct {} PREFIX##_hash_dummy
74309+
74310+#define for_all_ht_buckets(table, head) \
74311+for ((head) = &(table) -> _table[ 0 ] ; \
74312+ (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
74313+
74314+#define for_all_in_bucket(bucket, item, next, field) \
74315+for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \
74316+ (item) != NULL ; \
74317+ (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
74318+
74319+#define for_all_in_htable(table, prefix, item, next) \
74320+for ((item) = prefix ## _hash_first ((table), 0), \
74321+ (next) = prefix ## _hash_next ((table), (item)) ; \
74322+ (item) != NULL ; \
74323+ (item) = (next), \
74324+ (next) = prefix ## _hash_next ((table), (item)))
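
   Putting the three steps together, here is a hypothetical instantiation of
   the interface; the item type, key type, and hash/equality functions below
   are invented for illustration and appear nowhere in reiser4:

	typedef struct demo_item demo_item;

	/* Step 1: declare the table and link types for the item. */
	TYPE_SAFE_HASH_DECLARE(demo, demo_item);

	/* Step 2: the item carries its key and a link field. */
	struct demo_item {
		__u32 key;
		demo_hash_link link;
	};

	static __u32 demo_hashfn(demo_hash_table *table, const __u32 *key)
	{
		return *key % table->_buckets;
	}

	static int demo_eqfn(const __u32 *a, const __u32 *b)
	{
		return *a == *b;
	}

	/* Step 3: generate demo_hash_init/insert/find/remove and friends. */
	TYPE_SAFE_HASH_DEFINE(demo, demo_item, __u32, key, link,
			      demo_hashfn, demo_eqfn);

   After this, demo_hash_init(&table, nr_buckets), demo_hash_insert(&table,
   item) and demo_hash_find(&table, &key) become available; note that
   demo_hash_init() allocates through the reiser4-internal KMALLOC wrapper
   used in the macro above.
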
74325+
74326+/* __REISER4_TYPE_SAFE_HASH_H__ */
74327+#endif
74328+
74329+/* Make Linus happy.
74330+ Local variables:
74331+ c-indentation-style: "K&R"
74332+ mode-name: "LC"
74333+ c-basic-offset: 8
74334+ tab-width: 8
74335+ fill-column: 120
74336+ End:
74337+*/
74338diff -urN linux-2.6.23.orig/fs/reiser4/vfs_ops.c linux-2.6.23/fs/reiser4/vfs_ops.c
74339--- linux-2.6.23.orig/fs/reiser4/vfs_ops.c 1970-01-01 03:00:00.000000000 +0300
74340+++ linux-2.6.23/fs/reiser4/vfs_ops.c 2007-12-04 16:49:30.000000000 +0300
74341@@ -0,0 +1,259 @@
74342+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74343+ * reiser4/README */
74344+
74345+/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
74346+ here. */
74347+
74348+#include "forward.h"
74349+#include "debug.h"
74350+#include "dformat.h"
74351+#include "coord.h"
74352+#include "plugin/item/item.h"
74353+#include "plugin/file/file.h"
74354+#include "plugin/security/perm.h"
74355+#include "plugin/disk_format/disk_format.h"
74356+#include "plugin/plugin.h"
74357+#include "plugin/plugin_set.h"
74358+#include "plugin/object.h"
74359+#include "txnmgr.h"
74360+#include "jnode.h"
74361+#include "znode.h"
74362+#include "block_alloc.h"
74363+#include "tree.h"
74364+#include "vfs_ops.h"
74365+#include "inode.h"
74366+#include "page_cache.h"
74367+#include "ktxnmgrd.h"
74368+#include "super.h"
74369+#include "reiser4.h"
74370+#include "entd.h"
74371+#include "status_flags.h"
74372+#include "flush.h"
74373+#include "dscale.h"
74374+
74375+#include <linux/profile.h>
74376+#include <linux/types.h>
74377+#include <linux/mount.h>
74378+#include <linux/vfs.h>
74379+#include <linux/mm.h>
74380+#include <linux/buffer_head.h>
74381+#include <linux/dcache.h>
74382+#include <linux/list.h>
74383+#include <linux/pagemap.h>
74384+#include <linux/slab.h>
74385+#include <linux/seq_file.h>
74386+#include <linux/init.h>
74387+#include <linux/module.h>
74388+#include <linux/writeback.h>
74389+#include <linux/blkdev.h>
74390+#include <linux/quotaops.h>
74391+#include <linux/security.h>
74392+#include <linux/reboot.h>
74393+#include <linux/rcupdate.h>
74394+
74395+/* update inode stat-data by calling plugin */
74396+int reiser4_update_sd(struct inode *object)
74397+{
74398+ file_plugin *fplug;
74399+
74400+ assert("nikita-2338", object != NULL);
74401+ /* check for read-only file system. */
74402+ if (IS_RDONLY(object))
74403+ return 0;
74404+
74405+ fplug = inode_file_plugin(object);
74406+ assert("nikita-2339", fplug != NULL);
74407+ return fplug->write_sd_by_inode(object);
74408+}
74409+
74410+/* helper function: increase inode nlink count and call plugin method to save
74411+ updated stat-data.
74412+
74413+ Used by link/create and during creation of dot and dotdot in mkdir
74414+*/
74415+int reiser4_add_nlink(struct inode *object /* object to which link is added */,
74416+		      struct inode *parent /* parent where new entry
74417+					    * will be */,
74418+		      int write_sd_p	/* true if stat-data has to be
74419+					 * updated */ )
74420+{
74421+ file_plugin *fplug;
74422+ int result;
74423+
74424+ assert("nikita-1351", object != NULL);
74425+
74426+ fplug = inode_file_plugin(object);
74427+ assert("nikita-1445", fplug != NULL);
74428+
74429+ /* ask plugin whether it can add yet another link to this
74430+ object */
74431+ if (!fplug->can_add_link(object))
74432+ return RETERR(-EMLINK);
74433+
74434+ assert("nikita-2211", fplug->add_link != NULL);
74435+ /* call plugin to do actual addition of link */
74436+ result = fplug->add_link(object, parent);
74437+
74438+ /* optionally update stat data */
74439+ if (result == 0 && write_sd_p)
74440+ result = fplug->write_sd_by_inode(object);
74441+ return result;
74442+}
74443+
74444+/* helper function: decrease inode nlink count and call plugin method to save
74445+ updated stat-data.
74446+
74447+ Used by unlink/create
74448+*/
74449+int reiser4_del_nlink(struct inode *object /* object from which link is
74450+ * removed */ ,
74451+ struct inode *parent /* parent where entry was */ ,
74452+		      int write_sd_p	/* true if stat-data has to be
74453+ * updated */ )
74454+{
74455+ file_plugin *fplug;
74456+ int result;
74457+
74458+ assert("nikita-1349", object != NULL);
74459+
74460+ fplug = inode_file_plugin(object);
74461+ assert("nikita-1350", fplug != NULL);
74462+ assert("nikita-1446", object->i_nlink > 0);
74463+ assert("nikita-2210", fplug->rem_link != NULL);
74464+
74465+ /* call plugin to do actual deletion of link */
74466+ result = fplug->rem_link(object, parent);
74467+
74468+ /* optionally update stat data */
74469+ if (result == 0 && write_sd_p)
74470+ result = fplug->write_sd_by_inode(object);
74471+ return result;
74472+}
74473+
74474+/* Release reiser4 dentry. This is d_op->d_release() method. */
74475+static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
74476+{
74477+ reiser4_free_dentry_fsdata(dentry);
74478+}
74479+
74480+/*
74481+ * Called by reiser4_sync_inodes(), during speculative write-back (through
74482+ * pdflush, or balance_dirty_pages()).
74483+ */
74484+void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
74485+{
74486+ long written = 0;
74487+ int repeats = 0;
74488+ int result;
74489+ struct address_space *mapping;
74490+
74491+ /*
74492+ * Performs early flushing, trying to free some memory. If there is
74493+ * nothing to flush, commits some atoms.
74494+ */
74495+
74496+ /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
74497+ sys_fsync(). */
74498+ if (wbc->sync_mode != WB_SYNC_NONE) {
74499+ txnmgr_force_commit_all(sb, 0);
74500+ return;
74501+ }
74502+
74503+ BUG_ON(reiser4_get_super_fake(sb) == NULL);
74504+ mapping = reiser4_get_super_fake(sb)->i_mapping;
74505+ do {
74506+ long nr_submitted = 0;
74507+ jnode *node = NULL;
74508+
74509+		/* do not submit more requests, to avoid overloading the write queue */
74510+ if (wbc->nonblocking &&
74511+ bdi_write_congested(mapping->backing_dev_info)) {
74512+ blk_run_address_space(mapping);
74513+ wbc->encountered_congestion = 1;
74514+ break;
74515+ }
74516+ repeats++;
74517+ BUG_ON(wbc->nr_to_write <= 0);
74518+
74519+ if (get_current_context()->entd) {
74520+ entd_context *ent = get_entd_context(sb);
74521+
74522+ if (ent->cur_request->node)
74523+ /*
74524+ * this is ent thread and it managed to capture
74525+ * requested page itself - start flush from
74526+ * that page
74527+ */
74528+ node = jref(ent->cur_request->node);
74529+ }
74530+
74531+ result = flush_some_atom(node, &nr_submitted, wbc,
74532+ JNODE_FLUSH_WRITE_BLOCKS);
74533+ if (result != 0)
74534+ warning("nikita-31001", "Flush failed: %i", result);
74535+ if (node)
74536+ jput(node);
74537+ if (!nr_submitted)
74538+ break;
74539+
74540+ wbc->nr_to_write -= nr_submitted;
74541+ written += nr_submitted;
74542+ } while (wbc->nr_to_write > 0);
74543+}
74544+
74545+void reiser4_throttle_write(struct inode *inode)
74546+{
74547+ reiser4_txn_restart_current();
74548+ balance_dirty_pages_ratelimited(inode->i_mapping);
74549+}
74550+
74551+const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
74552+const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
74553+ * beginning of device */
74554+
74555+/*
74556+ * Reiser4 initialization/shutdown.
74557+ *
74558+ * Code below performs global reiser4 initialization that is done either as
74559+ * part of kernel initialization (when reiser4 is statically built-in), or
74560+ * during reiser4 module load (when compiled as module).
74561+ */
74562+
74563+void reiser4_handle_error(void)
74564+{
74565+ struct super_block *sb = reiser4_get_current_sb();
74566+
74567+ if (!sb)
74568+ return;
74569+ reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
74570+ "Filesystem error occured");
74571+ switch (get_super_private(sb)->onerror) {
74572+ case 0:
74573+ reiser4_panic("foobar-42", "Filesystem error occured\n");
74574+ case 1:
74575+ default:
74576+ if (sb->s_flags & MS_RDONLY)
74577+ return;
74578+ sb->s_flags |= MS_RDONLY;
74579+ break;
74580+ }
74581+}
74582+
74583+struct dentry_operations reiser4_dentry_operations = {
74584+ .d_revalidate = NULL,
74585+ .d_hash = NULL,
74586+ .d_compare = NULL,
74587+ .d_delete = NULL,
74588+ .d_release = reiser4_d_release,
74589+ .d_iput = NULL,
74590+};
74591+
74592+/* Make Linus happy.
74593+ Local variables:
74594+ c-indentation-style: "K&R"
74595+ mode-name: "LC"
74596+ c-basic-offset: 8
74597+ tab-width: 8
74598+ fill-column: 120
74599+ End:
74600+*/
74601diff -urN linux-2.6.23.orig/fs/reiser4/vfs_ops.h linux-2.6.23/fs/reiser4/vfs_ops.h
74602--- linux-2.6.23.orig/fs/reiser4/vfs_ops.h 1970-01-01 03:00:00.000000000 +0300
74603+++ linux-2.6.23/fs/reiser4/vfs_ops.h 2007-12-04 16:49:30.000000000 +0300
74604@@ -0,0 +1,53 @@
74605+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74606+ * reiser4/README */
74607+
74608+/* vfs_ops.c's exported symbols */
74609+
74610+#if !defined( __FS_REISER4_VFS_OPS_H__ )
74611+#define __FS_REISER4_VFS_OPS_H__
74612+
74613+#include "forward.h"
74614+#include "coord.h"
74615+#include "seal.h"
74616+#include "plugin/file/file.h"
74617+#include "super.h"
74618+#include "readahead.h"
74619+
74620+#include <linux/types.h> /* for loff_t */
74621+#include <linux/fs.h> /* for struct address_space */
74622+#include <linux/dcache.h> /* for struct dentry */
74623+#include <linux/mm.h>
74624+#include <linux/backing-dev.h>
74625+
74626+/* address space operations */
74627+int reiser4_writepage(struct page *, struct writeback_control *);
74628+int reiser4_set_page_dirty(struct page *);
74629+void reiser4_invalidatepage(struct page *, unsigned long offset);
74630+int reiser4_releasepage(struct page *, gfp_t);
74631+
74632+extern int reiser4_update_sd(struct inode *);
74633+extern int reiser4_add_nlink(struct inode *, struct inode *, int);
74634+extern int reiser4_del_nlink(struct inode *, struct inode *, int);
74635+
74636+extern int reiser4_start_up_io(struct page *page);
74637+extern void reiser4_throttle_write(struct inode *);
74638+extern int jnode_is_releasable(jnode *);
74639+
74640+#define CAPTURE_APAGE_BURST (1024l)
74641+void reiser4_writeout(struct super_block *, struct writeback_control *);
74642+
74643+extern void reiser4_handle_error(void);
74644+
74645+/* __FS_REISER4_VFS_OPS_H__ */
74646+#endif
74647+
74648+/* Make Linus happy.
74649+ Local variables:
74650+ c-indentation-style: "K&R"
74651+ mode-name: "LC"
74652+ c-basic-offset: 8
74653+ tab-width: 8
74654+ fill-column: 120
74655+ scroll-step: 1
74656+ End:
74657+*/
74658diff -urN linux-2.6.23.orig/fs/reiser4/wander.c linux-2.6.23/fs/reiser4/wander.c
74659--- linux-2.6.23.orig/fs/reiser4/wander.c 1970-01-01 03:00:00.000000000 +0300
74660+++ linux-2.6.23/fs/reiser4/wander.c 2007-12-04 16:49:30.000000000 +0300
74661@@ -0,0 +1,1797 @@
74662+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74663+ * reiser4/README */
74664+
74665+/* Reiser4 Wandering Log */
74666+
74667+/* You should read http://www.namesys.com/txn-doc.html
74668+
74669+ That describes how filesystem operations are performed as atomic
74670+ transactions, and how we try to arrange it so that we can write most of the
74671+ data only once while performing the operation atomically.
74672+
74673+ For the purposes of this code, it is enough to understand that it is told a
74674+ given block should be written either once or twice (if twice, then once to
74675+ the wandered location and once to the real location).
74676+
74677+ This code guarantees that those blocks that are defined to be part of an
74678+ atom either all take effect or none of them take effect.
74679+
74680+ Relocate set nodes are submitted to write by the jnode_flush() routine, and
74681+ the overwrite set is submitted by reiser4_write_logs(). This is because with
74682+ the overwrite set we seek to optimize writes, and with the relocate set we
74683+ seek to cause disk order to correlate with the parent first pre-order.
74684+
74685+ reiser4_write_logs() allocates and writes wandered blocks and maintains
74686+ additional on-disk structures of the atom as wander records (each wander
74687+ record occupies one block) for storing the "wandered map" (a table which
74688+ contains the relation between wandered and real block numbers) and other
74689+ information which might be needed at transaction recovery time.
74690+
74691+ The wander records are unidirectionally linked into a circle: each wander
74692+ record contains a block number of the next wander record, the last wander
74693+ record points to the first one.
74694+
74695+ One wander record (named "tx head" in this file) has a format which is
74696+ different from the other wander records. The "tx head" has a reference to the
74697+ "tx head" block of the previously committed atom. Also, "tx head" contains
74698+ fs information (the free blocks counter, and the oid allocator state) which
74699+ is logged in a special way.
74700+
74701+ There are two journal control blocks, named journal header and journal
74702+ footer which have fixed on-disk locations. The journal header has a
74703+ reference to the "tx head" block of the last committed atom. The journal
74704+ footer points to the "tx head" of the last flushed atom. The atom is
74705+ "played" when all blocks from its overwrite set are written to disk the
74706+ second time (i.e. written to their real locations).
74707+
74708+ NOTE: People who know reiserfs internals and its journal structure might be
74709+ confused by the terms journal footer and journal header. There is a table
74710+ of terms with similar semantics in reiserfs (reiser3) and reiser4:
74711+
74712+ REISER3 TERM | REISER4 TERM | DESCRIPTION
74713+ --------------------+-----------------------+----------------------------
74714+ commit record | journal header | atomic write of this record
74715+ | | ends transaction commit
74716+ --------------------+-----------------------+----------------------------
74717+ journal header | journal footer | atomic write of this record
74718+ | | ends post-commit writes.
74719+                      |                       | After this record is
74720+                      |                       | written successfully,
74721+                      |                       | journal blocks (reiser3)
74722+                      |                       | or wandered blocks/records
74723+                      |                       | (reiser4) are free for re-use.
74724+ --------------------+-----------------------+----------------------------
74725+
74726+ The atom commit process is the following:
74727+
74728+ 1. The overwrite set is taken from atom's clean list, and its size is
74729+ counted.
74730+
74731+ 2. The number of necessary wander records (including tx head) is calculated,
74732+ and the wander record blocks are allocated.
74733+
74734+ 3. Allocate wandered blocks and populate the wander records with the wandered map.
74735+
74736+ 4. Submit write requests for the wander records and wandered blocks.
74737+
74738+ 5. Wait until the submitted write requests complete.
74739+
74740+ 6. Update the journal header: change the pointer to the block number of the
74741+ just written tx head, submit an i/o for the modified journal header block and
74742+ wait for i/o completion.
74743+
74744+ NOTE: The special logging for bitmap blocks and some reiser4 super block
74745+ fields makes the processes of atom commit, flush and recovery a bit more
74746+ complex (see comments in the source code for details).
74747+
74748+ The atom playing process is the following:
74749+
74750+ 1. Write atom's overwrite set in-place.
74751+
74752+ 2. Wait on i/o.
74753+
74754+ 3. Update the journal footer: change the pointer to the block number of the tx
74755+ head block of the atom we are currently flushing, submit an i/o, wait on i/o
74756+ completion.
74757+
74758+ 4. Free disk space which was used for wandered blocks and wander records.
74759+
74760+ After the freeing of wandered blocks and wander records, the journal footer
74761+ points to an on-disk structure which might be overwritten soon. Neither the
74762+ log writer nor the journal recovery procedure uses that pointer for accessing
74763+ the data. When the journal recovery procedure looks for the oldest
74764+ transaction, it compares the journal footer pointer value with the "prev_tx"
74765+ pointer value in each tx head; if the values are equal, the oldest unflushed
74766+ transaction has been found.
74767+
74768+ NOTE on disk space leakage: the information about which blocks and how many
74769+ blocks were allocated for wandered blocks and wander records is not written
74770+ to the disk, because of the special logging for bitmaps and some super block
74771+ counters. After a system crash reiser4 does not remember those allocations,
74772+ so there is no disk space leakage of this kind.
74773+*/
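
/* A sketch of the on-disk records described above, reconstructed from the
   accessors used later in this file; the authoritative definitions live in
   wander.h, field order here is illustrative only, and the real field types
   are presumably reiser4's dformat wrappers rather than the raw
   little-endian types shown:

	struct journal_header { __le64 last_committed_tx; };

	struct journal_footer {
		__le64 last_flushed_tx;
		__le64 free_blocks;	(specially logged sb counters)
		__le64 nr_files;
		__le64 next_oid;
	};

	struct tx_header {		(first wander record, "tx head")
		char magic[TX_HEADER_MAGIC_SIZE];
		__le32 total;		(wander records in tx, incl. tx head)
		__le64 prev_tx;		(tx head of previous committed atom)
		__le64 next_block;	(next record in the circular list)
		__le64 free_blocks;
		__le64 nr_files;
		__le64 next_oid;
	};

	struct wander_record_header {
		char magic[WANDER_RECORD_MAGIC_SIZE];
		__le32 total;
		__le32 serial;
		__le64 next_block;
	};

	struct wander_entry {
		__le64 original;	(real block location)
		__le64 wandered;	(temporary, wandered location)
	};
*/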
74774+
74775+/* Special logging of reiser4 super block fields. */
74776+
74777+/* There are some reiser4 super block fields (the free block count and the OID
74778+   allocator state, i.e. the number of files and the next free OID) which are
74779+   logged separately from the super block to avoid unnecessary atom fusion.
74780+
74781+   So, the reiser4 super block need not be captured by a transaction which
74782+   allocates/deallocates disk blocks or creates/deletes file objects. Moreover,
74783+   the reiser4 on-disk super block is not touched when such a transaction is
74784+   committed and flushed. Those "specially logged" counters are stored in "tx
74785+   head" blocks and in the journal footer block.
74786+
74787+ A step-by-step description of special logging:
74788+
74789+ 0. The per-atom information about deleted or created files and allocated or
74790+ freed blocks is collected during the transaction. The atom's
74791+ ->nr_objects_created and ->nr_objects_deleted are for object
74792+ deletion/creation tracking, the numbers of allocated and freed blocks are
74793+ calculated using atom's delete set and atom's capture list -- all new and
74794+ relocated nodes should be on atom's clean list and should have JNODE_RELOC
74795+ bit set.
74796+
74797+ 1. The "specially logged" reiser4 super block fields have their "committed"
74798+ versions in the reiser4 in-memory super block. They get modified only at
74799+ atom commit time. The atom's commit thread has exclusive access to those
74800+ "committed" fields because the log writer implementation supports only one
74801+ atom commit at a time (there is a per-fs "commit" mutex). At
74802+ that time the "committed" counters are modified using per-atom information
74803+ collected during the transaction. These counters are stored on disk as
74804+ part of the tx head block when the atom is committed.
74805+
74806+ 2. When the atom is flushed the value of the free block counter and the OID
74807+ allocator state get written to the journal footer block. A special journal
74808+ procedure (journal_recover_sb_data()) takes those values from the journal
74809+ footer and updates the reiser4 in-memory super block.
74810+
74811+ NOTE: That means free block count and OID allocator state are logged
74812+ separately from the reiser4 super block regardless of the fact that the
74813+ reiser4 super block has fields to store both the free block counter and the
74814+ OID allocator.
74815+
74816+   Writing the whole super block at commit time would require knowing the true
74817+   values of all its fields without the changes made by not yet committed
74818+   transactions. That is possible by keeping a "committed" version of the super
74819+   block, just as the reiser4 bitmap blocks have "committed" and "working"
74820+   versions. However, another scheme was implemented which stores the specially
74821+   logged values in the unused free space inside the transaction head block. In
74822+   my opinion it has the advantage of not writing the whole super block when
74823+   only part of it was modified. */
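
/*
 * A toy user-space model of the counter flow described above, with
 * hypothetical numbers; the kernel code below uses ch->free_blocks,
 * ch->nr_files and ch->next_oid for the real thing:
 */
#include <stdio.h>

struct sb_counters {
	unsigned long long free_blocks, nr_files, next_oid;
};

int main(void)
{
	/* "committed" in-memory state as of the previous atom */
	struct sb_counters committed = { 1000, 42, 4242 };
	struct sb_counters tx_head, footer;
	/* per-atom deltas collected while the transaction ran */
	long long blocks_freed = 10, objects_created = 2;

	/* 1. atom commit: fold the deltas into the committed counters
	 * and store the result in the tx head block */
	committed.free_blocks += blocks_freed;
	committed.nr_files += objects_created;
	tx_head = committed;

	/* 2. atom flush: copy the same values into the journal footer */
	footer = tx_head;

	/* 3. after a crash, recovery reads them back from the footer
	 * (cf. reiser4_journal_recover_sb_data() below) */
	printf("free_blocks=%llu nr_files=%llu next_oid=%llu\n",
	       footer.free_blocks, footer.nr_files, footer.next_oid);
	return 0;
}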
74824+
74825+#include "debug.h"
74826+#include "dformat.h"
74827+#include "txnmgr.h"
74828+#include "jnode.h"
74829+#include "znode.h"
74830+#include "block_alloc.h"
74831+#include "page_cache.h"
74832+#include "wander.h"
74833+#include "reiser4.h"
74834+#include "super.h"
74835+#include "vfs_ops.h"
74836+#include "writeout.h"
74837+#include "inode.h"
74838+#include "entd.h"
74839+
74840+#include <linux/types.h>
74841+#include <linux/fs.h> /* for struct super_block */
74842+#include <linux/mm.h> /* for struct page */
74843+#include <linux/pagemap.h>
74844+#include <linux/bio.h> /* for struct bio */
74845+#include <linux/blkdev.h>
74846+
74847+static int write_jnodes_to_disk_extent(
74848+ jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
74849+
74850+/* The commit_handle is a container for objects needed at atom commit time */
74851+struct commit_handle {
74852+ /* A pointer to atom's list of OVRWR nodes */
74853+ struct list_head *overwrite_set;
74854+ /* atom's overwrite set size */
74855+ int overwrite_set_size;
74856+ /* jnodes for wander record blocks */
74857+ struct list_head tx_list;
74858+ /* number of wander records */
74859+ __u32 tx_size;
74860+ /* 'committed' sb counters are saved here until atom is completely
74861+ flushed */
74862+ __u64 free_blocks;
74863+ __u64 nr_files;
74864+ __u64 next_oid;
74865+ /* A pointer to the atom which is being committed */
74866+ txn_atom *atom;
74867+ /* A pointer to current super block */
74868+ struct super_block *super;
74869+ /* The counter of modified bitmaps */
74870+ reiser4_block_nr nr_bitmap;
74871+};
74872+
74873+static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
74874+{
74875+ memset(ch, 0, sizeof(struct commit_handle));
74876+ INIT_LIST_HEAD(&ch->tx_list);
74877+
74878+ ch->atom = atom;
74879+ ch->super = reiser4_get_current_sb();
74880+}
74881+
74882+static void done_commit_handle(struct commit_handle *ch)
74883+{
74884+ assert("zam-690", list_empty(&ch->tx_list));
74885+}
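
/* The typical life cycle of a commit_handle, as driven by
 * reiser4_write_logs() later in this file (a summary of calls that already
 * exist below, not additional API):
 *
 *	init_commit_handle(&ch, atom);
 *	get_overwrite_set(&ch);		count and pin the overwrite set
 *	get_tx_size(&ch);		number of wander records needed
 *	commit_tx(&ch);			wandered blocks, records, journal header
 *	write_tx_back(&ch);		in-place writes, journal footer
 *	dealloc_tx_list(&ch);
 *	dealloc_wmap(&ch);
 *	put_overwrite_set(&ch);
 *	done_commit_handle(&ch);
 */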
74886+
74887+static inline int reiser4_use_write_barrier(struct super_block * s)
74888+{
74889+ return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
74890+}
74891+
74892+static void disable_write_barrier(struct super_block * s)
74893+{
74894+ notice("zam-1055", "%s does not support write barriers,"
74895+ " using synchronous write instead.", s->s_id);
74896+ set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
74897+}
74898+
74899+/* fill journal header block data */
74900+static void format_journal_header(struct commit_handle *ch)
74901+{
74902+ struct reiser4_super_info_data *sbinfo;
74903+ struct journal_header *header;
74904+ jnode *txhead;
74905+
74906+ sbinfo = get_super_private(ch->super);
74907+ assert("zam-479", sbinfo != NULL);
74908+ assert("zam-480", sbinfo->journal_header != NULL);
74909+
74910+ txhead = list_entry(ch->tx_list.next, jnode, capture_link);
74911+
74912+ jload(sbinfo->journal_header);
74913+
74914+ header = (struct journal_header *)jdata(sbinfo->journal_header);
74915+ assert("zam-484", header != NULL);
74916+
74917+ put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
74918+ &header->last_committed_tx);
74919+
74920+ jrelse(sbinfo->journal_header);
74921+}
74922+
74923+/* fill journal footer block data */
74924+static void format_journal_footer(struct commit_handle *ch)
74925+{
74926+ struct reiser4_super_info_data *sbinfo;
74927+ struct journal_footer *footer;
74928+ jnode *tx_head;
74929+
74930+ sbinfo = get_super_private(ch->super);
74931+
74932+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
74933+
74934+ assert("zam-493", sbinfo != NULL);
74935+ assert("zam-494", sbinfo->journal_header != NULL);
74936+
74937+ check_me("zam-691", jload(sbinfo->journal_footer) == 0);
74938+
74939+ footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
74940+ assert("zam-495", footer != NULL);
74941+
74942+ put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
74943+ &footer->last_flushed_tx);
74944+ put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
74945+
74946+ put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
74947+ put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
74948+
74949+ jrelse(sbinfo->journal_footer);
74950+}
74951+
74952+/* wander record capacity depends on current block size */
74953+static int wander_record_capacity(const struct super_block *super)
74954+{
74955+ return (super->s_blocksize -
74956+ sizeof(struct wander_record_header)) /
74957+ sizeof(struct wander_entry);
74958+}
74959+
74960+/* Fill the first wander record (tx head) in accordance with the supplied data */
74961+static void format_tx_head(struct commit_handle *ch)
74962+{
74963+ jnode *tx_head;
74964+ jnode *next;
74965+ struct tx_header *header;
74966+
74967+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
74968+ assert("zam-692", &ch->tx_list != &tx_head->capture_link);
74969+
74970+ next = list_entry(tx_head->capture_link.next, jnode, capture_link);
74971+ if (&ch->tx_list == &next->capture_link)
74972+ next = tx_head;
74973+
74974+ header = (struct tx_header *)jdata(tx_head);
74975+
74976+ assert("zam-460", header != NULL);
74977+ assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
74978+
74979+ memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
74980+ memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
74981+
74982+ put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
74983+ put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
74984+ &header->prev_tx);
74985+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
74986+ put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
74987+ put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
74988+ put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
74989+}
74990+
74991+/* prepare ordinary wander record block (fill all service fields) */
74992+static void
74993+format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
74994+{
74995+ struct wander_record_header *LRH;
74996+ jnode *next;
74997+
74998+ assert("zam-464", node != NULL);
74999+
75000+ LRH = (struct wander_record_header *)jdata(node);
75001+ next = list_entry(node->capture_link.next, jnode, capture_link);
75002+
75003+ if (&ch->tx_list == &next->capture_link)
75004+ next = list_entry(ch->tx_list.next, jnode, capture_link);
75005+
75006+ assert("zam-465", LRH != NULL);
75007+ assert("zam-463",
75008+ ch->super->s_blocksize > sizeof(struct wander_record_header));
75009+
75010+ memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
75011+ memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
75012+
75013+ put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
75014+ put_unaligned(cpu_to_le32(serial), &LRH->serial);
75015+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
75016+}
75017+
75018+/* add one wandered map entry to formatted wander record */
75019+static void
75020+store_entry(jnode * node, int index, const reiser4_block_nr * a,
75021+ const reiser4_block_nr * b)
75022+{
75023+ char *data;
75024+ struct wander_entry *pairs;
75025+
75026+ data = jdata(node);
75027+ assert("zam-451", data != NULL);
75028+
75029+ pairs =
75030+ (struct wander_entry *)(data + sizeof(struct wander_record_header));
75031+
75032+ put_unaligned(cpu_to_le64(*a), &pairs[index].original);
75033+ put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
75034+}
75035+
75036+/* currently, wander records contain only the wandered map, which depends on
75037+   the overwrite set size */
75038+static void get_tx_size(struct commit_handle *ch)
75039+{
75040+ assert("zam-440", ch->overwrite_set_size != 0);
75041+ assert("zam-695", ch->tx_size == 0);
75042+
75043+ /* count all ordinary wander records
75044+ (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
75045+ for tx head block */
75046+ ch->tx_size =
75047+ (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
75048+ 2;
75049+}
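
/* A numeric illustration (the 32-byte header size is an assumption for the
   sake of the example; the real sizes follow from wander.h): with 4096-byte
   blocks, a 32-byte wander_record_header and 16-byte wander_entry give

	wander_record_capacity = (4096 - 32) / 16 = 254 entries per record,

   so an overwrite set of 500 blocks needs

	tx_size = (500 - 1) / 254 + 2 = 3

   wander record blocks: one tx head plus two ordinary records, holding
   entries 0..253 and 254..499 of the wandered map respectively (the tx head
   itself carries no map entries, see store_wmap_actor() below). */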
75050+
75051+/* A special structure for use in store_wmap_actor() for saving its state
75052+   between calls */
75053+struct store_wmap_params {
75054+ jnode *cur; /* jnode of current wander record to fill */
75055+ int idx; /* free element index in wander record */
75056+ int capacity; /* capacity */
75057+
75058+#if REISER4_DEBUG
75059+ struct list_head *tx_list;
75060+#endif
75061+};
75062+
75063+/* an actor for use in the blocknr_set_iterator routine which populates the
75064+   list of pre-formatted wander records with wandered map info */
75065+static int
75066+store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
75067+ const reiser4_block_nr * b, void *data)
75068+{
75069+ struct store_wmap_params *params = data;
75070+
75071+ if (params->idx >= params->capacity) {
75072+ /* a new wander record should be taken from the tx_list */
75073+ params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
75074+ assert("zam-454",
75075+ params->tx_list != &params->cur->capture_link);
75076+
75077+ params->idx = 0;
75078+ }
75079+
75080+ store_entry(params->cur, params->idx, a, b);
75081+ params->idx++;
75082+
75083+ return 0;
75084+}
75085+
75086+/* This function is called after the Relocate set has been written to disk, the
75087+   Overwrite set has been written to the wandered locations and all wander
75088+   records have been written as well. The updated journal header block contains
75089+   a pointer (block number) to the first wander record of the just written transaction */
75090+static int update_journal_header(struct commit_handle *ch, int use_barrier)
75091+{
75092+ struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
75093+ jnode *jh = sbinfo->journal_header;
75094+ jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
75095+ int ret;
75096+
75097+ format_journal_header(ch);
75098+
75099+ ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
75100+ use_barrier ? WRITEOUT_BARRIER : 0);
75101+ if (ret)
75102+ return ret;
75103+
75104+ // blk_run_address_space(sbinfo->fake->i_mapping);
75105+ /*blk_run_queues(); */
75106+
75107+ ret = jwait_io(jh, WRITE);
75108+
75109+ if (ret)
75110+ return ret;
75111+
75112+ sbinfo->last_committed_tx = *jnode_get_block(head);
75113+
75114+ return 0;
75115+}
75116+
75117+/* This function is called after write-back is finished. We update journal
75118+ footer block and free blocks which were occupied by wandered blocks and
75119+ transaction wander records */
75120+static int update_journal_footer(struct commit_handle *ch, int use_barrier)
75121+{
75122+ reiser4_super_info_data *sbinfo = get_super_private(ch->super);
75123+
75124+ jnode *jf = sbinfo->journal_footer;
75125+
75126+ int ret;
75127+
75128+ format_journal_footer(ch);
75129+
75130+ ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
75131+ use_barrier ? WRITEOUT_BARRIER : 0);
75132+ if (ret)
75133+ return ret;
75134+
75135+ // blk_run_address_space(sbinfo->fake->i_mapping);
75136+ /*blk_run_queue(); */
75137+
75138+ ret = jwait_io(jf, WRITE);
75139+ if (ret)
75140+ return ret;
75141+
75142+ return 0;
75143+}
75144+
75145+/* free block numbers of wander records of an already written-in-place transaction */
75146+static void dealloc_tx_list(struct commit_handle *ch)
75147+{
75148+ while (!list_empty(&ch->tx_list)) {
75149+ jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
75150+ list_del(&cur->capture_link);
75151+ ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
75152+ reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
75153+ BA_FORMATTED);
75154+
75155+ unpin_jnode_data(cur);
75156+ reiser4_drop_io_head(cur);
75157+ }
75158+}
75159+
75160+/* An actor for use in the blocknr_set_iterator() routine which frees wandered
75161+   blocks from the atom's overwrite set. */
75162+static int
75163+dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
75164+ const reiser4_block_nr * a UNUSED_ARG,
75165+ const reiser4_block_nr * b, void *data UNUSED_ARG)
75166+{
75167+
75168+ assert("zam-499", b != NULL);
75169+ assert("zam-500", *b != 0);
75170+ assert("zam-501", !reiser4_blocknr_is_fake(b));
75171+
75172+ reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
75173+ return 0;
75174+}
75175+
75176+/* free wandered block locations of already written in place transaction */
75177+static void dealloc_wmap(struct commit_handle *ch)
75178+{
75179+ assert("zam-696", ch->atom != NULL);
75180+
75181+ blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
75182+ dealloc_wmap_actor, NULL, 1);
75183+}
75184+
75185+/* helper function for alloc_wandered_blocks(), which refills the set of block
75186+   numbers needed for wandered blocks */
75187+static int
75188+get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
75189+{
75190+ reiser4_blocknr_hint hint;
75191+ int ret;
75192+
75193+ reiser4_block_nr wide_len = count;
75194+
75195+ /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
75196+ ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
75197+ reserved allocation area so as to get the best qualities of fixed
75198+ journals? */
75199+ reiser4_blocknr_hint_init(&hint);
75200+ hint.block_stage = BLOCK_GRABBED;
75201+
75202+ ret = reiser4_alloc_blocks(&hint, start, &wide_len,
75203+ BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
75204+ *len = (int)wide_len;
75205+
75206+ return ret;
75207+}
75208+
75209+/*
75210+ * roll back changes made before issuing BIO in the case of IO error.
75211+ */
75212+static void undo_bio(struct bio *bio)
75213+{
75214+ int i;
75215+
75216+ for (i = 0; i < bio->bi_vcnt; ++i) {
75217+ struct page *pg;
75218+ jnode *node;
75219+
75220+ pg = bio->bi_io_vec[i].bv_page;
75221+ end_page_writeback(pg);
75222+ node = jprivate(pg);
75223+ spin_lock_jnode(node);
75224+ JF_CLR(node, JNODE_WRITEBACK);
75225+ JF_SET(node, JNODE_DIRTY);
75226+ spin_unlock_jnode(node);
75227+ }
75228+ bio_put(bio);
75229+}
75230+
75231+/* put overwrite set back to atom's clean list */
75232+static void put_overwrite_set(struct commit_handle *ch)
75233+{
75234+ jnode *cur;
75235+
75236+ list_for_each_entry(cur, ch->overwrite_set, capture_link)
75237+ jrelse_tail(cur);
75238+}
75239+
75240+/* Count the overwrite set size and grab disk space for wandered block
75241+   allocation. Since we have a separate list for the atom's overwrite set we
75242+   just scan the list, counting bitmap and other non-leaf nodes whose wandered
75243+   block allocation we have to grab space for. */
75244+static int get_overwrite_set(struct commit_handle *ch)
75245+{
75246+ int ret;
75247+ jnode *cur;
75248+ __u64 nr_not_leaves = 0;
75249+#if REISER4_DEBUG
75250+ __u64 nr_formatted_leaves = 0;
75251+ __u64 nr_unformatted_leaves = 0;
75252+#endif
75253+
75254+ assert("zam-697", ch->overwrite_set_size == 0);
75255+
75256+ ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
75257+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
75258+
75259+ while (ch->overwrite_set != &cur->capture_link) {
75260+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
75261+
75262+		/* Count bitmap blocks to get correct statistics on how many
75263+		 * blocks were cleared by the transaction commit. */
75264+ if (jnode_get_type(cur) == JNODE_BITMAP)
75265+ ch->nr_bitmap++;
75266+
75267+ assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
75268+ || jnode_get_type(cur) == JNODE_BITMAP);
75269+
75270+ if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
75271+ /* we replace fake znode by another (real)
75272+ znode which is suggested by disk_layout
75273+ plugin */
75274+
75275+ /* FIXME: it looks like fake znode should be
75276+ replaced by jnode supplied by
75277+ disk_layout. */
75278+
75279+ struct super_block *s = reiser4_get_current_sb();
75280+ reiser4_super_info_data *sbinfo =
75281+ get_current_super_private();
75282+
75283+ if (sbinfo->df_plug->log_super) {
75284+ jnode *sj = sbinfo->df_plug->log_super(s);
75285+
75286+ assert("zam-593", sj != NULL);
75287+
75288+ if (IS_ERR(sj))
75289+ return PTR_ERR(sj);
75290+
75291+ spin_lock_jnode(sj);
75292+ JF_SET(sj, JNODE_OVRWR);
75293+ insert_into_atom_ovrwr_list(ch->atom, sj);
75294+ spin_unlock_jnode(sj);
75295+
75296+ /* jload it as the rest of overwrite set */
75297+ jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
75298+
75299+ ch->overwrite_set_size++;
75300+ }
75301+ spin_lock_jnode(cur);
75302+ reiser4_uncapture_block(cur);
75303+ jput(cur);
75304+
75305+ } else {
75306+ int ret;
75307+ ch->overwrite_set_size++;
75308+ ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
75309+ if (ret)
75310+ reiser4_panic("zam-783",
75311+ "cannot load e-flushed jnode back (ret = %d)\n",
75312+ ret);
75313+ }
75314+
75315+		/* Count non-leaves here because we have to grab disk space
75316+ * for wandered blocks. They were not counted as "flush
75317+ * reserved". Counting should be done _after_ nodes are pinned
75318+ * into memory by jload(). */
75319+ if (!jnode_is_leaf(cur))
75320+ nr_not_leaves++;
75321+ else {
75322+#if REISER4_DEBUG
75323+ /* at this point @cur either has JNODE_FLUSH_RESERVED
75324+ * or is eflushed. Locking is not strong enough to
75325+ * write an assertion checking for this. */
75326+ if (jnode_is_znode(cur))
75327+ nr_formatted_leaves++;
75328+ else
75329+ nr_unformatted_leaves++;
75330+#endif
75331+ JF_CLR(cur, JNODE_FLUSH_RESERVED);
75332+ }
75333+
75334+ cur = next;
75335+ }
75336+
75337+	/* Grab space for writing (wandered blocks) of non-leaves found in
75338+ * overwrite set. */
75339+ ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
75340+ if (ret)
75341+ return ret;
75342+
75343+ /* Disk space for allocation of wandered blocks of leaf nodes already
75344+ * reserved as "flush reserved", move it to grabbed space counter. */
75345+ spin_lock_atom(ch->atom);
75346+ assert("zam-940",
75347+ nr_formatted_leaves + nr_unformatted_leaves <=
75348+ ch->atom->flush_reserved);
75349+ flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
75350+ spin_unlock_atom(ch->atom);
75351+
75352+ return ch->overwrite_set_size;
75353+}
75354+
75355+/**
75356+ * write_jnodes_to_disk_extent - submit write request
75357+ * @first: first jnode of the list
75358+ * @nr: number of jnodes on the list
75359+ * @block_p: starting block number of the target disk region
75360+ * @fq: flush queue to attach the i/o to, or NULL
75361+ * @flags: used to decide whether page is to get PG_reclaim flag
75362+ *
75363+ * Submits a write request for @nr jnodes beginning with @first; the other
75364+ * jnodes follow @first on the doubly linked "capture" list. All jnodes
75365+ * will be written to the disk region of @nr blocks starting at block number
75366+ * @block_p. If @fq is not NULL, waiting for i/o completion will be done
75367+ * more efficiently by using flush_queue_t objects.
75368+ * This function writes a list of jnodes in batch mode. It does all the
75369+ * low-level work such as bio construction and page state manipulation.
75371+ *
75372+ * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
75373+ * aggregated in this function instead of being left to the layers below
75374+ *
75375+ * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
75376+ * Why that layer needed? Why BIOs cannot be constructed here?
75377+ */
75378+static int write_jnodes_to_disk_extent(
75379+ jnode *first, int nr, const reiser4_block_nr *block_p,
75380+ flush_queue_t *fq, int flags)
75381+{
75382+ struct super_block *super = reiser4_get_current_sb();
75383+ int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
75384+ int max_blocks;
75385+ jnode *cur = first;
75386+ reiser4_block_nr block;
75387+
75388+ assert("zam-571", first != NULL);
75389+ assert("zam-572", block_p != NULL);
75390+ assert("zam-570", nr > 0);
75391+
75392+ block = *block_p;
75393+ max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
75394+
75395+ while (nr > 0) {
75396+ struct bio *bio;
75397+ int nr_blocks = min(nr, max_blocks);
75398+ int i;
75399+ int nr_used;
75400+
75401+ bio = bio_alloc(GFP_NOIO, nr_blocks);
75402+ if (!bio)
75403+ return RETERR(-ENOMEM);
75404+
75405+ bio->bi_bdev = super->s_bdev;
75406+ bio->bi_sector = block * (super->s_blocksize >> 9);
75407+ for (nr_used = 0, i = 0; i < nr_blocks; i++) {
75408+ struct page *pg;
75409+
75410+ pg = jnode_page(cur);
75411+ assert("zam-573", pg != NULL);
75412+
75413+ page_cache_get(pg);
75414+
75415+ lock_and_wait_page_writeback(pg);
75416+
75417+ if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
75418+ /*
75419+				 * underlying device is saturated. Stop adding
75420+ * pages to the bio.
75421+ */
75422+ unlock_page(pg);
75423+ page_cache_release(pg);
75424+ break;
75425+ }
75426+
75427+ spin_lock_jnode(cur);
75428+ assert("nikita-3166",
75429+ pg->mapping == jnode_get_mapping(cur));
75430+ assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
75431+#if REISER4_DEBUG
75432+ spin_lock(&cur->load);
75433+ assert("nikita-3165", !jnode_is_releasable(cur));
75434+ spin_unlock(&cur->load);
75435+#endif
75436+ JF_SET(cur, JNODE_WRITEBACK);
75437+ JF_CLR(cur, JNODE_DIRTY);
75438+ ON_DEBUG(cur->written++);
75439+ spin_unlock_jnode(cur);
75440+
75441+ ClearPageError(pg);
75442+ set_page_writeback(pg);
75443+
75444+ if (get_current_context()->entd) {
75445+ /* this is ent thread */
75446+ entd_context *ent = get_entd_context(super);
75447+ struct wbq *rq, *next;
75448+
75449+ spin_lock(&ent->guard);
75450+
75451+ if (pg == ent->cur_request->page) {
75452+ /*
75453+ * entd is called for this page. This
75454+				 * request is not in the todo list
75455+ */
75456+ ent->cur_request->written = 1;
75457+ } else {
75458+ /*
75459+ * if we have written a page for which writepage
75460+				 * was called, move the request to another list.
75461+ */
75462+ list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
75463+ assert("", rq->magic == WBQ_MAGIC);
75464+ if (pg == rq->page) {
75465+ /*
75466+ * remove request from
75467+ * entd's queue, but do
75468+ * not wake up a thread
75469+ * which put this
75470+ * request
75471+ */
75472+ list_del_init(&rq->link);
75473+ ent->nr_todo_reqs --;
75474+ list_add_tail(&rq->link, &ent->done_list);
75475+ ent->nr_done_reqs ++;
75476+ rq->written = 1;
75477+ break;
75478+ }
75479+ }
75480+ }
75481+ spin_unlock(&ent->guard);
75482+ }
75483+
75484+ clear_page_dirty_for_io(pg);
75485+
75486+ unlock_page(pg);
75487+
75488+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
75489+ nr_used++;
75490+ }
75491+ if (nr_used > 0) {
75492+ assert("nikita-3453",
75493+ bio->bi_size == super->s_blocksize * nr_used);
75494+ assert("nikita-3454", bio->bi_vcnt == nr_used);
75495+
75496+ /* Check if we are allowed to write at all */
75497+ if (super->s_flags & MS_RDONLY)
75498+ undo_bio(bio);
75499+ else {
75500+ int not_supported;
75501+
75502+ add_fq_to_bio(fq, bio);
75503+ bio_get(bio);
75504+ reiser4_submit_bio(write_op, bio);
75505+ not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
75506+ bio_put(bio);
75507+ if (not_supported)
75508+ return -EOPNOTSUPP;
75509+ }
75510+
75511+ block += nr_used - 1;
75512+ update_blocknr_hint_default(super, &block);
75513+ block += 1;
75514+ } else {
75515+ bio_put(bio);
75516+ }
75517+ nr -= nr_used;
75518+ }
75519+
75520+ return 0;
75521+}
75522+
75523+/* This procedure recovers contiguous sequences of disk block numbers in the
75524+   given list of jnodes and submits write requests on a per-sequence
75525+   basis */
75526+int
75527+write_jnode_list(struct list_head *head, flush_queue_t *fq,
75528+ long *nr_submitted, int flags)
75529+{
75530+ int ret;
75531+ jnode *beg = list_entry(head->next, jnode, capture_link);
75532+
75533+ while (head != &beg->capture_link) {
75534+ int nr = 1;
75535+ jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
75536+
75537+ while (head != &cur->capture_link) {
75538+ if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
75539+ break;
75540+ ++nr;
75541+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
75542+ }
75543+
75544+ ret = write_jnodes_to_disk_extent(
75545+ beg, nr, jnode_get_block(beg), fq, flags);
75546+ if (ret)
75547+ return ret;
75548+
75549+ if (nr_submitted)
75550+ *nr_submitted += nr;
75551+
75552+ beg = cur;
75553+ }
75554+
75555+ return 0;
75556+}
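
/*
 * A minimal user-space sketch of the run-detection logic above, using a
 * plain array of block numbers in place of the jnode capture list (all
 * names here are illustrative):
 */
#include <stdio.h>

typedef unsigned long long blocknr_t;

static void submit_extents(const blocknr_t *blocks, int count)
{
	int beg = 0;

	while (beg < count) {
		int nr = 1;

		/* extend the run while block numbers stay contiguous */
		while (beg + nr < count &&
		       blocks[beg + nr] == blocks[beg] + nr)
			nr++;

		/* write_jnode_list() submits one extent per such run */
		printf("extent: start=%llu len=%d\n", blocks[beg], nr);
		beg += nr;
	}
}

int main(void)
{
	blocknr_t blocks[] = { 100, 101, 102, 200, 201, 500 };

	/* prints (100,3), (200,2), (500,1) */
	submit_extents(blocks, 6);
	return 0;
}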
75557+
75558+/* add given wandered mapping to atom's wandered map */
75559+static int
75560+add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
75561+{
75562+ int ret;
75563+ blocknr_set_entry *new_bsep = NULL;
75564+ reiser4_block_nr block;
75565+
75566+ txn_atom *atom;
75567+
75568+ assert("zam-568", block_p != NULL);
75569+ block = *block_p;
75570+ assert("zam-569", len > 0);
75571+
75572+ while ((len--) > 0) {
75573+ do {
75574+ atom = get_current_atom_locked();
75575+ assert("zam-536",
75576+ !reiser4_blocknr_is_fake(jnode_get_block(cur)));
75577+ ret =
75578+ blocknr_set_add_pair(atom, &atom->wandered_map,
75579+ &new_bsep,
75580+ jnode_get_block(cur), &block);
75581+ } while (ret == -E_REPEAT);
75582+
75583+ if (ret) {
75584+ /* deallocate blocks which were not added to wandered
75585+ map */
75586+ reiser4_block_nr wide_len = len;
75587+
75588+ reiser4_dealloc_blocks(&block, &wide_len,
75589+ BLOCK_NOT_COUNTED,
75590+ BA_FORMATTED
75591+ /* formatted, without defer */ );
75592+
75593+ return ret;
75594+ }
75595+
75596+ spin_unlock_atom(atom);
75597+
75598+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
75599+ ++block;
75600+ }
75601+
75602+ return 0;
75603+}
75604+
75605+/* Allocate wandered blocks for the current atom's OVERWRITE SET and immediately
75606+   submit IO for the allocated blocks. We assume the current atom is in a stage
75607+   where atom fusion is impossible, so it is safe to leave the atom unlocked. */
75608+static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
75609+{
75610+ reiser4_block_nr block;
75611+
75612+ int rest;
75613+ int len;
75614+ int ret;
75615+
75616+ jnode *cur;
75617+
75618+ assert("zam-534", ch->overwrite_set_size > 0);
75619+
75620+ rest = ch->overwrite_set_size;
75621+
75622+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
75623+ while (ch->overwrite_set != &cur->capture_link) {
75624+ assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
75625+
75626+ ret = get_more_wandered_blocks(rest, &block, &len);
75627+ if (ret)
75628+ return ret;
75629+
75630+ rest -= len;
75631+
75632+ ret = add_region_to_wmap(cur, len, &block);
75633+ if (ret)
75634+ return ret;
75635+
75636+ ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
75637+ if (ret)
75638+ return ret;
75639+
75640+ while ((len--) > 0) {
75641+ assert("zam-604",
75642+ ch->overwrite_set != &cur->capture_link);
75643+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
75644+ }
75645+ }
75646+
75647+ return 0;
75648+}
75649+
75650+/* allocate the given number of nodes over the journal area, link them into a
75651+   list and format them as an on-disk circular list of wander records */
75652+static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
75653+{
75654+ reiser4_blocknr_hint hint;
75655+ reiser4_block_nr allocated = 0;
75656+ reiser4_block_nr first, len;
75657+ jnode *cur;
75658+ jnode *txhead;
75659+ int ret;
75660+ reiser4_context *ctx;
75661+ reiser4_super_info_data *sbinfo;
75662+
75663+ assert("zam-698", ch->tx_size > 0);
75664+ assert("zam-699", list_empty_careful(&ch->tx_list));
75665+
75666+ ctx = get_current_context();
75667+ sbinfo = get_super_private(ctx->super);
75668+
75669+ while (allocated < (unsigned)ch->tx_size) {
75670+ len = (ch->tx_size - allocated);
75671+
75672+ reiser4_blocknr_hint_init(&hint);
75673+
75674+ hint.block_stage = BLOCK_GRABBED;
75675+
75676+ /* FIXME: there should be some block allocation policy for
75677+ nodes which contain wander records */
75678+
75679+ /* We assume that disk space for wandered record blocks can be
75680+ * taken from reserved area. */
75681+ ret = reiser4_alloc_blocks(&hint, &first, &len,
75682+ BA_FORMATTED | BA_RESERVED |
75683+ BA_USE_DEFAULT_SEARCH_START);
75684+ reiser4_blocknr_hint_done(&hint);
75685+
75686+ if (ret)
75687+ return ret;
75688+
75689+ allocated += len;
75690+
75691+ /* create jnodes for all wander records */
75692+ while (len--) {
75693+ cur = reiser4_alloc_io_head(&first);
75694+
75695+ if (cur == NULL) {
75696+ ret = RETERR(-ENOMEM);
75697+ goto free_not_assigned;
75698+ }
75699+
75700+ ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
75701+
75702+ if (ret != 0) {
75703+ jfree(cur);
75704+ goto free_not_assigned;
75705+ }
75706+
75707+ pin_jnode_data(cur);
75708+
75709+ list_add_tail(&cur->capture_link, &ch->tx_list);
75710+
75711+ first++;
75712+ }
75713+ }
75714+
75715+	{ /* format an on-disk linked list of wander records */
75716+ int serial = 1;
75717+
75718+ txhead = list_entry(ch->tx_list.next, jnode, capture_link);
75719+ format_tx_head(ch);
75720+
75721+ cur = list_entry(txhead->capture_link.next, jnode, capture_link);
75722+ while (&ch->tx_list != &cur->capture_link) {
75723+ format_wander_record(ch, cur, serial++);
75724+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
75725+ }
75726+ }
75727+
75728+ { /* Fill wander records with Wandered Set */
75729+ struct store_wmap_params params;
75730+ txn_atom *atom;
75731+
75732+ params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
75733+
75734+ params.idx = 0;
75735+ params.capacity =
75736+ wander_record_capacity(reiser4_get_current_sb());
75737+
75738+ atom = get_current_atom_locked();
75739+ blocknr_set_iterator(atom, &atom->wandered_map,
75740+ &store_wmap_actor, &params, 0);
75741+ spin_unlock_atom(atom);
75742+ }
75743+
75744+ { /* relse all jnodes from tx_list */
75745+ cur = list_entry(ch->tx_list.next, jnode, capture_link);
75746+ while (&ch->tx_list != &cur->capture_link) {
75747+ jrelse(cur);
75748+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
75749+ }
75750+ }
75751+
75752+ ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
75753+
75754+ return ret;
75755+
75756+ free_not_assigned:
75757+ /* We deallocate blocks not yet assigned to jnodes on tx_list. The
75758+	   caller takes care of invalidating the tx list */
75759+ reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
75760+
75761+ return ret;
75762+}
75763+
75764+static int commit_tx(struct commit_handle *ch)
75765+{
75766+ flush_queue_t *fq;
75767+ int barrier;
75768+ int ret;
75769+
75770+ /* Grab more space for wandered records. */
75771+ ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
75772+ if (ret)
75773+ return ret;
75774+
75775+ fq = get_fq_for_current_atom();
75776+ if (IS_ERR(fq))
75777+ return PTR_ERR(fq);
75778+
75779+ spin_unlock_atom(fq->atom);
75780+ do {
75781+ ret = alloc_wandered_blocks(ch, fq);
75782+ if (ret)
75783+ break;
75784+ ret = alloc_tx(ch, fq);
75785+ if (ret)
75786+ break;
75787+ } while (0);
75788+
75789+ reiser4_fq_put(fq);
75790+ if (ret)
75791+ return ret;
75792+ repeat_wo_barrier:
75793+ barrier = reiser4_use_write_barrier(ch->super);
75794+ if (!barrier) {
75795+ ret = current_atom_finish_all_fq();
75796+ if (ret)
75797+ return ret;
75798+ }
75799+ ret = update_journal_header(ch, barrier);
75800+ if (barrier) {
75801+ if (ret) {
75802+ if (ret == -EOPNOTSUPP) {
75803+ disable_write_barrier(ch->super);
75804+ goto repeat_wo_barrier;
75805+ }
75806+ return ret;
75807+ }
75808+ ret = current_atom_finish_all_fq();
75809+ }
75810+ return ret;
75811+}
75812+
75813+static int write_tx_back(struct commit_handle * ch)
75814+{
75815+ flush_queue_t *fq;
75816+ int ret;
75817+ int barrier;
75818+
75819+ reiser4_post_commit_hook();
75820+ fq = get_fq_for_current_atom();
75821+ if (IS_ERR(fq))
75822+ return PTR_ERR(fq);
75823+ spin_unlock_atom(fq->atom);
75824+ ret = write_jnode_list(
75825+ ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
75826+ reiser4_fq_put(fq);
75827+ if (ret)
75828+ return ret;
75829+ repeat_wo_barrier:
75830+ barrier = reiser4_use_write_barrier(ch->super);
75831+ if (!barrier) {
75832+ ret = current_atom_finish_all_fq();
75833+ if (ret)
75834+ return ret;
75835+ }
75836+ ret = update_journal_footer(ch, barrier);
75837+ if (barrier) {
75838+ if (ret) {
75839+ if (ret == -EOPNOTSUPP) {
75840+ disable_write_barrier(ch->super);
75841+ goto repeat_wo_barrier;
75842+ }
75843+ return ret;
75844+ }
75845+ ret = current_atom_finish_all_fq();
75846+ }
75847+ if (ret)
75848+ return ret;
75849+ reiser4_post_write_back_hook();
75850+ return 0;
75851+}
75852+
75853+/* We assume that at this moment all captured blocks are marked as RELOC or
75854+   WANDER (belong to the Relocate or Overwrite set), and all nodes from the
75855+   Relocate set have been submitted for write.
75856+*/
75857+
75858+int reiser4_write_logs(long *nr_submitted)
75859+{
75860+ txn_atom *atom;
75861+ struct super_block *super = reiser4_get_current_sb();
75862+ reiser4_super_info_data *sbinfo = get_super_private(super);
75863+ struct commit_handle ch;
75864+ int ret;
75865+
75866+ writeout_mode_enable();
75867+
75868+ /* block allocator may add j-nodes to the clean_list */
75869+ ret = reiser4_pre_commit_hook();
75870+ if (ret)
75871+ return ret;
75872+
75873+ /* No locks are required if we take atom which stage >=
75874+ * ASTAGE_PRE_COMMIT */
75875+ atom = get_current_context()->trans->atom;
75876+ assert("zam-965", atom != NULL);
75877+
75878+ /* relocate set is on the atom->clean_nodes list after
75879+ * current_atom_complete_writes() finishes. It can be safely
75880+ * uncaptured after commit_mutex is locked, because any atom that
75881+ * captures these nodes is guaranteed to commit after current one.
75882+ *
75883+ * This can only be done after reiser4_pre_commit_hook(), because it is where
75884+ * early flushed jnodes with CREATED bit are transferred to the
75885+ * overwrite list. */
75886+ reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
75887+ spin_lock_atom(atom);
75888+ /* There might be waiters for the relocate nodes which we have
75889+ * released, wake them up. */
75890+ reiser4_atom_send_event(atom);
75891+ spin_unlock_atom(atom);
75892+
75893+ if (REISER4_DEBUG) {
75894+ int level;
75895+
75896+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
75897+ assert("nikita-3352",
75898+ list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
75899+ }
75900+
75901+ sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
75902+ sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
75903+
75904+ init_commit_handle(&ch, atom);
75905+
75906+ ch.free_blocks = sbinfo->blocks_free_committed;
75907+ ch.nr_files = sbinfo->nr_files_committed;
75908+ /* ZAM-FIXME-HANS: email me what the contention level is for the super
75909+ * lock. */
75910+ ch.next_oid = oid_next(super);
75911+
75912+ /* count overwrite set and place it in a separate list */
75913+ ret = get_overwrite_set(&ch);
75914+
75915+ if (ret <= 0) {
75916+ /* It is possible that overwrite set is empty here, it means
75917+ all captured nodes are clean */
75918+ goto up_and_ret;
75919+ }
75920+
75921+ /* Inform the caller about what number of dirty pages will be
75922+ * submitted to disk. */
75923+ *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
75924+
75925+ /* count all records needed for storing of the wandered set */
75926+ get_tx_size(&ch);
75927+
75928+ ret = commit_tx(&ch);
75929+ if (ret)
75930+ goto up_and_ret;
75931+
75932+ spin_lock_atom(atom);
75933+ reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
75934+ spin_unlock_atom(atom);
75935+
75936+ ret = write_tx_back(&ch);
75937+ reiser4_post_write_back_hook();
75938+
75939+ up_and_ret:
75940+ if (ret) {
75941+		/* there could be fqs attached to the current atom; the only
75942+		   way to remove them is: */
75943+ current_atom_finish_all_fq();
75944+ }
75945+
75946+ /* free blocks of flushed transaction */
75947+ dealloc_tx_list(&ch);
75948+ dealloc_wmap(&ch);
75949+
75950+ put_overwrite_set(&ch);
75951+
75952+ done_commit_handle(&ch);
75953+
75954+ writeout_mode_disable();
75955+
75956+ return ret;
75957+}
75958+
75959+/* consistency checks for journal data/control blocks: header, footer, log
75960+   records, transaction head blocks. All functions return zero on success. */
75961+
75962+static int check_journal_header(const jnode * node UNUSED_ARG)
75963+{
75964+ /* FIXME: journal header has no magic field yet. */
75965+ return 0;
75966+}
75967+
75968+/* wait for write completion for all jnodes from given list */
75969+static int wait_on_jnode_list(struct list_head *head)
75970+{
75971+ jnode *scan;
75972+ int ret = 0;
75973+
75974+ list_for_each_entry(scan, head, capture_link) {
75975+ struct page *pg = jnode_page(scan);
75976+
75977+ if (pg) {
75978+ if (PageWriteback(pg))
75979+ wait_on_page_writeback(pg);
75980+
75981+ if (PageError(pg))
75982+ ret++;
75983+ }
75984+ }
75985+
75986+ return ret;
75987+}
75988+
75989+static int check_journal_footer(const jnode * node UNUSED_ARG)
75990+{
75991+ /* FIXME: journal footer has no magic field yet. */
75992+ return 0;
75993+}
75994+
75995+static int check_tx_head(const jnode * node)
75996+{
75997+ struct tx_header *header = (struct tx_header *)jdata(node);
75998+
75999+ if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
76000+ warning("zam-627", "tx head at block %s corrupted\n",
76001+ sprint_address(jnode_get_block(node)));
76002+ return RETERR(-EIO);
76003+ }
76004+
76005+ return 0;
76006+}
76007+
76008+static int check_wander_record(const jnode * node)
76009+{
76010+ struct wander_record_header *RH =
76011+ (struct wander_record_header *)jdata(node);
76012+
76013+ if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
76014+ 0) {
76015+ warning("zam-628", "wander record at block %s corrupted\n",
76016+ sprint_address(jnode_get_block(node)));
76017+ return RETERR(-EIO);
76018+ }
76019+
76020+ return 0;
76021+}
76022+
76023+/* fill the commit_handle structure with everything needed for update_journal_footer() */
76024+static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
76025+{
76026+ struct tx_header *TXH;
76027+ int ret;
76028+
76029+ ret = jload(tx_head);
76030+ if (ret)
76031+ return ret;
76032+
76033+ TXH = (struct tx_header *)jdata(tx_head);
76034+
76035+ ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
76036+ ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
76037+ ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
76038+
76039+ jrelse(tx_head);
76040+
76041+ list_add(&tx_head->capture_link, &ch->tx_list);
76042+
76043+ return 0;
76044+}
76045+
76046+/* replay one transaction: restore and write overwrite set in place */
76047+static int replay_transaction(const struct super_block *s,
76048+ jnode * tx_head,
76049+ const reiser4_block_nr * log_rec_block_p,
76050+ const reiser4_block_nr * end_block,
76051+ unsigned int nr_wander_records)
76052+{
76053+ reiser4_block_nr log_rec_block = *log_rec_block_p;
76054+ struct commit_handle ch;
76055+ LIST_HEAD(overwrite_set);
76056+ jnode *log;
76057+ int ret;
76058+
76059+ init_commit_handle(&ch, NULL);
76060+ ch.overwrite_set = &overwrite_set;
76061+
76062+ restore_commit_handle(&ch, tx_head);
76063+
76064+ while (log_rec_block != *end_block) {
76065+ struct wander_record_header *header;
76066+ struct wander_entry *entry;
76067+
76068+ int i;
76069+
76070+ if (nr_wander_records == 0) {
76071+ warning("zam-631",
76072+ "number of wander records in the linked list"
76073+				" is greater than the number stored in the tx head.\n");
76074+ ret = RETERR(-EIO);
76075+ goto free_ow_set;
76076+ }
76077+
76078+ log = reiser4_alloc_io_head(&log_rec_block);
76079+ if (log == NULL)
76080+ return RETERR(-ENOMEM);
76081+
76082+ ret = jload(log);
76083+ if (ret < 0) {
76084+ reiser4_drop_io_head(log);
76085+ return ret;
76086+ }
76087+
76088+ ret = check_wander_record(log);
76089+ if (ret) {
76090+ jrelse(log);
76091+ reiser4_drop_io_head(log);
76092+ return ret;
76093+ }
76094+
76095+ header = (struct wander_record_header *)jdata(log);
76096+ log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
76097+
76098+ entry = (struct wander_entry *)(header + 1);
76099+
76100+ /* restore overwrite set from wander record content */
76101+ for (i = 0; i < wander_record_capacity(s); i++) {
76102+ reiser4_block_nr block;
76103+ jnode *node;
76104+
76105+ block = le64_to_cpu(get_unaligned(&entry->wandered));
76106+ if (block == 0)
76107+ break;
76108+
76109+ node = reiser4_alloc_io_head(&block);
76110+ if (node == NULL) {
76111+ ret = RETERR(-ENOMEM);
76112+ /*
76113+ * FIXME-VS:???
76114+ */
76115+ jrelse(log);
76116+ reiser4_drop_io_head(log);
76117+ goto free_ow_set;
76118+ }
76119+
76120+ ret = jload(node);
76121+
76122+ if (ret < 0) {
76123+ reiser4_drop_io_head(node);
76124+ /*
76125+ * FIXME-VS:???
76126+ */
76127+ jrelse(log);
76128+ reiser4_drop_io_head(log);
76129+ goto free_ow_set;
76130+ }
76131+
76132+ block = le64_to_cpu(get_unaligned(&entry->original));
76133+
76134+ assert("zam-603", block != 0);
76135+
76136+ jnode_set_block(node, &block);
76137+
76138+ list_add_tail(&node->capture_link, ch.overwrite_set);
76139+
76140+ ++entry;
76141+ }
76142+
76143+ jrelse(log);
76144+ reiser4_drop_io_head(log);
76145+
76146+ --nr_wander_records;
76147+ }
76148+
76149+ if (nr_wander_records != 0) {
76150+ warning("zam-632", "number of wander records in the linked list"
76151+			" is less than the number stored in the tx head.\n");
76152+ ret = RETERR(-EIO);
76153+ goto free_ow_set;
76154+ }
76155+
76156+ { /* write wandered set in place */
76157+ write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
76158+ ret = wait_on_jnode_list(ch.overwrite_set);
76159+
76160+ if (ret) {
76161+ ret = RETERR(-EIO);
76162+ goto free_ow_set;
76163+ }
76164+ }
76165+
76166+ ret = update_journal_footer(&ch, 0);
76167+
76168+ free_ow_set:
76169+
76170+ while (!list_empty(ch.overwrite_set)) {
76171+ jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
76172+ list_del_init(&cur->capture_link);
76173+ jrelse(cur);
76174+ reiser4_drop_io_head(cur);
76175+ }
76176+
76177+ list_del_init(&tx_head->capture_link);
76178+
76179+ done_commit_handle(&ch);
76180+
76181+ return ret;
76182+}
76183+
76184+/* Find the oldest committed but not yet played transaction and play it. The
76185+ * transaction was committed and the journal header block was updated, but
76186+ * writing the atom's overwrite set in place and updating the journal footer
76187+ * block were not completed. This function completes the process by recovering
76188+ * the atom's overwrite set from the wandered locations, writing those blocks
76189+ * in place, and updating the journal footer. */
76190+static int replay_oldest_transaction(struct super_block *s)
76191+{
76192+ reiser4_super_info_data *sbinfo = get_super_private(s);
76193+ jnode *jf = sbinfo->journal_footer;
76194+ unsigned int total;
76195+ struct journal_footer *F;
76196+ struct tx_header *T;
76197+
76198+ reiser4_block_nr prev_tx;
76199+ reiser4_block_nr last_flushed_tx;
76200+ reiser4_block_nr log_rec_block = 0;
76201+
76202+ jnode *tx_head;
76203+
76204+ int ret;
76205+
76206+ if ((ret = jload(jf)) < 0)
76207+ return ret;
76208+
76209+ F = (struct journal_footer *)jdata(jf);
76210+
76211+ last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
76212+
76213+ jrelse(jf);
76214+
76215+ if (sbinfo->last_committed_tx == last_flushed_tx) {
76216+ /* all transactions are replayed */
76217+ return 0;
76218+ }
76219+
76220+ prev_tx = sbinfo->last_committed_tx;
76221+
76222+ /* searching for oldest not flushed transaction */
76223+ while (1) {
76224+ tx_head = reiser4_alloc_io_head(&prev_tx);
76225+ if (!tx_head)
76226+ return RETERR(-ENOMEM);
76227+
76228+ ret = jload(tx_head);
76229+ if (ret < 0) {
76230+ reiser4_drop_io_head(tx_head);
76231+ return ret;
76232+ }
76233+
76234+ ret = check_tx_head(tx_head);
76235+ if (ret) {
76236+ jrelse(tx_head);
76237+ reiser4_drop_io_head(tx_head);
76238+ return ret;
76239+ }
76240+
76241+ T = (struct tx_header *)jdata(tx_head);
76242+
76243+ prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
76244+
76245+ if (prev_tx == last_flushed_tx)
76246+ break;
76247+
76248+ jrelse(tx_head);
76249+ reiser4_drop_io_head(tx_head);
76250+ }
76251+
76252+ total = le32_to_cpu(get_unaligned(&T->total));
76253+ log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
76254+
76255+ pin_jnode_data(tx_head);
76256+ jrelse(tx_head);
76257+
76258+ ret =
76259+ replay_transaction(s, tx_head, &log_rec_block,
76260+ jnode_get_block(tx_head), total - 1);
76261+
76262+ unpin_jnode_data(tx_head);
76263+ reiser4_drop_io_head(tx_head);
76264+
76265+ if (ret)
76266+ return ret;
76267+ return -E_REPEAT;
76268+}
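
/*
 * A user-space sketch (hypothetical block numbers) of the backward walk
 * performed above: start from the journal header's last_committed_tx,
 * follow prev_tx links until the link equals the journal footer's
 * last_flushed_tx, replay that tx, and repeat until the two pointers
 * meet. Transactions are replayed oldest first:
 */
#include <stdio.h>

struct tx { unsigned long long block, prev_tx; };

/* chain of committed tx heads: A is the oldest, C the newest */
static struct tx txs[] = {
	{ 300, 200 },	/* C: prev_tx -> B */
	{ 200, 100 },	/* B: prev_tx -> A */
	{ 100, 50 },	/* A: prev_tx -> last flushed tx */
};

static struct tx *find_tx(unsigned long long block)
{
	int i;

	for (i = 0; i < 3; i++)
		if (txs[i].block == block)
			return &txs[i];
	return NULL;
}

int main(void)
{
	unsigned long long last_committed = 300, last_flushed = 50;

	while (last_committed != last_flushed) {
		struct tx *cur = find_tx(last_committed);

		/* walk back to the oldest tx not yet flushed */
		while (cur->prev_tx != last_flushed)
			cur = find_tx(cur->prev_tx);

		printf("replaying tx at block %llu\n", cur->block);
		last_flushed = cur->block;	/* journal footer update */
	}
	return 0;	/* output: 100, then 200, then 300 */
}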
76269+
76270+/* The current reiser4 journal implementation was optimized not to capture the
76271+   super block when certain super block fields are modified. Currently, that
76272+   set is (<free block count>, <OID allocator>). These fields are logged in a
76273+   special way which includes storing them in each transaction head block at
76274+   atom commit time and writing that information to the journal footer block at
76275+   atom flush time. To get the info from the journal footer block into the
76276+   in-memory super block there is a special function,
76277+   reiser4_journal_recover_sb_data(), which should be called after the disk
76278+   format plugin re-reads the super block after journal replay.
76279+*/
76280+
76281+/* get the information from the journal footer into the in-memory super block */
76282+int reiser4_journal_recover_sb_data(struct super_block *s)
76283+{
76284+ reiser4_super_info_data *sbinfo = get_super_private(s);
76285+ struct journal_footer *jf;
76286+ int ret;
76287+
76288+ assert("zam-673", sbinfo->journal_footer != NULL);
76289+
76290+ ret = jload(sbinfo->journal_footer);
76291+ if (ret != 0)
76292+ return ret;
76293+
76294+ ret = check_journal_footer(sbinfo->journal_footer);
76295+ if (ret != 0)
76296+ goto out;
76297+
76298+ jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
76299+
76300+ /* was there at least one flushed transaction? */
76301+ if (jf->last_flushed_tx) {
76302+
76303+ /* restore free block counter logged in this transaction */
76304+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
76305+
76306+ /* restore oid allocator state */
76307+ oid_init_allocator(s,
76308+ le64_to_cpu(get_unaligned(&jf->nr_files)),
76309+ le64_to_cpu(get_unaligned(&jf->next_oid)));
76310+ }
76311+ out:
76312+ jrelse(sbinfo->journal_footer);
76313+ return ret;
76314+}
76315+
76316+/* reiser4 replay journal procedure */
76317+int reiser4_journal_replay(struct super_block *s)
76318+{
76319+ reiser4_super_info_data *sbinfo = get_super_private(s);
76320+ jnode *jh, *jf;
76321+ struct journal_header *header;
76322+ int nr_tx_replayed = 0;
76323+ int ret;
76324+
76325+ assert("zam-582", sbinfo != NULL);
76326+
76327+ jh = sbinfo->journal_header;
76328+ jf = sbinfo->journal_footer;
76329+
76330+ if (!jh || !jf) {
76331+ /* it is possible that the disk layout does not support journal
76332+ structures; we just warn about this */
76333+ warning("zam-583",
76334+ "journal control blocks were not loaded by disk layout plugin. "
76335+ "journal replaying is not possible.\n");
76336+ return 0;
76337+ }
76338+
76339+ /* Take free block count from journal footer block. The free block
76340+ counter value corresponds to the last flushed transaction state */
76341+ ret = jload(jf);
76342+ if (ret < 0)
76343+ return ret;
76344+
76345+ ret = check_journal_footer(jf);
76346+ if (ret) {
76347+ jrelse(jf);
76348+ return ret;
76349+ }
76350+
76351+ jrelse(jf);
76352+
76353+ /* store last committed transaction info in reiser4 in-memory super
76354+ block */
76355+ ret = jload(jh);
76356+ if (ret < 0)
76357+ return ret;
76358+
76359+ ret = check_journal_header(jh);
76360+ if (ret) {
76361+ jrelse(jh);
76362+ return ret;
76363+ }
76364+
76365+ header = (struct journal_header *)jdata(jh);
76366+ sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
76367+
76368+ jrelse(jh);
76369+
76370+ /* replay committed transactions */
76371+ while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
76372+ nr_tx_replayed++;
76373+
76374+ return ret;
76375+}
76376+
76377+/* load journal control block (either journal header or journal footer block) */
76378+static int
76379+load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
76380+{
76381+ int ret;
76382+
76383+ *node = reiser4_alloc_io_head(block);
76384+ if (!(*node))
76385+ return RETERR(-ENOMEM);
76386+
76387+ ret = jload(*node);
76388+
76389+ if (ret) {
76390+ reiser4_drop_io_head(*node);
76391+ *node = NULL;
76392+ return ret;
76393+ }
76394+
76395+ pin_jnode_data(*node);
76396+ jrelse(*node);
76397+
76398+ return 0;
76399+}
76400+
76401+/* unload journal header or footer and free jnode */
76402+static void unload_journal_control_block(jnode ** node)
76403+{
76404+ if (*node) {
76405+ unpin_jnode_data(*node);
76406+ reiser4_drop_io_head(*node);
76407+ *node = NULL;
76408+ }
76409+}
76410+
76411+/* release journal control blocks */
76412+void reiser4_done_journal_info(struct super_block *s)
76413+{
76414+ reiser4_super_info_data *sbinfo = get_super_private(s);
76415+
76416+ assert("zam-476", sbinfo != NULL);
76417+
76418+ unload_journal_control_block(&sbinfo->journal_header);
76419+ unload_journal_control_block(&sbinfo->journal_footer);
76420+ rcu_barrier();
76421+}
76422+
76423+/* load journal control blocks */
76424+int reiser4_init_journal_info(struct super_block *s)
76425+{
76426+ reiser4_super_info_data *sbinfo = get_super_private(s);
76427+ journal_location *loc;
76428+ int ret;
76429+
76430+ loc = &sbinfo->jloc;
76431+
76432+ assert("zam-651", loc != NULL);
76433+ assert("zam-652", loc->header != 0);
76434+ assert("zam-653", loc->footer != 0);
76435+
76436+ ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
76437+
76438+ if (ret)
76439+ return ret;
76440+
76441+ ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
76442+
76443+ if (ret) {
76444+ unload_journal_control_block(&sbinfo->journal_header);
76445+ }
76446+
76447+ return ret;
76448+}
76449+
76450+/* Make Linus happy.
76451+ Local variables:
76452+ c-indentation-style: "K&R"
76453+ mode-name: "LC"
76454+ c-basic-offset: 8
76455+ tab-width: 8
76456+ fill-column: 80
76457+ End:
76458+*/
76459diff -urN linux-2.6.23.orig/fs/reiser4/wander.h linux-2.6.23/fs/reiser4/wander.h
76460--- linux-2.6.23.orig/fs/reiser4/wander.h 1970-01-01 03:00:00.000000000 +0300
76461+++ linux-2.6.23/fs/reiser4/wander.h 2007-12-04 16:49:30.000000000 +0300
76462@@ -0,0 +1,135 @@
76463+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
76464+
76465+#if !defined (__FS_REISER4_WANDER_H__)
76466+#define __FS_REISER4_WANDER_H__
76467+
76468+#include "dformat.h"
76469+
76470+#include <linux/fs.h> /* for struct super_block */
76471+
76472+/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */
76473+
76474+#define TX_HEADER_MAGIC "TxMagic4"
76475+#define WANDER_RECORD_MAGIC "LogMagc4"
76476+
76477+#define TX_HEADER_MAGIC_SIZE (8)
76478+#define WANDER_RECORD_MAGIC_SIZE (8)
76479+
76480+/* journal header block format */
76481+struct journal_header {
76482+ /* last written transaction head location */
76483+ d64 last_committed_tx;
76484+};
76485+
76486+typedef struct journal_location {
76487+ reiser4_block_nr footer;
76488+ reiser4_block_nr header;
76489+} journal_location;
76490+
76491+/* The wander.c head comment describes usage and semantics of all these structures */
76492+/* journal footer block format */
76493+struct journal_footer {
76494+ /* last flushed transaction location. */
76495+ /* This block number is no longer valid after the transaction it points
76496+ to gets flushed; it is used only at journal replay time to detect
76497+ the end of the on-disk list of committed transactions that were not
76498+ completely flushed */
76499+ d64 last_flushed_tx;
76500+
76501+ /* the free block counter is written in the journal footer at
76502+ transaction flush time, not in the super block, because it is
76503+ logged differently than super block fields (the root pointer, for
76504+ example). */
76505+ d64 free_blocks;
76506+
76507+ /* number of used OIDs and maximal used OID are logged separately from
76508+ super block */
76509+ d64 nr_files;
76510+ d64 next_oid;
76511+};
76512+
76513+/* Each wander record (except the first one) has a unified format: a wander
76514+ record header followed by an array of log entries */
76515+struct wander_record_header {
76516+ /* when there is no predefined location for wander records, this magic
76517+ string should help reiser4fsck. */
76518+ char magic[WANDER_RECORD_MAGIC_SIZE];
76519+
76520+ /* transaction id */
76521+ d64 id;
76522+
76523+ /* total number of wander records in current transaction */
76524+ d32 total;
76525+
76526+ /* this block number in transaction */
76527+ d32 serial;
76528+
76529+ /* next wander record location */
76530+ d64 next_block;
76531+};
76532+
76533+/* The first wander record (the transaction head) of a written transaction
76534+ has a special format */
76535+struct tx_header {
76536+ /* magic string makes first block in transaction different from other
76537+ logged blocks; it should help fsck. */
76538+ char magic[TX_HEADER_MAGIC_SIZE];
76539+
76540+ /* transaction id */
76541+ d64 id;
76542+
76543+ /* total number of records (including this first tx head) in the
76544+ transaction */
76545+ d32 total;
76546+
76547+ /* align next field to 8-byte boundary; this field is always zero */
76548+ d32 padding;
76549+
76550+ /* block number of previous transaction head */
76551+ d64 prev_tx;
76552+
76553+ /* next wander record location */
76554+ d64 next_block;
76555+
76556+ /* committed versions of free blocks counter */
76557+ d64 free_blocks;
76558+
76559+ /* number of used OIDs (nr_files) and maximal used OID are logged
76560+ separately from super block */
76561+ d64 nr_files;
76562+ d64 next_oid;
76563+};
76564+
76565+/* A transaction gets written to disk as a set of wander records (each wander
76566+ record is one fs block in size) */
76567+
76568+/* As noted above, the rest of a wander record after its header is filled with
76569+ these log entries; unused space is filled with zeroes */
76570+struct wander_entry {
76571+ d64 original; /* block original location */
76572+ d64 wandered; /* block wandered location */
76573+};
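A minimal sketch of what the structures above imply about capacity (the
helper name is hypothetical, not part of this patch):

	/* number of log entries that fit in one wander record block */
	static inline unsigned wander_entries_per_block(unsigned blksize)
	{
		return (blksize - sizeof(struct wander_record_header)) /
			sizeof(struct wander_entry);
	}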
76574+
76575+/* REISER4 JOURNAL WRITER FUNCTIONS */
76576+
76577+extern int reiser4_write_logs(long *);
76578+extern int reiser4_journal_replay(struct super_block *);
76579+extern int reiser4_journal_recover_sb_data(struct super_block *);
76580+
76581+extern int reiser4_init_journal_info(struct super_block *);
76582+extern void reiser4_done_journal_info(struct super_block *);
76583+
76584+extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
76585+
76586+#endif /* __FS_REISER4_WANDER_H__ */
76587+
76588+/* Make Linus happy.
76589+ Local variables:
76590+ c-indentation-style: "K&R"
76591+ mode-name: "LC"
76592+ c-basic-offset: 8
76593+ tab-width: 8
76594+ fill-column: 80
76595+ scroll-step: 1
76596+ End:
76597+*/
76598diff -urN linux-2.6.23.orig/fs/reiser4/writeout.h linux-2.6.23/fs/reiser4/writeout.h
76599--- linux-2.6.23.orig/fs/reiser4/writeout.h 1970-01-01 03:00:00.000000000 +0300
76600+++ linux-2.6.23/fs/reiser4/writeout.h 2007-12-04 16:49:30.000000000 +0300
76601@@ -0,0 +1,21 @@
76602+/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
76603+
76604+#if !defined (__FS_REISER4_WRITEOUT_H__)
76605+#define __FS_REISER4_WRITEOUT_H__
76606+#define WRITEOUT_SINGLE_STREAM (0x1)
76607+#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
76608+#define WRITEOUT_BARRIER (0x4)
76609+
76610+extern int reiser4_get_writeout_flags(void);
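The flags above form a bitmask; an illustrative (hypothetical) caller would
test them like this:

	int flags = reiser4_get_writeout_flags();

	if (flags & WRITEOUT_BARRIER) {
		/* submit the write with barrier semantics */
	}
	if (flags & WRITEOUT_FOR_PAGE_RECLAIM) {
		/* the writeout was triggered by memory reclaim */
	}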
76611+
76612+#endif /* __FS_REISER4_WRITEOUT_H__ */
76613+
76614+/* Make Linus happy.
76615+ Local variables:
76616+ c-indentation-style: "K&R"
76617+ mode-name: "LC"
76618+ c-basic-offset: 8
76619+ tab-width: 8
76620+ fill-column: 80
76621+ End:
76622+*/
76623diff -urN linux-2.6.23.orig/fs/reiser4/znode.c linux-2.6.23/fs/reiser4/znode.c
76624--- linux-2.6.23.orig/fs/reiser4/znode.c 1970-01-01 03:00:00.000000000 +0300
76625+++ linux-2.6.23/fs/reiser4/znode.c 2007-12-04 16:49:30.000000000 +0300
76626@@ -0,0 +1,1029 @@
76627+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76628+ * reiser4/README */
76629+/* Znode manipulation functions. */
76630+/* Znode is the in-memory header for a tree node. It is stored
76631+ separately from the node itself so that it does not get written to
76632+ disk. In this respect znode is like buffer head or page head. We
76633+ also use znodes for additional reiser4 specific purposes:
76634+
76635+ . they are organized into tree structure which is a part of whole
76636+ reiser4 tree.
76637+ . they are used to implement node grained locking
76638+ . they are used to keep additional state associated with a
76639+ node
76640+ . they contain links to lists used by the transaction manager
76641+
76642+ Znode is attached to some variable "block number" which is an instance of
76643+ fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
76644+ appropriate node being actually loaded in memory. Existence of znode itself
76645+ is regulated by reference count (->x_count) in it. Each time thread
76646+ acquires reference to znode through call to zget(), ->x_count is
76647+ incremented and decremented on call to zput(). Data (content of node) are
76648+ brought into memory through call to zload(), which also increments ->d_count
76649+ reference counter. zload can block waiting on IO. Call to zrelse()
76650+ decreases this counter. Also, ->c_count keeps track of number of child
76651+ znodes and prevents parent znode from being recycled until all of its
76652+ children are. ->c_count is decremented whenever child goes out of existence
76653+ (being actually recycled in zdestroy()) which can be some time after last
76654+ reference to this child dies if we support some form of LRU cache for
76655+ znodes.
76656+
76657+*/
76658+/* EVERY ZNODE'S STORY
76659+
76660+ 1. His infancy.
76661+
76662+ Once upon a time, the znode was born deep inside of zget() by call to
76663+ zalloc(). At the return from zget() znode had:
76664+
76665+ . reference counter (x_count) of 1
76666+ . assigned block number, marked as used in bitmap
76667+ . pointer to parent znode. Root znode parent pointer points
76668+ to its father: "fake" znode. This, in turn, has NULL parent pointer.
76669+ . hash table linkage
76670+ . no data loaded from disk
76671+ . no node plugin
76672+ . no sibling linkage
76673+
76674+ 2. His childhood
76675+
76676+ Each node is either brought into memory as a result of tree traversal, or
76677+ created afresh, creation of the root being a special case of the latter. In
76678+ either case it's inserted into sibling list. This will typically require
76679+ some ancillary tree traversing, but ultimately both sibling pointers will
76680+ exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
76681+ zjnode.state.
76682+
76683+ 3. His youth.
76684+
76685+ If znode is bound to already existing node in a tree, its content is read
76686+ from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
76687+ in zjnode.state and zdata() function starts to return non null for this
76688+ znode. zload() further calls zparse() that determines which node layout
76689+ this node is rendered in, and sets ->nplug on success.
76690+
76691+ If znode is for new node just created, memory for it is allocated and
76692+ zinit_new() function is called to initialise data, according to selected
76693+ node layout.
76694+
76695+ 4. His maturity.
76696+
76697+ After this point, znode lingers in memory for some time. Threads can
76698+ acquire references to znode either by blocknr through call to zget(), or by
76699+ following a pointer to unallocated znode from internal item. Each time
76700+ reference to znode is obtained, x_count is increased. Thread can read/write
76701+ lock znode. Znode data can be loaded through calls to zload(), d_count will
76702+ be increased appropriately. If all references to znode are released
76703+ (x_count drops to 0), znode is not recycled immediately. Rather, it is
76704+ still cached in the hash table in the hope that it will be accessed
76705+ shortly.
76706+
76707+ There are two ways in which znode existence can be terminated:
76708+
76709+ . sudden death: node bound to this znode is removed from the tree
76710+ . overpopulation: znode is purged out of memory due to memory pressure
76711+
76712+ 5. His death.
76713+
76714+ Death is complex process.
76715+
76716+ When we irrevocably commit ourselves to decision to remove node from the
76717+ tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
76718+ znode. This is done either in ->kill_hook() of internal item or in
76719+ reiser4_kill_root() function when tree root is removed.
76720+
76721+ At this moment znode still has:
76722+
76723+ . locks held on it, necessarily write ones
76724+ . references to it
76725+ . disk block assigned to it
76726+ . data loaded from the disk
76727+ . pending requests for lock
76728+
76729+ But once the JNODE_HEARD_BANSHEE bit is set, the last call to unlock_znode()
76730+ does node deletion. Node deletion includes two phases. First, all ways to get
76731+ references to that znode (sibling and parent links and hash lookup using
76732+ block number stored in parent node) should be deleted -- it is done through
76733+ sibling_list_remove(), also we assume that nobody uses down link from
76734+ parent node due to its nonexistence or proper parent node locking and
76735+ nobody uses parent pointers from children due to absence of them. Second we
76736+ invalidate all pending lock requests which still are on znode's lock
76737+ request queue; this is done by reiser4_invalidate_lock(). Another znode
76738+ status bit, JNODE_IS_DYING, is used to invalidate pending lock requests.
76739+ Once it is set, all requesters are forced to return -EINVAL from
76740+ longterm_lock_znode(). Future locking attempts are not possible because all
76741+ ways to get references to that znode are removed already. Last, node is
76742+ uncaptured from transaction.
76743+
76744+ When last reference to the dying znode is just about to be released,
76745+ block number for this lock is released and znode is removed from the
76746+ hash table.
76747+
76748+ Now znode can be recycled.
76749+
76750+ [it's possible to free bitmap block and remove znode from the hash
76751+ table when last lock is released. This will result in having
76752+ referenced but completely orphaned znode]
76753+
76754+ 6. Limbo
76755+
76756+ As has been mentioned above, znodes with reference counter 0 are
76757+ still cached in the hash table. Once memory pressure increases they are
76758+ purged out of there [this requires something like LRU list for
76759+ efficient implementation. LRU list would also greatly simplify
76760+ implementation of coord cache that would in this case morph to just
76761+ scanning some initial segment of LRU list]. Data loaded into
76762+ unreferenced znode are flushed back to the durable storage if
76763+ necessary and memory is freed. Znodes themselves can be recycled at
76764+ this point too.
76765+
76766+*/
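A minimal usage sketch of the life cycle described above, using only
functions from this patch (error handling abbreviated):

	znode *node;
	int ret;

	node = zget(tree, &blocknr, parent, level, GFP_KERNEL); /* x_count++ */
	if (IS_ERR(node))
		return PTR_ERR(node);
	ret = zload(node);					/* d_count++ */
	if (ret == 0) {
		/* ... access the node content via zdata(node) ... */
		zrelse(node);					/* d_count-- */
	}
	zput(node);						/* x_count-- */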
76767+
76768+#include "debug.h"
76769+#include "dformat.h"
76770+#include "key.h"
76771+#include "coord.h"
76772+#include "plugin/plugin_header.h"
76773+#include "plugin/node/node.h"
76774+#include "plugin/plugin.h"
76775+#include "txnmgr.h"
76776+#include "jnode.h"
76777+#include "znode.h"
76778+#include "block_alloc.h"
76779+#include "tree.h"
76780+#include "tree_walk.h"
76781+#include "super.h"
76782+#include "reiser4.h"
76783+
76784+#include <linux/pagemap.h>
76785+#include <linux/spinlock.h>
76786+#include <linux/slab.h>
76787+#include <linux/err.h>
76788+
76789+static z_hash_table *get_htable(reiser4_tree *,
76790+ const reiser4_block_nr * const blocknr);
76791+static z_hash_table *znode_get_htable(const znode *);
76792+static void zdrop(znode *);
76793+
76794+/* hash table support */
76795+
76796+/* compare two block numbers for equality. Used by hash-table macros */
76797+static inline int
76798+blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
76799+{
76800+ assert("nikita-534", b1 != NULL);
76801+ assert("nikita-535", b2 != NULL);
76802+
76803+ return *b1 == *b2;
76804+}
76805+
76806+/* Hash znode by block number. Used by hash-table macros */
76807+/* Audited by: umka (2002.06.11) */
76808+static inline __u32
76809+blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
76810+{
76811+ assert("nikita-536", b != NULL);
76812+
76813+ return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
76814+}
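Since the mask form requires it, REISER4_ZNODE_HASH_TABLE_SIZE must be a
power of two, which makes the AND a cheap modulo: with a (hypothetical)
table size of 4096 (0x1000), block number 0x12345 hashes to bucket
0x12345 & 0xfff == 0x345.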
76815+
76816+/* The hash table definition */
76817+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
76818+#define KFREE(ptr, size) kfree(ptr)
76819+TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
76820+ blknrhashfn, blknreq);
76821+#undef KFREE
76822+#undef KMALLOC
76823+
76824+/* slab for znodes */
76825+static struct kmem_cache *znode_cache;
76826+
76827+int znode_shift_order;
76828+
76829+/**
76830+ * init_znodes - create znode cache
76831+ *
76832+ * Initializes slab cache of znodes. It is part of reiser4 module initialization.
76833+ */
76834+int init_znodes(void)
76835+{
76836+ znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
76837+ SLAB_HWCACHE_ALIGN |
76838+ SLAB_RECLAIM_ACCOUNT, NULL);
76839+ if (znode_cache == NULL)
76840+ return RETERR(-ENOMEM);
76841+
76842+ for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
76843+ ++znode_shift_order);
76844+ --znode_shift_order;
76845+ return 0;
76846+}
76847+
76848+/**
76849+ * done_znodes - delete znode cache
76850+ *
76851+ * This is called on reiser4 module unloading or system shutdown.
76852+ */
76853+void done_znodes(void)
76854+{
76855+ destroy_reiser4_cache(&znode_cache);
76856+}
76857+
76858+/* call this to initialise tree of znodes */
76859+int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
76860+{
76861+ int result;
76862+ assert("umka-050", tree != NULL);
76863+
76864+ rwlock_init(&tree->dk_lock);
76865+
76866+ result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
76867+ if (result != 0)
76868+ return result;
76869+ result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
76870+ return result;
76871+}
76872+
76873+/* free this znode */
76874+void zfree(znode * node /* znode to free */ )
76875+{
76876+ assert("nikita-465", node != NULL);
76877+ assert("nikita-2120", znode_page(node) == NULL);
76878+ assert("nikita-2301", list_empty_careful(&node->lock.owners));
76879+ assert("nikita-2302", list_empty_careful(&node->lock.requestors));
76880+ assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
76881+ NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
76882+ assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
76883+ assert("nikita-3293", !znode_is_right_connected(node));
76884+ assert("nikita-3294", !znode_is_left_connected(node));
76885+ assert("nikita-3295", node->left == NULL);
76886+ assert("nikita-3296", node->right == NULL);
76887+
76888+ /* not yet phash_jnode_destroy(ZJNODE(node)); */
76889+
76890+ kmem_cache_free(znode_cache, node);
76891+}
76892+
76893+/* call this to free tree of znodes */
76894+void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
76895+{
76896+ znode *node;
76897+ znode *next;
76898+ z_hash_table *ztable;
76899+
76900+ /* scan znode hash-tables and kill all znodes, then free hash tables
76901+ * themselves. */
76902+
76903+ assert("nikita-795", tree != NULL);
76904+
76905+ ztable = &tree->zhash_table;
76906+
76907+ if (ztable->_table != NULL) {
76908+ for_all_in_htable(ztable, z, node, next) {
76909+ node->c_count = 0;
76910+ node->in_parent.node = NULL;
76911+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
76912+ zdrop(node);
76913+ }
76914+
76915+ z_hash_done(&tree->zhash_table);
76916+ }
76917+
76918+ ztable = &tree->zfake_table;
76919+
76920+ if (ztable->_table != NULL) {
76921+ for_all_in_htable(ztable, z, node, next) {
76922+ node->c_count = 0;
76923+ node->in_parent.node = NULL;
76924+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
76925+ zdrop(node);
76926+ }
76927+
76928+ z_hash_done(&tree->zfake_table);
76929+ }
76930+}
76931+
76932+/* ZNODE STRUCTURES */
76933+
76934+/* allocate fresh znode */
76935+znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
76936+{
76937+ znode *node;
76938+
76939+ node = kmem_cache_alloc(znode_cache, gfp_flag);
76940+ return node;
76941+}
76942+
76943+/* Initialize fields of znode
76944+ @node: znode to initialize;
76945+ @parent: parent znode;
76946+ @tree: tree we are in. */
76947+void zinit(znode * node, const znode * parent, reiser4_tree * tree)
76948+{
76949+ assert("nikita-466", node != NULL);
76950+ assert("umka-268", current_tree != NULL);
76951+
76952+ memset(node, 0, sizeof *node);
76953+
76954+ assert("umka-051", tree != NULL);
76955+
76956+ jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
76957+ reiser4_init_lock(&node->lock);
76958+ init_parent_coord(&node->in_parent, parent);
76959+}
76960+
76961+/*
76962+ * remove znode from indices. This is called by jput() when the last
76963+ * reference on the znode is released.
76964+ */
76965+void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
76966+{
76967+ assert("nikita-2108", node != NULL);
76968+ assert("nikita-470", node->c_count == 0);
76969+ assert_rw_write_locked(&(tree->tree_lock));
76970+
76971+ /* remove reference to this znode from cbk cache */
76972+ cbk_cache_invalidate(node, tree);
76973+
76974+ /* update c_count of parent */
76975+ if (znode_parent(node) != NULL) {
76976+ assert("nikita-472", znode_parent(node)->c_count > 0);
76977+ /* father, onto your hands I forward my spirit... */
76978+ znode_parent(node)->c_count--;
76979+ node->in_parent.node = NULL;
76980+ } else {
76981+ /* orphaned znode?! Root? */
76982+ }
76983+
76984+ /* remove znode from hash-table */
76985+ z_hash_remove_rcu(znode_get_htable(node), node);
76986+}
76987+
76988+/* zdrop() -- Remove znode from the tree.
76989+
76990+ This is called when the znode is removed from memory. */
76991+static void zdrop(znode * node /* znode to finish with */ )
76992+{
76993+ jdrop(ZJNODE(node));
76994+}
76995+
76996+/*
76997+ * put znode into right place in the hash table. This is called by relocate
76998+ * code.
76999+ */
77000+int znode_rehash(znode * node /* node to rehash */ ,
77001+ const reiser4_block_nr * new_block_nr /* new block number */ )
77002+{
77003+ z_hash_table *oldtable;
77004+ z_hash_table *newtable;
77005+ reiser4_tree *tree;
77006+
77007+ assert("nikita-2018", node != NULL);
77008+
77009+ tree = znode_get_tree(node);
77010+ oldtable = znode_get_htable(node);
77011+ newtable = get_htable(tree, new_block_nr);
77012+
77013+ write_lock_tree(tree);
77014+ /* remove znode from hash-table */
77015+ z_hash_remove_rcu(oldtable, node);
77016+
77017+ /* assertion no longer valid due to RCU */
77018+ /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
77019+
77020+ /* update blocknr */
77021+ znode_set_block(node, new_block_nr);
77022+ node->zjnode.key.z = *new_block_nr;
77023+
77024+ /* insert it into hash */
77025+ z_hash_insert_rcu(newtable, node);
77026+ write_unlock_tree(tree);
77027+ return 0;
77028+}
77029+
77030+/* ZNODE LOOKUP, GET, PUT */
77031+
77032+/* zlook() - get znode with given block_nr in a hash table or return NULL
77033+
77034+ If result is non-NULL then the znode's x_count is incremented. Internal version
77035+ accepts a pre-computed hash index. The hash table is accessed under the
77036+ RCU read lock.
77037+*/
77038+znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
77039+{
77040+ znode *result;
77041+ __u32 hash;
77042+ z_hash_table *htable;
77043+
77044+ assert("jmacd-506", tree != NULL);
77045+ assert("jmacd-507", blocknr != NULL);
77046+
77047+ htable = get_htable(tree, blocknr);
77048+ hash = blknrhashfn(htable, blocknr);
77049+
77050+ rcu_read_lock();
77051+ result = z_hash_find_index(htable, hash, blocknr);
77052+
77053+ if (result != NULL) {
77054+ add_x_ref(ZJNODE(result));
77055+ result = znode_rip_check(tree, result);
77056+ }
77057+ rcu_read_unlock();
77058+
77059+ return result;
77060+}
77061+
77062+/* return hash table where znode with block @blocknr is (or should be)
77063+ * stored */
77064+static z_hash_table *get_htable(reiser4_tree * tree,
77065+ const reiser4_block_nr * const blocknr)
77066+{
77067+ z_hash_table *table;
77068+ if (is_disk_addr_unallocated(blocknr))
77069+ table = &tree->zfake_table;
77070+ else
77071+ table = &tree->zhash_table;
77072+ return table;
77073+}
77074+
77075+/* return hash table where znode @node is (or should be) stored */
77076+static z_hash_table *znode_get_htable(const znode * node)
77077+{
77078+ return get_htable(znode_get_tree(node), znode_get_block(node));
77079+}
77080+
77081+/* zget() - get znode from hash table, allocating it if necessary.
77082+
77083+ First a call to zlook, locating an x-referenced znode if one
77084+ exists. If znode is not found, allocate new one and return. Result
77085+ is returned with x_count reference increased.
77086+
77087+ LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK
77088+ LOCK ORDERING: NONE
77089+*/
77090+znode *zget(reiser4_tree * tree,
77091+ const reiser4_block_nr * const blocknr,
77092+ znode * parent, tree_level level, gfp_t gfp_flag)
77093+{
77094+ znode *result;
77095+ __u32 hashi;
77096+
77097+ z_hash_table *zth;
77098+
77099+ assert("jmacd-512", tree != NULL);
77100+ assert("jmacd-513", blocknr != NULL);
77101+ assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
77102+
77103+ zth = get_htable(tree, blocknr);
77104+ hashi = blknrhashfn(zth, blocknr);
77105+
77106+ /* NOTE-NIKITA address-as-unallocated-blocknr still is not
77107+ implemented. */
77108+
77109+ z_hash_prefetch_bucket(zth, hashi);
77110+
77111+ rcu_read_lock();
77112+ /* Find a matching BLOCKNR in the hash table. If the znode is found,
77113+ we obtain a reference (x_count) but the znode remains unlocked.
77114+ Have to worry about race conditions later. */
77115+ result = z_hash_find_index(zth, hashi, blocknr);
77116+ /* According to the current design, the hash table lock protects new
77117+ znode references. */
77118+ if (result != NULL) {
77119+ add_x_ref(ZJNODE(result));
77120+ /* NOTE-NIKITA it should be so, but special case during
77121+ creation of new root makes such assertion highly
77122+ complicated. */
77123+ assert("nikita-2131", 1 || znode_parent(result) == parent ||
77124+ (ZF_ISSET(result, JNODE_ORPHAN)
77125+ && (znode_parent(result) == NULL)));
77126+ result = znode_rip_check(tree, result);
77127+ }
77128+
77129+ rcu_read_unlock();
77130+
77131+ if (!result) {
77132+ znode *shadow;
77133+
77134+ result = zalloc(gfp_flag);
77135+ if (!result) {
77136+ return ERR_PTR(RETERR(-ENOMEM));
77137+ }
77138+
77139+ zinit(result, parent, tree);
77140+ ZJNODE(result)->blocknr = *blocknr;
77141+ ZJNODE(result)->key.z = *blocknr;
77142+ result->level = level;
77143+
77144+ write_lock_tree(tree);
77145+
77146+ shadow = z_hash_find_index(zth, hashi, blocknr);
77147+ if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
77148+ jnode_list_remove(ZJNODE(result));
77149+ zfree(result);
77150+ result = shadow;
77151+ } else {
77152+ result->version = znode_build_version(tree);
77153+ z_hash_insert_index_rcu(zth, hashi, result);
77154+
77155+ if (parent != NULL)
77156+ ++parent->c_count;
77157+ }
77158+
77159+ add_x_ref(ZJNODE(result));
77160+
77161+ write_unlock_tree(tree);
77162+ }
77163+#if REISER4_DEBUG
77164+ if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0)
77165+ reiser4_check_block(blocknr, 1);
77166+#endif
77167+ /* Check for invalid tree level, return -EIO */
77168+ if (unlikely(znode_get_level(result) != level)) {
77169+ warning("jmacd-504",
77170+ "Wrong level for cached block %llu: %i expecting %i",
77171+ (unsigned long long)(*blocknr), znode_get_level(result),
77172+ level);
77173+ zput(result);
77174+ return ERR_PTR(RETERR(-EIO));
77175+ }
77176+
77177+ assert("nikita-1227", znode_invariant(result));
77178+
77179+ return result;
77180+}
77181+
77182+/* ZNODE PLUGINS/DATA */
77183+
77184+/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is
77185+ stored at the fixed offset from the beginning of the node. */
77186+static node_plugin *znode_guess_plugin(const znode * node /* znode to guess
77187+ * plugin of */ )
77188+{
77189+ reiser4_tree *tree;
77190+
77191+ assert("nikita-1053", node != NULL);
77192+ assert("nikita-1055", zdata(node) != NULL);
77193+
77194+ tree = znode_get_tree(node);
77195+ assert("umka-053", tree != NULL);
77196+
77197+ if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
77198+ return tree->nplug;
77199+ } else {
77200+ return node_plugin_by_disk_id
77201+ (tree, &((common_node_header *) zdata(node))->plugin_id);
77202+#ifdef GUESS_EXISTS
77203+ reiser4_plugin *plugin;
77204+
77205+ /* NOTE-NIKITA add locking here when dynamic plugins will be
77206+ * implemented */
77207+ for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
77208+ if ((plugin->u.node.guess != NULL)
77209+ && plugin->u.node.guess(node))
77210+ return plugin;
77211+ }
77212+ warning("nikita-1057", "Cannot guess node plugin");
77213+ print_znode("node", node);
77214+ return NULL;
77215+#endif
77216+ }
77217+}
77218+
77219+/* parse node header and install ->node_plugin */
77220+int zparse(znode * node /* znode to parse */ )
77221+{
77222+ int result;
77223+
77224+ assert("nikita-1233", node != NULL);
77225+ assert("nikita-2370", zdata(node) != NULL);
77226+
77227+ if (node->nplug == NULL) {
77228+ node_plugin *nplug;
77229+
77230+ nplug = znode_guess_plugin(node);
77231+ if (likely(nplug != NULL)) {
77232+ result = nplug->parse(node);
77233+ if (likely(result == 0))
77234+ node->nplug = nplug;
77235+ } else {
77236+ result = RETERR(-EIO);
77237+ }
77238+ } else
77239+ result = 0;
77240+ return result;
77241+}
77242+
77243+/* zload with readahead */
77244+int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
77245+{
77246+ int result;
77247+
77248+ assert("nikita-484", node != NULL);
77249+ assert("nikita-1377", znode_invariant(node));
77250+ assert("jmacd-7771", !znode_above_root(node));
77251+ assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
77252+ assert("nikita-3016", reiser4_schedulable());
77253+
77254+ if (info)
77255+ formatted_readahead(node, info);
77256+
77257+ result = jload(ZJNODE(node));
77258+ assert("nikita-1378", znode_invariant(node));
77259+ return result;
77260+}
77261+
77262+/* load content of node into memory */
77263+int zload(znode * node)
77264+{
77265+ return zload_ra(node, NULL);
77266+}
77267+
77268+/* call node plugin to initialise newly allocated node. */
77269+int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
77270+{
77271+ return jinit_new(ZJNODE(node), gfp_flags);
77272+}
77273+
77274+/* drop reference to node data. When last reference is dropped, data are
77275+ unloaded. */
77276+void zrelse(znode * node /* znode to release references to */ )
77277+{
77278+ assert("nikita-1381", znode_invariant(node));
77279+
77280+ jrelse(ZJNODE(node));
77281+}
77282+
77283+/* returns free space in node */
77284+unsigned znode_free_space(znode * node /* znode to query */ )
77285+{
77286+ assert("nikita-852", node != NULL);
77287+ return node_plugin_by_node(node)->free_space(node);
77288+}
77289+
77290+/* right delimiting key of znode */
77291+reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
77292+{
77293+ assert("nikita-958", node != NULL);
77294+ assert_rw_locked(&(znode_get_tree(node)->dk_lock));
77295+ assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
77296+ assert("nikita-30671", node->rd_key_version != 0);
77297+ return &node->rd_key;
77298+}
77299+
77300+/* left delimiting key of znode */
77301+reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
77302+{
77303+ assert("nikita-974", node != NULL);
77304+ assert_rw_locked(&(znode_get_tree(node)->dk_lock));
77305+ assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
77306+ assert("nikita-30681", node->ld_key_version != 0);
77307+ return &node->ld_key;
77308+}
77309+
77310+ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
77311+ )
77312+
77313+/* update right-delimiting key of @node */
77314+reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
77315+{
77316+ assert("nikita-2937", node != NULL);
77317+ assert("nikita-2939", key != NULL);
77318+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
77319+ assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
77320+ assert("nikita-2944",
77321+ znode_is_any_locked(node) ||
77322+ znode_get_level(node) != LEAF_LEVEL ||
77323+ keyge(key, &node->rd_key) ||
77324+ keyeq(&node->rd_key, reiser4_min_key()) ||
77325+ ZF_ISSET(node, JNODE_HEARD_BANSHEE));
77326+
77327+ node->rd_key = *key;
77328+ ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
77329+ return &node->rd_key;
77330+}
77331+
77332+/* update left-delimiting key of @node */
77333+reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
77334+{
77335+ assert("nikita-2940", node != NULL);
77336+ assert("nikita-2941", key != NULL);
77337+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
77338+ assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
77339+ assert("nikita-2943",
77340+ znode_is_any_locked(node) || keyeq(&node->ld_key,
77341+ reiser4_min_key()));
77342+
77343+ node->ld_key = *key;
77344+ ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
77345+ return &node->ld_key;
77346+}
77347+
77348+/* true if @key is inside key range for @node */
77349+int znode_contains_key(znode * node /* znode to look in */ ,
77350+ const reiser4_key * key /* key to look for */ )
77351+{
77352+ assert("nikita-1237", node != NULL);
77353+ assert("nikita-1238", key != NULL);
77354+
77355+ /* left_delimiting_key <= key <= right_delimiting_key */
77356+ return keyle(znode_get_ld_key(node), key)
77357+ && keyle(key, znode_get_rd_key(node));
77358+}
77359+
77360+/* same as znode_contains_key(), but lock dk lock */
77361+int znode_contains_key_lock(znode * node /* znode to look in */ ,
77362+ const reiser4_key * key /* key to look for */ )
77363+{
77364+ int result;
77365+
77366+ assert("umka-056", node != NULL);
77367+ assert("umka-057", key != NULL);
77368+
77369+ read_lock_dk(znode_get_tree(node));
77370+ result = znode_contains_key(node, key);
77371+ read_unlock_dk(znode_get_tree(node));
77372+ return result;
77373+}
77374+
77375+/* get parent pointer, assuming tree is not locked */
77376+znode *znode_parent_nolock(const znode * node /* child znode */ )
77377+{
77378+ assert("nikita-1444", node != NULL);
77379+ return node->in_parent.node;
77380+}
77381+
77382+/* get parent pointer of znode */
77383+znode *znode_parent(const znode * node /* child znode */ )
77384+{
77385+ assert("nikita-1226", node != NULL);
77386+ assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
77387+ return znode_parent_nolock(node);
77388+}
77389+
77390+/* detect uber znode used to protect in-superblock tree root pointer */
77391+int znode_above_root(const znode * node /* znode to query */ )
77392+{
77393+ assert("umka-059", node != NULL);
77394+
77395+ return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
77396+}
77397+
77398+/* check that @node is root---that its block number is recorded in the tree as
77399+ that of root node */
77400+#if REISER4_DEBUG
77401+static int znode_is_true_root(const znode * node /* znode to query */ )
77402+{
77403+ assert("umka-060", node != NULL);
77404+ assert("umka-061", current_tree != NULL);
77405+
77406+ return disk_addr_eq(znode_get_block(node),
77407+ &znode_get_tree(node)->root_block);
77408+}
77409+#endif
77410+
77411+/* check that @node is root */
77412+int znode_is_root(const znode * node /* znode to query */ )
77413+{
77414+ assert("nikita-1206", node != NULL);
77415+
77416+ return znode_get_level(node) == znode_get_tree(node)->height;
77417+}
77418+
77419+/* Returns true if @node was just created by zget() and wasn't ever loaded
77420+ into memory. */
77421+/* NIKITA-HANS: yes */
77422+int znode_just_created(const znode * node)
77423+{
77424+ assert("nikita-2188", node != NULL);
77425+ return (znode_page(node) == NULL);
77426+}
77427+
77428+/* obtain updated ->znode_epoch. See seal.c for description. */
77429+__u64 znode_build_version(reiser4_tree * tree)
77430+{
77431+ __u64 result;
77432+
77433+ spin_lock(&tree->epoch_lock);
77434+ result = ++tree->znode_epoch;
77435+ spin_unlock(&tree->epoch_lock);
77436+ return result;
77437+}
77438+
77439+void init_load_count(load_count * dh)
77440+{
77441+ assert("nikita-2105", dh != NULL);
77442+ memset(dh, 0, sizeof *dh);
77443+}
77444+
77445+void done_load_count(load_count * dh)
77446+{
77447+ assert("nikita-2106", dh != NULL);
77448+ if (dh->node != NULL) {
77449+ for (; dh->d_ref > 0; --dh->d_ref)
77450+ zrelse(dh->node);
77451+ dh->node = NULL;
77452+ }
77453+}
77454+
77455+static int incr_load_count(load_count * dh)
77456+{
77457+ int result;
77458+
77459+ assert("nikita-2110", dh != NULL);
77460+ assert("nikita-2111", dh->node != NULL);
77461+
77462+ result = zload(dh->node);
77463+ if (result == 0)
77464+ ++dh->d_ref;
77465+ return result;
77466+}
77467+
77468+int incr_load_count_znode(load_count * dh, znode * node)
77469+{
77470+ assert("nikita-2107", dh != NULL);
77471+ assert("nikita-2158", node != NULL);
77472+ assert("nikita-2109",
77473+ ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
77474+
77475+ dh->node = node;
77476+ return incr_load_count(dh);
77477+}
77478+
77479+int incr_load_count_jnode(load_count * dh, jnode * node)
77480+{
77481+ if (jnode_is_znode(node)) {
77482+ return incr_load_count_znode(dh, JZNODE(node));
77483+ }
77484+ return 0;
77485+}
77486+
77487+void copy_load_count(load_count * new, load_count * old)
77488+{
77489+ int ret = 0;
77490+ done_load_count(new);
77491+ new->node = old->node;
77492+ new->d_ref = 0;
77493+
77494+ while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
77495+ }
77496+
77497+ assert("jmacd-87589", ret == 0);
77498+}
77499+
77500+void move_load_count(load_count * new, load_count * old)
77501+{
77502+ done_load_count(new);
77503+ new->node = old->node;
77504+ new->d_ref = old->d_ref;
77505+ old->node = NULL;
77506+ old->d_ref = 0;
77507+}
77508+
77509+/* convert parent pointer into coord */
77510+void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
77511+{
77512+ assert("nikita-3204", pcoord != NULL);
77513+ assert("nikita-3205", coord != NULL);
77514+
77515+ coord_init_first_unit_nocheck(coord, pcoord->node);
77516+ coord_set_item_pos(coord, pcoord->item_pos);
77517+ coord->between = AT_UNIT;
77518+}
77519+
77520+/* pack coord into parent_coord_t */
77521+void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
77522+{
77523+ assert("nikita-3206", pcoord != NULL);
77524+ assert("nikita-3207", coord != NULL);
77525+
77526+ pcoord->node = coord->node;
77527+ pcoord->item_pos = coord->item_pos;
77528+}
77529+
77530+/* Initialize a parent hint pointer. (parent hint pointer is a field in znode,
77531+ look for comments there) */
77532+void init_parent_coord(parent_coord_t * pcoord, const znode * node)
77533+{
77534+ pcoord->node = (znode *) node;
77535+ pcoord->item_pos = (unsigned short)~0;
77536+}
77537+
77538+#if REISER4_DEBUG
77539+
77540+/* debugging aid: znode invariant */
77541+static int znode_invariant_f(const znode * node /* znode to check */ ,
77542+ char const **msg /* where to store error
77543+ * message, if any */ )
77544+{
77545+#define _ergo(ant, con) \
77546+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
77547+
77548+#define _equi(e1, e2) \
77549+ ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
77550+
77551+#define _check(exp) ((*msg) = #exp, (exp))
77552+
77553+ return jnode_invariant_f(ZJNODE(node), msg) &&
77554+ /* [znode-fake] invariant */
77555+ /* fake znode doesn't have a parent, and */
77556+ _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
77557+ /* there is another way to express this very check, and */
77558+ _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
77559+ /* it has special block number, and */
77560+ _ergo(znode_get_level(node) == 0,
77561+ disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
77562+ /* it is the only znode with such block number, and */
77563+ _ergo(!znode_above_root(node) && znode_is_loaded(node),
77564+ !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
77565+ /* it is parent of the tree root node */
77566+ _ergo(znode_is_true_root(node),
77567+ znode_above_root(znode_parent(node))) &&
77568+ /* [znode-level] invariant */
77569+ /* level of parent znode is one larger than that of child,
77570+ except for the fake znode, and */
77571+ _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
77572+ znode_get_level(znode_parent(node)) ==
77573+ znode_get_level(node) + 1) &&
77574+ /* left neighbor is at the same level, and */
77575+ _ergo(znode_is_left_connected(node) && node->left != NULL,
77576+ znode_get_level(node) == znode_get_level(node->left)) &&
77577+ /* right neighbor is at the same level */
77578+ _ergo(znode_is_right_connected(node) && node->right != NULL,
77579+ znode_get_level(node) == znode_get_level(node->right)) &&
77580+ /* [znode-connected] invariant */
77581+ _ergo(node->left != NULL, znode_is_left_connected(node)) &&
77582+ _ergo(node->right != NULL, znode_is_right_connected(node)) &&
77583+ _ergo(!znode_is_root(node) && node->left != NULL,
77584+ znode_is_right_connected(node->left) &&
77585+ node->left->right == node) &&
77586+ _ergo(!znode_is_root(node) && node->right != NULL,
77587+ znode_is_left_connected(node->right) &&
77588+ node->right->left == node) &&
77589+ /* [znode-c_count] invariant */
77590+ /* for any znode, c_count of its parent is greater than 0 */
77591+ _ergo(znode_parent(node) != NULL &&
77592+ !znode_above_root(znode_parent(node)),
77593+ znode_parent(node)->c_count > 0) &&
77594+ /* leaves don't have children */
77595+ _ergo(znode_get_level(node) == LEAF_LEVEL,
77596+ node->c_count == 0) &&
77597+ _check(node->zjnode.jnodes.prev != NULL) &&
77598+ _check(node->zjnode.jnodes.next != NULL) &&
77599+ /* orphan doesn't have a parent */
77600+ _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
77601+ /* [znode-modify] invariant */
77602+ /* if znode is not write-locked, its checksum remains
77603+ * invariant */
77604+ /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
77605+ * cannot check this. */
77606+ /* [znode-refs] invariant */
77607+ /* only referenced znode can be long-term locked */
77608+ _ergo(znode_is_locked(node),
77609+ atomic_read(&ZJNODE(node)->x_count) != 0);
77610+}
77611+
77612+/* debugging aid: check znode invariant and panic if it doesn't hold */
77613+int znode_invariant(znode * node /* znode to check */ )
77614+{
77615+ char const *failed_msg;
77616+ int result;
77617+
77618+ assert("umka-063", node != NULL);
77619+ assert("umka-064", current_tree != NULL);
77620+
77621+ spin_lock_znode(node);
77622+ read_lock_tree(znode_get_tree(node));
77623+ result = znode_invariant_f(node, &failed_msg);
77624+ if (!result) {
77625+ /* print_znode("corrupted node", node); */
77626+ warning("jmacd-555", "Condition %s failed", failed_msg);
77627+ }
77628+ read_unlock_tree(znode_get_tree(node));
77629+ spin_unlock_znode(node);
77630+ return result;
77631+}
77632+
77633+/* return non-0 iff data are loaded into znode */
77634+int znode_is_loaded(const znode * node /* znode to query */ )
77635+{
77636+ assert("nikita-497", node != NULL);
77637+ return jnode_is_loaded(ZJNODE(node));
77638+}
77639+
77640+unsigned long znode_times_locked(const znode * z)
77641+{
77642+ return z->times_locked;
77643+}
77644+
77645+#endif /* REISER4_DEBUG */
77646+
77647+/* Make Linus happy.
77648+ Local variables:
77649+ c-indentation-style: "K&R"
77650+ mode-name: "LC"
77651+ c-basic-offset: 8
77652+ tab-width: 8
77653+ fill-column: 120
77654+ End:
77655+*/
77656diff -urN linux-2.6.23.orig/fs/reiser4/znode.h linux-2.6.23/fs/reiser4/znode.h
77657--- linux-2.6.23.orig/fs/reiser4/znode.h 1970-01-01 03:00:00.000000000 +0300
77658+++ linux-2.6.23/fs/reiser4/znode.h 2007-12-04 16:49:30.000000000 +0300
77659@@ -0,0 +1,434 @@
77660+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
77661+ * reiser4/README */
77662+
77663+/* Declaration of znode (Zam's node). See znode.c for more details. */
77664+
77665+#ifndef __ZNODE_H__
77666+#define __ZNODE_H__
77667+
77668+#include "forward.h"
77669+#include "debug.h"
77670+#include "dformat.h"
77671+#include "key.h"
77672+#include "coord.h"
77673+#include "plugin/node/node.h"
77674+#include "jnode.h"
77675+#include "lock.h"
77676+#include "readahead.h"
77677+
77678+#include <linux/types.h>
77679+#include <linux/spinlock.h>
77680+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
77681+#include <asm/atomic.h>
77682+#include <asm/semaphore.h>
77683+
77684+/* znode tracks its position within parent (internal item in a parent node,
77685+ * that contains znode's block number). */
77686+typedef struct parent_coord {
77687+ znode *node;
77688+ pos_in_node_t item_pos;
77689+} parent_coord_t;
77690+
77691+/* &znode - node in a reiser4 tree.
77692+
77693+ NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
77694+ cacheline pressure.
77695+
77696+ Locking:
77697+
77698+ Long term: data in a disk node attached to this znode are protected
77699+ by long term, deadlock aware lock ->lock;
77700+
77701+ Spin lock: the following fields are protected by the spin lock:
77702+
77703+ ->lock
77704+
77705+ Following fields are protected by the global tree lock:
77706+
77707+ ->left
77708+ ->right
77709+ ->in_parent
77710+ ->c_count
77711+
77712+ Following fields are protected by the global delimiting key lock (dk_lock):
77713+
77714+ ->ld_key (to update ->ld_key long-term lock on the node is also required)
77715+ ->rd_key
77716+
77717+ Following fields are protected by the long term lock:
77718+
77719+ ->nr_items
77720+
77721+ ->node_plugin is never changed once set. This means that after code made
77722+ itself sure that field is valid it can be accessed without any additional
77723+ locking.
77724+
77725+ ->level is immutable.
77726+
77727+ Invariants involving this data-type:
77728+
77729+ [znode-fake]
77730+ [znode-level]
77731+ [znode-connected]
77732+ [znode-c_count]
77733+ [znode-refs]
77734+ [jnode-refs]
77735+ [jnode-queued]
77736+ [znode-modify]
77737+
77738+ For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
77739+ Suggestions for how to do that are desired.*/
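A minimal sketch of the dk_lock protocol listed above, using helpers that
appear elsewhere in this patch (znode_get_tree(), read_lock_dk()):

	reiser4_key key;

	read_lock_dk(znode_get_tree(node));
	key = *znode_get_ld_key(node);	/* dk_lock held, as required */
	read_unlock_dk(znode_get_tree(node));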
77740+struct znode {
77741+ /* Embedded jnode. */
77742+ jnode zjnode;
77743+
77744+ /* contains two subfields, node and item_pos.
77745+
77746+ item_pos is only a hint that is cached to
77747+ speed up lookups during balancing. It is not required to be up to
77748+ date. Synched in find_child_ptr().
77749+
77750+ This value allows us to avoid expensive binary searches.
77751+
77752+ in_parent->node points to the parent of this node, and is NOT a
77753+ hint.
77754+ */
77755+ parent_coord_t in_parent;
77756+
77757+ /*
77758+ * sibling list pointers
77759+ */
77760+
77761+ /* left-neighbor */
77762+ znode *left;
77763+ /* right-neighbor */
77764+ znode *right;
77765+
77766+ /* long term lock on node content. This lock supports deadlock
77767+ detection. See lock.c
77768+ */
77769+ zlock lock;
77770+
77771+ /* You cannot remove from memory a node that has children in
77772+ memory. This is because we rely on the fact that parent of given
77773+ node can always be reached without blocking for io. When reading a
77774+ node into memory you must increase the c_count of its parent, when
77775+ removing it from memory you must decrease the c_count. This makes
77776+ the code simpler, and the cases where it is suboptimal are truly
77777+ obscure.
77778+ */
77779+ int c_count;
77780+
77781+ /* plugin of node attached to this znode. NULL if znode is not
77782+ loaded. */
77783+ node_plugin *nplug;
77784+
77785+ /* version of znode data. This is increased on each modification. This
77786+ * is necessary to implement seals (see seal.[ch]) efficiently. */
77787+ __u64 version;
77788+
77789+ /* left delimiting key. Necessary to efficiently perform
77790+ balancing with node-level locking. Kept in memory only. */
77791+ reiser4_key ld_key;
77792+ /* right delimiting key. */
77793+ reiser4_key rd_key;
77794+
77795+ /* znode's tree level */
77796+ __u16 level;
77797+ /* number of items in this node. This field is modified by node
77798+ * plugin. */
77799+ __u16 nr_items;
77800+
77801+#if REISER4_DEBUG
77802+ void *creator;
77803+ reiser4_key first_key;
77804+ unsigned long times_locked;
77805+ int left_version; /* when node->left was updated */
77806+ int right_version; /* when node->right was updated */
77807+ int ld_key_version; /* when node->ld_key was updated */
77808+ int rd_key_version; /* when node->rd_key was updated */
77809+#endif
77810+
77811+} __attribute__ ((aligned(16)));
77812+
77813+ON_DEBUG(extern atomic_t delim_key_version;
77814+ )
77815+
77816+/* In general I think these macros should not be exposed. */
77817+#define znode_is_locked(node) (lock_is_locked(&node->lock))
77818+#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock))
77819+#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock))
77820+#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock))
77821+#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock))
77822+#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
77823+/* Macros for accessing the znode state. */
77824+#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f))
77825+#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f))
77826+#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f))
77827+extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
77828+ znode * parent, tree_level level, gfp_t gfp_flag);
77829+extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
77830+extern int zload(znode * node);
77831+extern int zload_ra(znode * node, ra_info_t * info);
77832+extern int zinit_new(znode * node, gfp_t gfp_flags);
77833+extern void zrelse(znode * node);
77834+extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
77835+
77836+/* size of data in znode */
77837+static inline unsigned
77838+znode_size(const znode * node UNUSED_ARG /* znode to query */ )
77839+{
77840+ assert("nikita-1416", node != NULL);
77841+ return PAGE_CACHE_SIZE;
77842+}
77843+
77844+extern void parent_coord_to_coord(const parent_coord_t * pcoord,
77845+ coord_t * coord);
77846+extern void coord_to_parent_coord(const coord_t * coord,
77847+ parent_coord_t * pcoord);
77848+extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
77849+
77850+extern unsigned znode_free_space(znode * node);
77851+
77852+extern reiser4_key *znode_get_rd_key(znode * node);
77853+extern reiser4_key *znode_get_ld_key(znode * node);
77854+
77855+extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
77856+extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
77857+
77858+/* `connected' state checks */
77859+static inline int znode_is_right_connected(const znode * node)
77860+{
77861+ return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
77862+}
77863+
77864+static inline int znode_is_left_connected(const znode * node)
77865+{
77866+ return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
77867+}
77868+
77869+static inline int znode_is_connected(const znode * node)
77870+{
77871+ return znode_is_right_connected(node) && znode_is_left_connected(node);
77872+}
77873+
77874+extern int znode_shift_order;
77875+extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
77876+extern void znode_remove(znode *, reiser4_tree *);
77877+extern znode *znode_parent(const znode * node);
77878+extern znode *znode_parent_nolock(const znode * node);
77879+extern int znode_above_root(const znode * node);
77880+extern int init_znodes(void);
77881+extern void done_znodes(void);
77882+extern int znodes_tree_init(reiser4_tree * ztree);
77883+extern void znodes_tree_done(reiser4_tree * ztree);
77884+extern int znode_contains_key(znode * node, const reiser4_key * key);
77885+extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
77886+extern unsigned znode_save_free_space(znode * node);
77887+extern unsigned znode_recover_free_space(znode * node);
77888+extern znode *zalloc(gfp_t gfp_flag);
77889+extern void zinit(znode *, const znode * parent, reiser4_tree *);
77890+extern int zparse(znode * node);
77891+
77892+extern int znode_just_created(const znode * node);
77893+
77894+extern void zfree(znode * node);
77895+
77896+#if REISER4_DEBUG
77897+extern void print_znode(const char *prefix, const znode * node);
77898+#else
77899+#define print_znode( p, n ) noop
77900+#endif
77901+
77902+/* Make it look like various znode functions exist instead of treating znodes as
77903+ jnodes in znode-specific code. */
77904+#define znode_page(x) jnode_page ( ZJNODE(x) )
77905+#define zdata(x) jdata ( ZJNODE(x) )
77906+#define znode_get_block(x) jnode_get_block ( ZJNODE(x) )
77907+#define znode_created(x) jnode_created ( ZJNODE(x) )
77908+#define znode_set_created(x) jnode_set_created ( ZJNODE(x) )
77909+#define znode_convertible(x) jnode_convertible (ZJNODE(x))
77910+#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x))
77911+
77912+#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) )
77913+#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) )
77914+#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) )
77915+#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) )
77916+
77917+#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) )
77918+#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) )
77919+#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) )
77920+#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) )
77921+#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
77922+
77923+#if REISER4_DEBUG
77924+extern int znode_x_count_is_protected(const znode * node);
77925+extern int znode_invariant(znode * node);
77926+#endif
77927+
77928+/* acquire reference to @node */
77929+static inline znode *zref(znode * node)
77930+{
77931+ /* change of x_count from 0 to 1 is protected by tree spin-lock */
77932+ return JZNODE(jref(ZJNODE(node)));
77933+}
77934+
77935+/* release reference to @node */
77936+static inline void zput(znode * node)
77937+{
77938+ assert("nikita-3564", znode_invariant(node));
77939+ jput(ZJNODE(node));
77940+}
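/* Editorial sketch (not part of the original patch): the intended
 * zref()/zput() pairing, using only the inlines declared above.  The
 * function name and the blocking work are hypothetical; the point is
 * that the extra reference taken by zref() keeps the znode alive until
 * the matching zput(). */
static void example_pin_znode(znode *node)
{
	node = zref(node);	/* x_count++: node cannot be freed under us */
	/* ... potentially blocking work on @node ... */
	zput(node);		/* drop our reference */
}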
77941+
77942+/* get the level field for a znode */
77943+static inline tree_level znode_get_level(const znode * node)
77944+{
77945+ return node->level;
77946+}
77947+
77948+/* get the level field for a jnode */
77949+static inline tree_level jnode_get_level(const jnode * node)
77950+{
77951+ if (jnode_is_znode(node))
77952+ return znode_get_level(JZNODE(node));
77953+ else
 77954+ /* unformatted nodes are all at the LEAF_LEVEL, and for
 77955+ "semi-formatted" nodes like bitmaps the level doesn't matter. */
77956+ return LEAF_LEVEL;
77957+}
77958+
77959+/* true if jnode is on leaf level */
77960+static inline int jnode_is_leaf(const jnode * node)
77961+{
77962+ if (jnode_is_znode(node))
77963+ return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
77964+ if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
77965+ return 1;
77966+ return 0;
77967+}
77968+
77969+/* return znode's tree */
77970+static inline reiser4_tree *znode_get_tree(const znode * node)
77971+{
77972+ assert("nikita-2692", node != NULL);
77973+ return jnode_get_tree(ZJNODE(node));
77974+}
77975+
77976+/* resolve race with zput */
77977+static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
77978+{
77979+ jnode *j;
77980+
77981+ j = jnode_rip_sync(tree, ZJNODE(node));
77982+ if (likely(j != NULL))
77983+ node = JZNODE(j);
77984+ else
77985+ node = NULL;
77986+ return node;
77987+}
77988+
77989+#if defined(REISER4_DEBUG)
77990+int znode_is_loaded(const znode * node /* znode to query */ );
77991+#endif
77992+
77993+extern __u64 znode_build_version(reiser4_tree * tree);
77994+
 77995+/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We
 77996+ must load the data for a node in many places. We could do this by simply calling
 77997+ zload() everywhere, but the difficulty arises when we must release the loaded data by
 77998+ calling zrelse(). In a function with many possible error/return paths, it takes extra
 77999+ work to figure out which exit paths must call zrelse() and which do not. The data
 78000+ handle automatically calls zrelse() for every zload() it is responsible for. In that
 78001+ sense, it acts much like a lock_handle.
 78002+*/
78003+typedef struct load_count {
78004+ znode *node;
78005+ int d_ref;
78006+} load_count;
78007+
 78008+extern void init_load_count(load_count * lc); /* Initialize a load_count; set the current node to NULL. */
 78009+extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary. */
 78010+extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode as the current node and call zload(). */
 78011+extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as
 78012+ * incr_load_count_znode(); otherwise do nothing (unformatted nodes
 78013+ * don't require zload/zrelse treatment). */
 78014+extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */
 78015+extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */
78016+
78017+/* Variable initializers for load_count. */
 78018+#define INIT_LOAD_COUNT ( load_count ){ .node = NULL, .d_ref = 0 }
78019+#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
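/* Editorial sketch (not part of the original patch): typical load_count
 * usage, assuming only the declarations above.  The handle records
 * whether zload() succeeded, so the exit path can unconditionally call
 * done_load_count() instead of matching zload()/zrelse() by hand. */
static int example_with_loaded_data(znode *node)
{
	load_count lc;
	int ret;

	init_load_count(&lc);
	ret = incr_load_count_znode(&lc, node);	/* calls zload(node) */
	if (ret == 0) {
		/* ... inspect zdata(node); an early return would be safe
		 * as long as it also calls done_load_count() ... */
	}
	done_load_count(&lc);	/* calls zrelse() only if zload() succeeded */
	return ret;
}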
78020+/* A convenience macro for use in assertions or debug-only code, where loaded
78021+ data is only required to perform the debugging check. This macro
78022+ encapsulates an expression inside a pair of calls to zload()/zrelse(). */
78023+#define WITH_DATA( node, exp ) \
78024+({ \
78025+ long __with_dh_result; \
78026+ znode *__with_dh_node; \
78027+ \
78028+ __with_dh_node = ( node ); \
78029+ __with_dh_result = zload( __with_dh_node ); \
78030+ if( __with_dh_result == 0 ) { \
78031+ __with_dh_result = ( long )( exp ); \
78032+ zrelse( __with_dh_node ); \
78033+ } \
78034+ __with_dh_result; \
78035+})
78036+
 78037+/* Same as above, but takes a value to evaluate to if zload() fails. */
78038+#define WITH_DATA_RET( node, ret, exp ) \
78039+({ \
78040+ int __with_dh_result; \
78041+ znode *__with_dh_node; \
78042+ \
78043+ __with_dh_node = ( node ); \
78044+ __with_dh_result = zload( __with_dh_node ); \
78045+ if( __with_dh_result == 0 ) { \
78046+ __with_dh_result = ( int )( exp ); \
78047+ zrelse( __with_dh_node ); \
78048+ } else \
78049+ __with_dh_result = ( ret ); \
78050+ __with_dh_result; \
78051+})
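/* Editorial sketch (not part of the original patch): WITH_DATA is meant
 * for debug checks that need the node's data loaded only for the length
 * of one expression; node_is_sane() and the assertion label are
 * hypothetical. */
assert("example-1", WITH_DATA(node, node_is_sane(node)));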
78052+
78053+#define WITH_COORD(coord, exp) \
78054+({ \
78055+ coord_t *__coord; \
78056+ \
78057+ __coord = (coord); \
78058+ coord_clear_iplug(__coord); \
78059+ WITH_DATA(__coord->node, exp); \
78060+})
78061+
78062+#if REISER4_DEBUG
78063+#define STORE_COUNTERS \
78064+ reiser4_lock_cnt_info __entry_counters = \
78065+ *reiser4_lock_counters()
78066+#define CHECK_COUNTERS \
78067+ON_DEBUG_CONTEXT( \
78068+({ \
78069+ __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \
78070+ __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \
78071+ __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \
78072+ assert("nikita-2159", \
78073+ !memcmp(&__entry_counters, reiser4_lock_counters(), \
78074+ sizeof __entry_counters)); \
78075+}) )
78076+
78077+#else
78078+#define STORE_COUNTERS
78079+#define CHECK_COUNTERS noop
78080+#endif
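/* Editorial sketch (not part of the original patch): bracket a function
 * body with STORE_COUNTERS/CHECK_COUNTERS to assert, in debug builds,
 * that it leaves the per-context lock counters balanced (x_refs, t_refs
 * and d_refs are deliberately excluded by CHECK_COUNTERS above).  The
 * function itself is hypothetical. */
static void example_counter_checked(znode *node)
{
	STORE_COUNTERS;		/* snapshot of *reiser4_lock_counters() */

	if (zload(node) == 0)
		zrelse(node);	/* balanced: counters return to the snapshot */

	CHECK_COUNTERS;		/* assert nothing else leaked */
}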
78081+
78082+/* __ZNODE_H__ */
78083+#endif
78084+
78085+/* Make Linus happy.
78086+ Local variables:
78087+ c-indentation-style: "K&R"
78088+ mode-name: "LC"
78089+ c-basic-offset: 8
78090+ tab-width: 8
78091+ fill-column: 120
78092+ End:
78093+*/
78094diff -urN linux-2.6.23.orig/include/linux/fs.h linux-2.6.23/include/linux/fs.h
78095--- linux-2.6.23.orig/include/linux/fs.h 2007-10-10 00:31:38.000000000 +0400
78096+++ linux-2.6.23/include/linux/fs.h 2007-12-04 20:02:08.277902069 +0300
78097@@ -1198,6 +1198,8 @@
78098 void (*clear_inode) (struct inode *);
78099 void (*umount_begin) (struct vfsmount *, int);
78100
78101+ void (*sync_inodes) (struct super_block *sb,
78102+ struct writeback_control *wbc);
78103 int (*show_options)(struct seq_file *, struct vfsmount *);
78104 int (*show_stats)(struct seq_file *, struct vfsmount *);
78105 #ifdef CONFIG_QUOTA
78106@@ -1539,6 +1541,7 @@
78107 extern int invalidate_inode_pages2_range(struct address_space *mapping,
78108 pgoff_t start, pgoff_t end);
78109 extern int write_inode_now(struct inode *, int);
78110+extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *);
78111 extern int filemap_fdatawrite(struct address_space *);
78112 extern int filemap_flush(struct address_space *);
78113 extern int filemap_fdatawait(struct address_space *);
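/* Editorial sketch (not part of the patch): how a filesystem could wire
 * up the ->sync_inodes hook added to struct super_operations above,
 * delegating to the newly exported generic_sync_sb_inodes().  The
 * examplefs_ name is hypothetical; reiser4's own implementation lives
 * elsewhere in this patch. */
static void examplefs_sync_inodes(struct super_block *sb,
				  struct writeback_control *wbc)
{
	/* fs-specific pre-writeback work would go here */
	generic_sync_sb_inodes(sb, wbc);
}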
78114diff -urN linux-2.6.23.orig/mm/filemap.c linux-2.6.23/mm/filemap.c
78115--- linux-2.6.23.orig/mm/filemap.c 2007-10-10 00:31:38.000000000 +0400
78116+++ linux-2.6.23/mm/filemap.c 2007-12-04 20:02:09.534225346 +0300
78117@@ -122,6 +122,7 @@
78118 __dec_zone_page_state(page, NR_FILE_PAGES);
78119 BUG_ON(page_mapped(page));
78120 }
78121+EXPORT_SYMBOL(__remove_from_page_cache);
78122
78123 void remove_from_page_cache(struct page *page)
78124 {
78125@@ -133,6 +134,7 @@
78126 __remove_from_page_cache(page);
78127 write_unlock_irq(&mapping->tree_lock);
78128 }
78129+EXPORT_SYMBOL(remove_from_page_cache);
78130
78131 static int sync_page(void *word)
78132 {
78133@@ -720,6 +722,7 @@
78134 read_unlock_irq(&mapping->tree_lock);
78135 return ret;
78136 }
78137+EXPORT_SYMBOL(add_to_page_cache_lru);
78138
78139 /**
78140 * find_get_pages_contig - gang contiguous pagecache lookup
78141@@ -839,6 +842,7 @@
78142
78143 ra->ra_pages /= 4;
78144 }
78145+EXPORT_SYMBOL(find_get_pages);
78146
78147 /**
78148 * do_generic_mapping_read - generic file read routine