diff -urN linux-2.6.24.orig/arch/x86/lib/usercopy_32.c linux-2.6.24/arch/x86/lib/usercopy_32.c
--- linux-2.6.24.orig/arch/x86/lib/usercopy_32.c 2008-01-25 14:24:08.234127530 +0300
+++ linux-2.6.24/arch/x86/lib/usercopy_32.c 2008-01-25 11:39:06.872191202 +0300
@@ -817,6 +817,7 @@
 #endif
 return n;
 }
+EXPORT_SYMBOL(__copy_from_user_ll_nocache);
 
 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
 unsigned long n)
@@ -831,6 +832,7 @@
 #endif
 return n;
 }
+EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
 
 /**
 * copy_to_user: - Copy a block of data into user space.
diff -urN linux-2.6.24.orig/Documentation/Changes linux-2.6.24/Documentation/Changes
--- linux-2.6.24.orig/Documentation/Changes 2007-10-10 00:31:38.000000000 +0400
+++ linux-2.6.24/Documentation/Changes 2008-01-25 11:39:06.876192233 +0300
@@ -36,6 +36,7 @@
 o e2fsprogs 1.29 # tune2fs
 o jfsutils 1.1.3 # fsck.jfs -V
 o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs
+o reiser4progs 1.0.0 # fsck.reiser4 -V
 o xfsprogs 2.6.0 # xfs_db -V
 o pcmciautils 004 # pccardctl -V
 o quota-tools 3.09 # quota -V
@@ -145,6 +146,13 @@
 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
 reiserfsck. These utils work on both i386 and alpha platforms.
 
+Reiser4progs
+------------
+
+The reiser4progs package contains utilities for the reiser4 file system.
+Detailed instructions are provided in the README file located at:
+<ftp://ftp.namesys.com/pub/reiser4progs/README>.
+
 Xfsprogs
 --------
 
@@ -323,6 +331,10 @@
 -------------
 o <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
 
+Reiser4progs
+------------
+o <ftp://ftp.namesys.com/pub/reiser4progs/>
+
 Xfsprogs
 --------
 o <ftp://oss.sgi.com/projects/xfs/download/>
diff -urN linux-2.6.24.orig/Documentation/filesystems/reiser4.txt linux-2.6.24/Documentation/filesystems/reiser4.txt
--- linux-2.6.24.orig/Documentation/filesystems/reiser4.txt 1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.24/Documentation/filesystems/reiser4.txt 2008-01-25 11:39:06.876192233 +0300
@@ -0,0 +1,75 @@
+Reiser4 filesystem
+==================
+Reiser4 is a file system based on dancing tree algorithms, and is
+described at http://www.namesys.com
+
+
+References
+==========
+web page http://namesys.com/v4/v4.html
+source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/
+userland tools ftp://ftp.namesys.com/pub/reiser4progs/
+install page http://www.namesys.com/install_v4.html
+
+Compile options
+===============
+Enable reiser4 debug mode
+ This checks everything imaginable while reiser4
+ runs
+
+Mount options
+=============
+tmgr.atom_max_size=N
+ Atoms containing more than N blocks will be forced to commit.
+ N is decimal.
+ Default is nr_free_pagecache_pages() / 2 at mount time.
+
+tmgr.atom_max_age=N
+ Atoms older than N seconds will be forced to commit. N is decimal.
+ Default is 600.
+
+tmgr.atom_max_flushers=N
+ Limit of concurrent flushers for one atom. 0 means no limit.
+ Default is 0.
+
+tree.cbk_cache.nr_slots=N
+ Number of slots in the cbk cache.
+
+flush.relocate_threshold=N
+ If flush finds more than N adjacent dirty leaf-level blocks it
+ will force them to be relocated.
+ Default is 64.
+
+flush.relocate_distance=N
+ If flush can find a block allocation closer than at most
+ N blocks from the preceder, it will relocate to that position.
+ Default is 64.
+
+flush.scan_maxnodes=N
+ The maximum number of nodes to scan left on a level during
+ flush.
+ Default is 10000.
+
+optimal_io_size=N
+ Preferred IO size. This value is used to set st_blksize of
+ struct stat.
+ Default is 65536.
+
+bsdgroups
+ Turn on BSD-style gid assignment.
+
+32bittimes
+ By default files in reiser4 have 64 bit timestamps. Files
+ created when the filesystem is mounted with the 32bittimes mount
+ option will get 32 bit timestamps.
+
+mtflush
+ Turn off concurrent flushing.
+
+nopseudo
+ Disable pseudo files support. See
+ http://namesys.com/v4/pseudo.html for more about pseudo files.
+
+dont_load_bitmap
+ Don't load all bitmap blocks at mount time; this is useful for
+ machines with tiny RAM and large disks.
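
As an aside on the mount options documented above: they are passed as an ordinary option string. A minimal userspace sketch using mount(2) follows; the device and mount point paths are hypothetical, and the option values merely echo the documented defaults.

/* Hedged sketch only: mount a reiser4 volume with some of the options
 * documented above. /dev/sdb1 and /mnt/r4 are hypothetical paths. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* commit atoms after 300 seconds or 65536 blocks; BSD-style gids */
	const char *opts = "tmgr.atom_max_age=300,"
			   "tmgr.atom_max_size=65536,bsdgroups";

	if (mount("/dev/sdb1", "/mnt/r4", "reiser4", 0, opts) != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}
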
diff -urN linux-2.6.24.orig/fs/fs-writeback.c linux-2.6.24/fs/fs-writeback.c
--- linux-2.6.24.orig/fs/fs-writeback.c 2008-01-25 14:24:18.344724018 +0300
+++ linux-2.6.24/fs/fs-writeback.c 2008-01-25 11:39:06.876192233 +0300
@@ -386,8 +386,6 @@
 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
 * that it can be located for waiting on in __writeback_single_inode().
 *
- * Called under inode_lock.
- *
 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 * This function assumes that the blockdev superblock's inodes are backed by
 * a variety of queues, so all inodes are searched. For other superblocks,
@@ -403,11 +401,13 @@
 * on the writer throttling path, and we get decent balancing between many
 * throttled threads: we don't want them all piling up on inode_sync_wait.
 */
-static void
-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+void
+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 {
 const unsigned long start = jiffies; /* livelock avoidance */
 
+ spin_lock(&inode_lock);
+
 if (!wbc->for_kupdate || list_empty(&sb->s_io))
 queue_io(sb, wbc->older_than_this);
 
@@ -482,8 +482,19 @@
 if (wbc->nr_to_write <= 0)
 break;
 }
+ spin_unlock(&inode_lock);
 return; /* Leave any unwritten inodes on s_io */
 }
+EXPORT_SYMBOL(generic_sync_sb_inodes);
+
+static void
+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+{
+ if (sb->s_op->sync_inodes)
+ sb->s_op->sync_inodes(sb, wbc);
+ else
+ generic_sync_sb_inodes(sb, wbc);
+}
 
 /*
 * Start writeback of dirty pagecache data against all unlocked inodes.
@@ -524,11 +535,8 @@
 * be unmounted by the time it is released.
 */
 if (down_read_trylock(&sb->s_umount)) {
- if (sb->s_root) {
- spin_lock(&inode_lock);
+ if (sb->s_root)
 sync_sb_inodes(sb, wbc);
- spin_unlock(&inode_lock);
- }
 up_read(&sb->s_umount);
 }
 spin_lock(&sb_lock);
@@ -566,9 +574,7 @@
 (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
 nr_dirty + nr_unstable;
 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
 sync_sb_inodes(sb, &wbc);
 }
 
 /*
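
The net effect of this hunk is that inode_lock moves inside generic_sync_sb_inodes(), and a filesystem can now take over per-superblock writeback through the ->sync_inodes hook this patch introduces on struct super_operations. A hedged sketch of how a filesystem might use it; the myfs_* names are illustrative, and reiser4's real implementation appears further down in this patch.

/* Sketch only: route per-sb writeback through a filesystem pass,
 * then fall back to the generic helper exported above. */
static void myfs_sync_inodes(struct super_block *sb,
			     struct writeback_control *wbc)
{
	/* filesystem-specific work would go here; reiser4, for
	 * example, captures dirty pages into atoms before writing */
	generic_sync_sb_inodes(sb, wbc);
}

static struct super_operations myfs_super_ops = {
	/* ... other super operations ... */
	.sync_inodes = myfs_sync_inodes,
};
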
diff -urN linux-2.6.24.orig/fs/Kconfig linux-2.6.24/fs/Kconfig
--- linux-2.6.24.orig/fs/Kconfig 2008-01-25 14:24:17.976629488 +0300
+++ linux-2.6.24/fs/Kconfig 2008-01-25 11:39:06.880193263 +0300
@@ -273,6 +273,8 @@
 default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
 default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
 
+source "fs/reiser4/Kconfig"
+
 config REISERFS_FS
 tristate "Reiserfs support"
 help
diff -urN linux-2.6.24.orig/fs/Makefile linux-2.6.24/fs/Makefile
--- linux-2.6.24.orig/fs/Makefile 2008-01-25 14:24:17.980630515 +0300
+++ linux-2.6.24/fs/Makefile 2008-01-25 11:39:06.884194294 +0300
@@ -66,6 +66,7 @@
 
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS) += reiserfs/
+obj-$(CONFIG_REISER4_FS) += reiser4/
 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev
 obj-$(CONFIG_JBD) += jbd/
diff -urN linux-2.6.24.orig/fs/reiser4/as_ops.c linux-2.6.24/fs/reiser4/as_ops.c
--- linux-2.6.24.orig/fs/reiser4/as_ops.c 1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.24/fs/reiser4/as_ops.c 2008-01-25 11:39:06.884194294 +0300
@@ -0,0 +1,377 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Interface to VFS. Reiser4 address_space_operations are defined here. */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/file/file.h"
+#include "plugin/security/perm.h"
+#include "plugin/disk_format/disk_format.h"
+#include "plugin/plugin.h"
+#include "plugin/plugin_set.h"
+#include "plugin/object.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "page_cache.h"
+#include "ktxnmgrd.h"
+#include "super.h"
+#include "reiser4.h"
+#include "entd.h"
+
+#include <linux/profile.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/mm.h>
+#include <linux/buffer_head.h>
+#include <linux/dcache.h>
+#include <linux/list.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/quotaops.h>
+#include <linux/security.h>
+
+/* address space operations */
+
+/**
+ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
+ * @page: page to be dirtied
+ *
+ * Operation of struct address_space_operations. This implementation is used by
+ * unix and cryptcompress file plugins.
+ *
+ * This is called when a reiser4 page gets dirtied outside of reiser4, for
+ * example, when the dirty bit is moved from the pte to the physical page.
+ *
+ * Tags the page in the mapping's page tree with a special tag so that it is
+ * possible to do all the reiser4 specific work wrt dirty pages (jnode creation,
+ * capturing by an atom) later, because it can not be done in the contexts where
+ * set_page_dirty is called.
+ */
+int reiser4_set_page_dirty(struct page *page)
+{
+ /* this page can be unformatted only */
+ assert("vs-1734", (page->mapping &&
+ page->mapping->host &&
+ reiser4_get_super_fake(page->mapping->host->i_sb) !=
+ page->mapping->host
+ && reiser4_get_cc_fake(page->mapping->host->i_sb) !=
+ page->mapping->host
+ && reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
+ page->mapping->host));
+
+ if (!TestSetPageDirty(page)) {
+ struct address_space *mapping = page->mapping;
+
+ if (mapping) {
+ write_lock_irq(&mapping->tree_lock);
+
+ /* check for race with truncate */
+ if (page->mapping) {
+ assert("vs-1652", page->mapping == mapping);
+ if (mapping_cap_account_dirty(mapping))
+ inc_zone_page_state(page,
+ NR_FILE_DIRTY);
+ radix_tree_tag_set(&mapping->page_tree,
+ page->index,
+ PAGECACHE_TAG_REISER4_MOVED);
+ }
+ write_unlock_irq(&mapping->tree_lock);
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ }
+ }
+ return 0;
+}
+
+/* ->invalidatepage method for reiser4 */
+
+/*
+ * this is called for each truncated page from
+ * truncate_inode_pages()->truncate_{complete,partial}_page().
+ *
+ * At the moment of call, page is under lock, and outstanding io (if any) has
+ * completed.
+ */
+
+/**
+ * reiser4_invalidatepage
+ * @page: page to invalidate
+ * @offset: starting offset for partial invalidation
+ *
+ */
+void reiser4_invalidatepage(struct page *page, unsigned long offset)
+{
+ int ret = 0;
+ reiser4_context *ctx;
+ struct inode *inode;
+ jnode *node;
+
+ /*
+ * This is called to truncate a file's page.
+ *
+ * Originally, reiser4 implemented truncate in a standard way
+ * (vmtruncate() calls ->invalidatepage() on all truncated pages
+ * first, then file system ->truncate() call-back is invoked).
+ *
+ * This led to a problem when ->invalidatepage() was called on a
+ * page with a jnode that was captured into an atom in the
+ * ASTAGE_PRE_COMMIT stage. That is, truncate was bypassing
+ * transactions. To avoid this, a try_capture_page_to_invalidate()
+ * call was added here.
+ *
+ * After many troubles with vmtruncate() based truncate (including
+ * races with flush, tail conversion, etc.) it was re-written in the
+ * top-to-bottom style: items are killed in reiser4_cut_tree_object()
+ * and pages belonging to extent are invalidated in kill_hook_extent().
+ * So probably now additional call to capture is not needed here.
+ */
+
+ assert("nikita-3137", PageLocked(page));
+ assert("nikita-3138", !PageWriteback(page));
+ inode = page->mapping->host;
+
+ /*
+ * ->invalidatepage() should only be called for the unformatted
+ * jnodes. Destruction of all other types of jnodes is performed
+ * separately. But, during some corner cases (like handling errors
+ * during mount) it is simpler to let ->invalidatepage to be called on
+ * them. Check for this, and do nothing.
+ */
+ if (reiser4_get_super_fake(inode->i_sb) == inode)
+ return;
+ if (reiser4_get_cc_fake(inode->i_sb) == inode)
+ return;
+ if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
+ return;
+ assert("vs-1426", PagePrivate(page));
+ assert("vs-1427",
+ page->mapping == jnode_get_mapping(jnode_by_page(page)));
+ assert("", jprivate(page) != NULL);
+ assert("", ergo(inode_file_plugin(inode) !=
+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
+ offset == 0));
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return;
+
+ node = jprivate(page);
+ spin_lock_jnode(node);
+ if (!(node->state & ((1 << JNODE_DIRTY) | (1 << JNODE_FLUSH_QUEUED) |
+ (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
+ /* there is no need to capture */
+ jref(node);
+ JF_SET(node, JNODE_HEARD_BANSHEE);
+ page_clear_jnode(page, node);
+ reiser4_uncapture_jnode(node);
+ unhash_unformatted_jnode(node);
+ jput(node);
+ reiser4_exit_context(ctx);
+ return;
+ }
+ spin_unlock_jnode(node);
+
+ /* capture page being truncated. */
+ ret = try_capture_page_to_invalidate(page);
+ if (ret != 0)
+ warning("nikita-3141", "Cannot capture: %i", ret);
+
+ if (offset == 0) {
+ /* remove jnode from transaction and detach it from page. */
+ jref(node);
+ JF_SET(node, JNODE_HEARD_BANSHEE);
+ /* page cannot be detached from jnode concurrently, because it
+ * is locked */
+ reiser4_uncapture_page(page);
+
+ /* this detaches page from jnode, so that jdelete will not try
+ * to lock page which is already locked */
+ spin_lock_jnode(node);
+ page_clear_jnode(page, node);
+ spin_unlock_jnode(node);
+ unhash_unformatted_jnode(node);
+
+ jput(node);
+ }
+
+ reiser4_exit_context(ctx);
+}
+
+/* helper function called from reiser4_releasepage(). It returns true if the
+ * jnode can be detached from its page and the page released. */
+int jnode_is_releasable(jnode * node /* node to check */ )
+{
+ assert("nikita-2781", node != NULL);
+ assert_spin_locked(&(node->guard));
+ assert_spin_locked(&(node->load));
+
+ /* if some thread is currently using the jnode page, the latter
+ * cannot be detached */
+ if (atomic_read(&node->d_count) != 0) {
+ return 0;
+ }
+
+ assert("vs-1214", !jnode_is_loaded(node));
+
+ /*
+ * can only release page if real block number is assigned to it. Simple
+ * check for ->atom wouldn't do, because it is possible for a node to
+ * be clean, not in an atom yet, and still have a fake block number.
+ * For example, a node just created in jinit_new().
+ */
+ if (reiser4_blocknr_is_fake(jnode_get_block(node)))
+ return 0;
+
+ /*
+ * pages prepared for write can not be released anyway, so avoid
+ * detaching jnode from the page
+ */
+ if (JF_ISSET(node, JNODE_WRITE_PREPARED))
+ return 0;
+
+ /*
+ * dirty jnode cannot be released. It can however be submitted to disk
+ * as part of early flushing, but only after getting flush-prepped.
+ */
+ if (JF_ISSET(node, JNODE_DIRTY))
+ return 0;
+
+ /* overwrite set is only written by log writer. */
+ if (JF_ISSET(node, JNODE_OVRWR))
+ return 0;
+
+ /* jnode is already under writeback */
+ if (JF_ISSET(node, JNODE_WRITEBACK))
+ return 0;
+
+ /* don't flush bitmaps or journal records */
+ if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * ->releasepage method for reiser4
+ *
+ * This is called by VM scanner when it comes across clean page. What we have
+ * to do here is to check whether page can really be released (freed that is)
+ * and if so, detach jnode from it and remove page from the page cache.
+ *
+ * Check for releasability is done by releasable() function.
+ */
+int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
+{
+ jnode *node;
+
+ assert("nikita-2257", PagePrivate(page));
+ assert("nikita-2259", PageLocked(page));
+ assert("nikita-2892", !PageWriteback(page));
+ assert("nikita-3019", reiser4_schedulable());
+
+ /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
+ is not clear what to do in this case. A lot of deadlocks seem to be
+ possible. */
+
+ node = jnode_by_page(page);
+ assert("nikita-2258", node != NULL);
+ assert("reiser4-4", page->mapping != NULL);
+ assert("reiser4-5", page->mapping->host != NULL);
+
+ if (PageDirty(page))
+ return 0;
+
+ /* extra page reference is used by reiser4 to protect
+ * jnode<->page link from this ->releasepage(). */
+ if (page_count(page) > 3)
+ return 0;
+
+ /* releasable() needs jnode lock, because it looks at the jnode fields
+ * and we need jload_lock here to avoid races with jload(). */
+ spin_lock_jnode(node);
+ spin_lock(&(node->load));
+ if (jnode_is_releasable(node)) {
+ struct address_space *mapping;
+
+ mapping = page->mapping;
+ jref(node);
+ /* there is no need to synchronize against
+ * jnode_extent_write() here, because pages seen by
+ * jnode_extent_write() are !releasable(). */
+ page_clear_jnode(page, node);
+ spin_unlock(&(node->load));
+ spin_unlock_jnode(node);
+
+ /* we are under memory pressure so release jnode also. */
+ jput(node);
+
+ return 1;
+ } else {
+ spin_unlock(&(node->load));
+ spin_unlock_jnode(node);
+ assert("nikita-3020", reiser4_schedulable());
+ return 0;
+ }
+}
+
+int reiser4_readpage(struct file *file, struct page *page)
+{
+ assert("edward-1533", PageLocked(page));
+ assert("edward-1534", !PageUptodate(page));
+ assert("edward-1535", page->mapping && page->mapping->host);
+
+ return inode_file_plugin(page->mapping->host)->readpage(file, page);
+}
+
+int reiser4_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return inode_file_plugin(mapping->host)->readpages(file, mapping,
+ pages, nr_pages);
+}
+
+int reiser4_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ return inode_file_plugin(mapping->host)->writepages(mapping, wbc);
+}
+
+int reiser4_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ return inode_file_plugin(file->f_dentry->d_inode)->prepare_write(file,
+ page,
+ from,
+ to);
+}
+
+int reiser4_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ return inode_file_plugin(file->f_dentry->d_inode)->commit_write(file,
+ page,
+ from,
+ to);
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
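
Taken together, the functions above form reiser4's address_space_operations. A hedged sketch of how they could be wired into a 2.6.24-style aops table follows; the actual table definition lives in another file of this patch, so the struct below is illustrative only.

/* Sketch only: an aops table built from the methods defined above. */
static struct address_space_operations reiser4_as_operations_sketch = {
	.set_page_dirty = reiser4_set_page_dirty,
	.readpage = reiser4_readpage,
	.readpages = reiser4_readpages,
	.writepages = reiser4_writepages,
	.prepare_write = reiser4_prepare_write,
	.commit_write = reiser4_commit_write,
	.invalidatepage = reiser4_invalidatepage,
	.releasepage = reiser4_releasepage,
};
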
diff -urN linux-2.6.24.orig/fs/reiser4/block_alloc.c linux-2.6.24/fs/reiser4/block_alloc.c
--- linux-2.6.24.orig/fs/reiser4/block_alloc.c 1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.24/fs/reiser4/block_alloc.c 2008-01-25 11:39:06.888195324 +0300
@@ -0,0 +1,1137 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "debug.h"
+#include "dformat.h"
+#include "plugin/plugin.h"
+#include "txnmgr.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "super.h"
+
+#include <linux/types.h> /* for __u?? */
+#include <linux/fs.h> /* for struct super_block */
+#include <linux/spinlock.h>
+
+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
+
+/* We need to be able to reserve enough disk space to ensure that an atomic
+ operation will have enough disk space to flush (see flush.c and
+ http://namesys.com/v4/v4.html) and commit it once it is started.
+
+ In our design a call for reserving disk space may fail but not an actual
+ block allocation.
+
+ All free blocks, already allocated blocks, and all kinds of reserved blocks
+ are counted in different per-fs block counters.
+
+ A reiser4 super block's set of block counters currently is:
+
+ free -- free blocks,
+ used -- already allocated blocks,
+
+ grabbed -- initially reserved for performing an fs operation, those blocks
+ are taken from free blocks, then grabbed disk space leaks from the grabbed
+ blocks counter to other counters like "fake allocated", "flush
+ reserved", "used"; the rest of the unused grabbed space is returned to
+ free space at the end of the fs operation;
+
+ fake allocated -- counts all nodes without real disk block numbers assigned,
+ we have separate accounting for formatted and unformatted
+ nodes (for easier debugging);
+
+ flush reserved -- disk space needed for flushing and committing an atom.
+ Each dirty already allocated block could be written as a
+ part of the atom's overwrite set or as a part of the atom's
+ relocate set. In both cases one additional block is needed;
+ it is used as a wandered block if we do overwrite, or as a
+ new location for a relocated block.
+
+ In addition, blocks in some states are counted on per-thread and per-atom
+ basis. A reiser4 context has a counter of blocks grabbed by this transaction
+ and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
+ of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
+ blocks, which are reserved for flush processing and atom commit. */
+
+/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate
+ number of blocks to grab for most expensive case of balancing when the leaf
+ node we insert new item to gets split and new leaf node is allocated.
+
+ So, we need to grab blocks for
+
+ 1) one block for possible dirtying the node we insert an item to. That block
+ would be used for node relocation at flush time or for allocating of a
+ wandered one, it depends what will be a result (what set, relocate or
+ overwrite the node gets assigned to) of the node processing by the flush
+ algorithm.
+
+ 2) one block for either allocating a new node, or dirtying of right or left
+ clean neighbor, only one case may happen.
+
+ VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
+ node, and creation of new node. have I forgotten something? email me.
+
+ These grabbed blocks are counted in both the reiser4 context "grabbed blocks"
+ counter and the fs-wide one (both ctx->grabbed_blocks and
+ sbinfo->blocks_grabbed get incremented by 2), while the sb's free blocks
+ counter is decremented by 2.
+
+ Suppose both blocks were spent on dirtying an already allocated clean
+ node (one block went from "grabbed" to "flush reserved") and on new block
+ allocation (one block went from "grabbed" to "fake allocated formatted").
+
+ Inserting a child pointer into the parent node caused the parent node to be
+ split; the balancing code takes care of this by grabbing the necessary space
+ immediately, calling reiser4_grab with the BA_RESERVED flag set, which means
+ "can use the 5% reserved disk space".
+
+ At this moment insertion completes and grabbed blocks (if they were not used)
+ should be returned to the free space counter.
+
+ However the atom life-cycle is not completed. The atom had one "flush
+ reserved" block added by our insertion and the new fake allocated node is
+ counted as a "fake allocated formatted" one. The atom has to be fully
+ processed by flush before commit. Suppose that the flush moved the first,
+ already allocated node to the atom's overwrite list, the new fake allocated
+ node, obviously, went into the atom relocate set. The reiser4 flush
+ allocates the new node using one unit from "fake allocated formatted"
+ counter, the log writer uses one from "flush reserved" for wandered block
+ allocation.
+
+ And, it is not the end. When the wandered block is deallocated after the
+ atom gets fully played (see wander.c for term description), the disk space
+ occupied for it is returned to free blocks. */
+
+/* BLOCK NUMBERS */
+
+/* Any reiser4 node has a block number assigned to it. We use these numbers for
+ indexing in hash tables, so if a block has not yet been assigned a location
+ on disk we need to give it a temporary fake block number.
+
+ Current implementation of reiser4 uses 64-bit integers for block numbers. We
+ use highest bit in 64-bit block number to distinguish fake and real block
+ numbers. So, only 63 bits may be used for addressing real device
+ blocks. The "fake" block number space is divided into subspaces of fake
+ block numbers for data blocks and for shadow (working) bitmap blocks.
+
+ Fake block numbers for data blocks are generated by a cyclic counter, which
+ gets incremented after each real block allocation. We assume that it is
+ impossible to overload this counter during one transaction life. */
+
+/* Initialize a blocknr hint. */
+void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
+{
+ memset(hint, 0, sizeof(reiser4_blocknr_hint));
+}
+
+/* Release any resources of a blocknr hint. */
+void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
+{
+ /* No resources should be freed in current blocknr_hint implementation. */
+}
+
+/* see above for explanation of fake block number. */
+/* Audited by: green(2002.06.11) */
+int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
+{
+ /* The reason for not simply returning result of '&' operation is that
+ while return value is (possibly 32bit) int, the reiser4_block_nr is
+ at least 64 bits long, and high bit (which is the only possible
+ non zero bit after the masking) would be stripped off */
+ return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
+}
+
+/* Static functions for <reiser4 super block>/<reiser4 context> block counters
+ arithmetic. Mostly, they are isolated so as not to repeat the same assertions
+ in several places. */
+static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
+{
+ BUG_ON(ctx->grabbed_blocks < count);
+ assert("zam-527", ctx->grabbed_blocks >= count);
+ ctx->grabbed_blocks -= count;
+}
+
+static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
+{
+ ctx->grabbed_blocks += count;
+}
+
+static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
+{
+ assert("zam-525", sbinfo->blocks_grabbed >= count);
+ sbinfo->blocks_grabbed -= count;
+}
+
+/* Decrease the counter of blocks reserved for flush in the super block. */
+static void
+sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
+{
+ assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
+ sbinfo->blocks_flush_reserved -= count;
+}
+
+static void
+sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
+ reiser4_ba_flags_t flags)
+{
+ if (flags & BA_FORMATTED) {
+ assert("zam-806", sbinfo->blocks_fake_allocated >= count);
+ sbinfo->blocks_fake_allocated -= count;
+ } else {
+ assert("zam-528",
+ sbinfo->blocks_fake_allocated_unformatted >= count);
+ sbinfo->blocks_fake_allocated_unformatted -= count;
+ }
+}
+
+static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
+{
+ assert("zam-530",
+ sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
+ sbinfo->blocks_used -= count;
+}
+
+static void
+sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
+{
+ assert("edward-501", sbinfo->blocks_clustered >= count);
+ sbinfo->blocks_clustered -= count;
+}
+
+/* Increase the counter of blocks reserved for flush in the atom. */
+static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
+{
+ assert("zam-772", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+ atom->flush_reserved += count;
+}
+
+/* Decrease the counter of blocks reserved for flush in the atom. */
+static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
+{
+ assert("zam-774", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+ assert("nikita-2790", atom->flush_reserved >= count);
+ atom->flush_reserved -= count;
+}
+
+/* super block has 6 counters: free, used, grabbed, fake allocated
+ (formatted and unformatted) and flush reserved. Their sum must be
+ number of blocks on a device. This function checks this */
+int reiser4_check_block_counters(const struct super_block *super)
+{
+ __u64 sum;
+
+ sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
+ reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
+ reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
+ reiser4_clustered_blocks(super);
+ if (reiser4_block_count(super) != sum) {
+ printk("super block counters: "
+ "used %llu, free %llu, "
845+ "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), "
846+ "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
847+ (unsigned long long)reiser4_data_blocks(super),
848+ (unsigned long long)reiser4_free_blocks(super),
849+ (unsigned long long)reiser4_grabbed_blocks(super),
850+ (unsigned long long)reiser4_fake_allocated(super),
851+ (unsigned long long)
852+ reiser4_fake_allocated_unformatted(super),
853+ (unsigned long long)reiser4_flush_reserved(super),
854+ (unsigned long long)reiser4_clustered_blocks(super),
855+ (unsigned long long)sum,
856+ (unsigned long long)reiser4_block_count(super));
857+ return 0;
858+ }
859+ return 1;
860+}
861+
862+/* Adjust "working" free blocks counter for number of blocks we are going to
863+ allocate. Record number of grabbed blocks in fs-wide and per-thread
864+ counters. This function should be called before bitmap scanning or
865+ allocating fake block numbers
866+
867+ @super -- pointer to reiser4 super block;
868+ @count -- number of blocks we reserve;
869+
870+ @return -- 0 if success, -ENOSPC, if all
871+ free blocks are preserved or already allocated.
872+*/
873+
874+static int
875+reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
876+{
877+ __u64 free_blocks;
878+ int ret = 0, use_reserved = flags & BA_RESERVED;
879+ reiser4_super_info_data *sbinfo;
880+
881+ assert("vs-1276", ctx == get_current_context());
882+
883+ /* Do not grab anything on ro-mounted fs. */
884+ if (rofs_super(ctx->super)) {
885+ ctx->grab_enabled = 0;
886+ return 0;
887+ }
888+
889+ sbinfo = get_super_private(ctx->super);
890+
891+ spin_lock_reiser4_super(sbinfo);
892+
893+ free_blocks = sbinfo->blocks_free;
894+
895+ if ((use_reserved && free_blocks < count) ||
896+ (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
897+ ret = RETERR(-ENOSPC);
898+ goto unlock_and_ret;
899+ }
900+
901+ add_to_ctx_grabbed(ctx, count);
902+
903+ sbinfo->blocks_grabbed += count;
904+ sbinfo->blocks_free -= count;
905+
906+#if REISER4_DEBUG
907+ if (ctx->grabbed_initially == 0)
908+ ctx->grabbed_initially = count;
909+#endif
910+
911+ assert("nikita-2986", reiser4_check_block_counters(ctx->super));
912+
913+ /* disable grab space in current context */
914+ ctx->grab_enabled = 0;
915+
916+ unlock_and_ret:
917+ spin_unlock_reiser4_super(sbinfo);
918+
919+ return ret;
920+}
921+
922+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
923+{
924+ int ret;
925+ reiser4_context *ctx;
926+
927+ assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
928+ lock_stack_isclean(get_current_lock_stack
929+ ())));
930+ ctx = get_current_context();
931+ if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
932+ return 0;
933+ }
934+
935+ ret = reiser4_grab(ctx, count, flags);
936+ if (ret == -ENOSPC) {
937+
+ /* Try to commit all transactions if the BA_CAN_COMMIT flag is present */
+ if (flags & BA_CAN_COMMIT) {
+ txnmgr_force_commit_all(ctx->super, 0);
+ ctx->grab_enabled = 1;
+ ret = reiser4_grab(ctx, count, flags);
+ }
+ }
+ /*
+ * allocation from reserved pool cannot fail. This is severe error.
+ */
+ assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
+ return ret;
+}
+
+/*
+ * SPACE RESERVED FOR UNLINK/TRUNCATE
+ *
+ * Unlink and truncate require space in transaction (to update stat data, at
+ * least). But we don't want rm(1) to fail with "No space on device" error.
+ *
+ * Solution is to reserve 5% of disk space for truncates and
+ * unlinks. Specifically, normal space grabbing requests don't grab space from
+ * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
+ * drain it. Per super block delete mutex is used to allow only one
+ * thread at a time to grab from reserved area.
+ *
+ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
+ * flag.
+ *
+ */
+
+int reiser4_grab_reserved(struct super_block *super,
+ __u64 count, reiser4_ba_flags_t flags)
+{
+ reiser4_super_info_data *sbinfo = get_super_private(super);
+
+ assert("nikita-3175", flags & BA_CAN_COMMIT);
+
+ /* Check the delete mutex already taken by us, we assume that
+ * reading of machine word is atomic. */
+ if (sbinfo->delete_mutex_owner == current) {
+ if (reiser4_grab_space
+ (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
+ warning("zam-1003",
+ "nested call of grab_reserved fails count=(%llu)",
+ (unsigned long long)count);
+ reiser4_release_reserved(super);
+ return RETERR(-ENOSPC);
+ }
+ return 0;
+ }
+
+ if (reiser4_grab_space(count, flags)) {
+ mutex_lock(&sbinfo->delete_mutex);
+ assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
+ sbinfo->delete_mutex_owner = current;
+
+ if (reiser4_grab_space(count, flags | BA_RESERVED)) {
+ warning("zam-833",
+ "reserved space is not enough (%llu)",
+ (unsigned long long)count);
+ reiser4_release_reserved(super);
+ return RETERR(-ENOSPC);
+ }
+ }
+ return 0;
+}
+
+void reiser4_release_reserved(struct super_block *super)
+{
+ reiser4_super_info_data *info;
+
+ info = get_super_private(super);
+ if (info->delete_mutex_owner == current) {
+ info->delete_mutex_owner = NULL;
+ mutex_unlock(&info->delete_mutex);
+ }
+}
+
+static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+ sub_from_ctx_grabbed(ctx, count);
+
+ sbinfo = get_super_private(ctx->super);
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_sb_grabbed(sbinfo, count);
+ /* return sbinfo locked */
+ return sbinfo;
+}
+
+/* is called after @count fake block numbers are allocated and pointers to
+ those blocks are inserted into the tree. */
+static void grabbed2fake_allocated_formatted(void)
+{
+ reiser4_super_info_data *sbinfo;
+
+ sbinfo = grabbed2fake_allocated_head(1);
+ sbinfo->blocks_fake_allocated++;
+
+ assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/**
+ * grabbed2fake_allocated_unformatted
+ * @count:
+ *
+ */
+static void grabbed2fake_allocated_unformatted(int count)
+{
+ reiser4_super_info_data *sbinfo;
+
+ sbinfo = grabbed2fake_allocated_head(count);
+ sbinfo->blocks_fake_allocated_unformatted += count;
+
+ assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2cluster_reserved(int count)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+ sub_from_ctx_grabbed(ctx, count);
+
+ sbinfo = get_super_private(ctx->super);
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_sb_grabbed(sbinfo, count);
+ sbinfo->blocks_clustered += count;
+
+ assert("edward-504", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+void cluster_reserved2grabbed(int count)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+
+ sbinfo = get_super_private(ctx->super);
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_cluster_reserved(sbinfo, count);
+ sbinfo->blocks_grabbed += count;
+
+ assert("edward-505", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+ add_to_ctx_grabbed(ctx, count);
+}
+
+void cluster_reserved2free(int count)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ cluster_reserved2grabbed(count);
+ grabbed2free(ctx, sbinfo, count);
+}
+
+static DEFINE_SPINLOCK(fake_lock);
+static reiser4_block_nr fake_gen = 0;
+
+/**
+ * assign_fake_blocknr
+ * @blocknr:
+ * @count:
+ *
+ * Obtain a fake block number for new node which will be used to refer to
+ * this newly allocated node until real allocation is done.
+ */
+static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
+{
+ spin_lock(&fake_lock);
+ *blocknr = fake_gen;
+ fake_gen += count;
+ spin_unlock(&fake_lock);
+
+ BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
+ /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
+ *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
+ assert("zam-394", zlook(current_tree, blocknr) == NULL);
+}
+
+int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
+{
+ assign_fake_blocknr(blocknr, 1);
+ grabbed2fake_allocated_formatted();
+ return 0;
+}
+
+/**
+ * fake_blocknr_unformatted
+ * @count: number of fake numbers to get
+ *
+ * Allocates @count fake block numbers which will be assigned to jnodes
+ */
+reiser4_block_nr fake_blocknr_unformatted(int count)
+{
+ reiser4_block_nr blocknr;
+
+ assign_fake_blocknr(&blocknr, count);
+ grabbed2fake_allocated_unformatted(count);
+
+ return blocknr;
+}
+
+/* adjust sb block counters, if real (on-disk) block allocation immediately
+ follows grabbing of free disk space. */
+static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
+ __u64 count)
+{
+ sub_from_ctx_grabbed(ctx, count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_sb_grabbed(sbinfo, count);
+ sbinfo->blocks_used += count;
+
+ assert("nikita-2679", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/* adjust sb block counters when @count unallocated blocks get mapped to disk */
+static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
+ reiser4_ba_flags_t flags)
+{
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_sb_fake_allocated(sbinfo, count, flags);
+ sbinfo->blocks_used += count;
+
+ assert("nikita-2680",
+ reiser4_check_block_counters(reiser4_get_current_sb()));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+static void flush_reserved2used(txn_atom * atom, __u64 count)
+{
+ reiser4_super_info_data *sbinfo;
+
+ assert("zam-787", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+
+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
+
+ sbinfo = get_current_super_private();
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_sb_flush_reserved(sbinfo, count);
+ sbinfo->blocks_used += count;
+
+ assert("zam-789",
+ reiser4_check_block_counters(reiser4_get_current_sb()));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/* update the per fs blocknr hint default value. */
+void
+update_blocknr_hint_default(const struct super_block *s,
+ const reiser4_block_nr * block)
+{
+ reiser4_super_info_data *sbinfo = get_super_private(s);
+
+ assert("nikita-3342", !reiser4_blocknr_is_fake(block));
+
+ spin_lock_reiser4_super(sbinfo);
+ if (*block < sbinfo->block_count) {
+ sbinfo->blocknr_hint_default = *block;
+ } else {
+ warning("zam-676",
+ "block number %llu is too large to be used in a blocknr hint\n",
+ (unsigned long long)*block);
+ dump_stack();
+ DEBUGON(1);
+ }
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/* get current value of the default blocknr hint. */
+void get_blocknr_hint_default(reiser4_block_nr * result)
+{
+ reiser4_super_info_data *sbinfo = get_current_super_private();
+
+ spin_lock_reiser4_super(sbinfo);
+ *result = sbinfo->blocknr_hint_default;
+ assert("zam-677", *result < sbinfo->block_count);
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/* Allocate "real" disk blocks by calling a proper space allocation plugin
+ * method. Blocks are allocated in one contiguous disk region. The plugin
+ * independent part accounts blocks by subtracting the allocated amount from the
+ * grabbed or fake block counter and adding the same amount to the counter of allocated
+ * blocks.
+ *
+ * @hint -- a reiser4 blocknr hint object which contains further block
+ * allocation hints and parameters (search start, a stage of block
+ * which will be mapped to disk, etc.),
+ * @blk -- an out parameter for the beginning of the allocated region,
+ * @len -- in/out parameter, it should contain the maximum number of allocated
+ * blocks, after block allocation completes, it contains the length of
+ * allocated disk region.
+ * @flags -- see reiser4_ba_flags_t description.
+ *
+ * @return -- 0 if success, error code otherwise.
+ */
+int
+reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
+ reiser4_block_nr * len, reiser4_ba_flags_t flags)
+{
+ __u64 needed = *len;
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+ int ret;
+
+ assert("zam-986", hint != NULL);
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ /* For write-optimized data we use default search start value, which is
+ * close to last write location. */
+ if (flags & BA_USE_DEFAULT_SEARCH_START) {
+ get_blocknr_hint_default(&hint->blk);
+ }
+
+ /* VITALY: allocator should grab this for internal/tx-lists/similar only. */
+/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
+ if (hint->block_stage == BLOCK_NOT_COUNTED) {
+ ret = reiser4_grab_space_force(*len, flags);
+ if (ret != 0)
+ return ret;
+ }
+
+ ret =
+ sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
+ hint, (int)needed, blk, len);
+
+ if (!ret) {
+ assert("zam-680", *blk < reiser4_block_count(ctx->super));
+ assert("zam-681",
+ *blk + *len <= reiser4_block_count(ctx->super));
+
+ if (flags & BA_PERMANENT) {
+ /* we assume that current atom exists at this moment */
+ txn_atom *atom = get_current_atom_locked();
+ atom->nr_blocks_allocated += *len;
+ spin_unlock_atom(atom);
+ }
+
+ switch (hint->block_stage) {
+ case BLOCK_NOT_COUNTED:
+ case BLOCK_GRABBED:
+ grabbed2used(ctx, sbinfo, *len);
+ break;
+ case BLOCK_UNALLOCATED:
+ fake_allocated2used(sbinfo, *len, flags);
+ break;
+ case BLOCK_FLUSH_RESERVED:
+ {
+ txn_atom *atom = get_current_atom_locked();
+ flush_reserved2used(atom, *len);
+ spin_unlock_atom(atom);
+ }
+ break;
+ default:
+ impossible("zam-531", "wrong block stage");
+ }
+ } else {
+ assert("zam-821",
+ ergo(hint->max_dist == 0
+ && !hint->backward, ret != -ENOSPC));
+ if (hint->block_stage == BLOCK_NOT_COUNTED)
+ grabbed2free(ctx, sbinfo, needed);
+ }
+
+ return ret;
+}
+
+/* used -> fake_allocated -> grabbed -> free */
+
+/* adjust sb block counters when @count unallocated blocks get unmapped from
+ disk */
+static void
+used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
+ int formatted)
+{
+ spin_lock_reiser4_super(sbinfo);
+
+ if (formatted)
+ sbinfo->blocks_fake_allocated += count;
+ else
+ sbinfo->blocks_fake_allocated_unformatted += count;
+
+ sub_from_sb_used(sbinfo, count);
+
+ assert("nikita-2681",
+ reiser4_check_block_counters(reiser4_get_current_sb()));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+static void
+used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
+ __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
+{
+ assert("nikita-2791", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+
+ add_to_atom_flush_reserved_nolock(atom, (__u32) count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ sbinfo->blocks_flush_reserved += count;
+ /*add_to_sb_flush_reserved(sbinfo, count); */
+ sub_from_sb_used(sbinfo, count);
+
+ assert("nikita-2681",
+ reiser4_check_block_counters(reiser4_get_current_sb()));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/* disk space, virtually used by fake block numbers is counted as "grabbed" again. */
+static void
+fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
+ __u64 count, reiser4_ba_flags_t flags)
+{
+ add_to_ctx_grabbed(ctx, count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ assert("nikita-2682", reiser4_check_block_counters(ctx->super));
+
+ sbinfo->blocks_grabbed += count;
+ sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
+
+ assert("nikita-2683", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ fake_allocated2grabbed(ctx, sbinfo, count, flags);
+ grabbed2free(ctx, sbinfo, count);
+}
+
+void grabbed2free_mark(__u64 mark)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ assert("nikita-3007", (__s64) mark >= 0);
+ assert("nikita-3006", ctx->grabbed_blocks >= mark);
+ grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
+}
+
+/**
+ * grabbed2free - adjust grabbed and free block counters
+ * @ctx: context to update grabbed block counter of
+ * @sbinfo: super block to update grabbed and free block counters of
+ * @count: number of blocks to adjust counters by
+ *
+ * Decreases context's and per filesystem's counters of grabbed
+ * blocks. Increases per filesystem's counter of free blocks.
+ */
+void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
+ __u64 count)
+{
+ sub_from_ctx_grabbed(ctx, count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_sb_grabbed(sbinfo, count);
+ sbinfo->blocks_free += count;
+ assert("nikita-2684", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ assert("vs-1095", atom);
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ sub_from_ctx_grabbed(ctx, count);
+
+ add_to_atom_flush_reserved_nolock(atom, count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ sbinfo->blocks_flush_reserved += count;
+ sub_from_sb_grabbed(sbinfo, count);
+
+ assert("vpf-292", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2flush_reserved(__u64 count)
+{
+ txn_atom *atom = get_current_atom_locked();
+
+ grabbed2flush_reserved_nolock(atom, count);
+
+ spin_unlock_atom(atom);
+}
+
+void flush_reserved2grabbed(txn_atom * atom, __u64 count)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ assert("nikita-2788", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ add_to_ctx_grabbed(ctx, count);
+
+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ sbinfo->blocks_grabbed += count;
+ sub_from_sb_flush_reserved(sbinfo, count);
+
+ assert("vpf-292", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/**
+ * all_grabbed2free - releases all blocks grabbed in context
+ *
+ * Decreases context's and super block's grabbed block counters by number of
+ * blocks grabbed by current context and increases super block's free block
+ * counter correspondingly.
+ */
+void all_grabbed2free(void)
+{
+ reiser4_context *ctx = get_current_context();
+
+ grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
+}
+
+/* adjust sb block counters if real (on-disk) blocks do not become unallocated
+ after freeing; @count blocks become "grabbed". */
+static void
+used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
+ __u64 count)
+{
+ add_to_ctx_grabbed(ctx, count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ sbinfo->blocks_grabbed += count;
+ sub_from_sb_used(sbinfo, count);
+
+ assert("nikita-2685", reiser4_check_block_counters(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/* this used to be done through used2grabbed and grabbed2free */
+static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
+{
+ spin_lock_reiser4_super(sbinfo);
+
+ sbinfo->blocks_free += count;
+ sub_from_sb_used(sbinfo, count);
+
+ assert("nikita-2685",
+ reiser4_check_block_counters(reiser4_get_current_sb()));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+#if REISER4_DEBUG
+
+/* check "allocated" state of given block range */
+static void
+reiser4_check_blocks(const reiser4_block_nr * start,
+ const reiser4_block_nr * len, int desired)
+{
+ sa_check_blocks(start, len, desired);
+}
+
+/* check "allocated" state of given block */
+void reiser4_check_block(const reiser4_block_nr * block, int desired)
+{
+ const reiser4_block_nr one = 1;
+
+ reiser4_check_blocks(block, &one, desired);
+}
+
+#endif
+
+/* The block deallocation function may do an actual deallocation through the
+ space plugin or store deleted block numbers in the atom's delete_set data
+ structure, depending on the @defer parameter. */
+
+/* if the BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
+ will be deleted from the WORKING bitmap. They might be just unmapped from disk, or
+ freed with the disk space still grabbed by the current thread, or these blocks must
+ not be counted in any reiser4 sb block counters; see the block_stage_t comment */
+
+/* The BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
+ distinguish blocks allocated for unformatted and formatted nodes */
+
+int
+reiser4_dealloc_blocks(const reiser4_block_nr * start,
+ const reiser4_block_nr * len,
+ block_stage_t target_stage, reiser4_ba_flags_t flags)
+{
+ txn_atom *atom = NULL;
+ int ret;
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ if (REISER4_DEBUG) {
+ assert("zam-431", *len != 0);
+ assert("zam-432", *start != 0);
+ assert("zam-558", !reiser4_blocknr_is_fake(start));
+
+ spin_lock_reiser4_super(sbinfo);
+ assert("zam-562", *start < sbinfo->block_count);
+ spin_unlock_reiser4_super(sbinfo);
+ }
+
+ if (flags & BA_DEFER) {
+ blocknr_set_entry *bsep = NULL;
+
+ /* storing deleted block numbers in a blocknr set
+ datastructure for further actual deletion */
+ do {
+ atom = get_current_atom_locked();
+ assert("zam-430", atom != NULL);
+
+ ret =
+ blocknr_set_add_extent(atom, &atom->delete_set,
+ &bsep, start, len);
+
+ if (ret == -ENOMEM)
+ return ret;
+
+ /* This loop might spin at most two times */
+ } while (ret == -E_REPEAT);
+
+ assert("zam-477", ret == 0);
+ assert("zam-433", atom != NULL);
+
+ spin_unlock_atom(atom);
+
+ } else {
+ assert("zam-425", get_current_super_private() != NULL);
+ sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
+ *start, *len);
+
+ if (flags & BA_PERMANENT) {
+ /* These blocks were counted as allocated; we have to
+ * revert that if the allocation is discarded. */
+ txn_atom *atom = get_current_atom_locked();
+ atom->nr_blocks_allocated -= *len;
+ spin_unlock_atom(atom);
+ }
+
+ switch (target_stage) {
+ case BLOCK_NOT_COUNTED:
+ assert("vs-960", flags & BA_FORMATTED);
+ /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
+ used2free(sbinfo, *len);
+ break;
+
+ case BLOCK_GRABBED:
+ used2grabbed(ctx, sbinfo, *len);
+ break;
+
+ case BLOCK_UNALLOCATED:
+ used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
+ break;
+
+ case BLOCK_FLUSH_RESERVED:{
+ txn_atom *atom;
+
+ atom = get_current_atom_locked();
+ used2flush_reserved(sbinfo, atom, *len,
+ flags & BA_FORMATTED);
+ spin_unlock_atom(atom);
+ break;
+ }
+ default:
+ impossible("zam-532", "wrong block stage");
+ }
+ }
+
+ return 0;
+}
+
+/* wrappers for block allocator plugin methods */
+int reiser4_pre_commit_hook(void)
+{
+ assert("zam-502", get_current_super_private() != NULL);
+ sa_pre_commit_hook();
+ return 0;
+}
+
+/* an actor which applies delete set to block allocator data */
+static int
+apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
+ const reiser4_block_nr * b, void *data UNUSED_ARG)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ __u64 len = 1;
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
+ assert("zam-552", sbinfo != NULL);
+
+ if (b != NULL)
+ len = *b;
+
+ if (REISER4_DEBUG) {
+ spin_lock_reiser4_super(sbinfo);
+
+ assert("zam-554", *a < reiser4_block_count(ctx->super));
+ assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
+
+ spin_unlock_reiser4_super(sbinfo);
+ }
+
+ sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
+ /* adjust sb block counters */
+ used2free(sbinfo, len);
+ return 0;
+}
+
+void reiser4_post_commit_hook(void)
+{
+ txn_atom *atom;
+
+ atom = get_current_atom_locked();
+ assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
+ spin_unlock_atom(atom);
+
+ /* do the block deallocation which was deferred
+ until commit is done */
+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
+
+ assert("zam-504", get_current_super_private() != NULL);
+ sa_post_commit_hook();
+}
+
+void reiser4_post_write_back_hook(void)
+{
+ assert("zam-504", get_current_super_private() != NULL);
+
+ sa_post_commit_hook();
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
1751diff -urN linux-2.6.24.orig/fs/reiser4/block_alloc.h linux-2.6.24/fs/reiser4/block_alloc.h
1752--- linux-2.6.24.orig/fs/reiser4/block_alloc.h 1970-01-01 03:00:00.000000000 +0300
1753+++ linux-2.6.24/fs/reiser4/block_alloc.h 2008-01-25 11:39:06.888195324 +0300
1754@@ -0,0 +1,175 @@
1755+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1756+
1757+#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
1758+#define __FS_REISER4_BLOCK_ALLOC_H__
1759+
1760+#include "dformat.h"
1761+#include "forward.h"
1762+
1763+#include <linux/types.h> /* for __u?? */
1764+#include <linux/fs.h>
1765+
1766+/* Mask which, when applied to a given block number, shows whether that block number is a fake one */
1767+#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
1768+/* Mask which isolates the type of object this fake block number was assigned to */
1769+#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
1770+
1771+/* the result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
1772+ against these two values to determine whether the object is unallocated or a
1773+ bitmap shadow object (a WORKING BITMAP block, see plugin/space/bitmap.c) */
1774+#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
1775+#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
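
As a reading aid (a sketch, not part of the patch), the masks above classify
a block number like this; reiser4_blocknr_is_fake(), declared below, is the
predicate this file actually exports:

	static inline int blocknr_is_unallocated(const reiser4_block_nr *blk)
	{
		/* top two bits 11: fake blocknr of an unallocated node */
		return (*blk & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
		       REISER4_UNALLOCATED_STATUS_VALUE;
	}

	static inline int blocknr_is_bitmap_shadow(const reiser4_block_nr *blk)
	{
		/* top two bits 10: shadow of a WORKING bitmap block */
		return (*blk & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
		       REISER4_BITMAP_BLOCKS_STATUS_VALUE;
	}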
1776+
1777+/* specifies how a block allocation was counted in sb block counters */
1778+typedef enum {
1779+ BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
1780+ BLOCK_GRABBED = 1, /* free space grabbed for further allocation
1781+ of this block */
1782+ BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
1783+ BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object
1784+ ( unallocated formatted or unformatted
1785+ node) */
1786+ BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
1787+ number assigned */
1788+} block_stage_t;
1789+
1790+/* a hint for block allocator */
1791+struct reiser4_blocknr_hint {
1792+ /* FIXME: I think we want to add a longterm lock on the bitmap block here. This
1793+ is to prevent jnode_flush() calls from interleaving allocations on the same
1794+ bitmap, once a hint is established. */
1795+
1796+ /* search start hint */
1797+ reiser4_block_nr blk;
1798+ /* if not zero, it is a region size we search for free blocks in */
1799+ reiser4_block_nr max_dist;
1800+ /* level for allocation; it may be useful to have branch-level and
1801+ higher levels write-optimized. */
1802+ tree_level level;
1803+ /* block allocator assumes that blocks, which will be mapped to disk,
1804+ are in this specified block_stage */
1805+ block_stage_t block_stage;
1806+ /* If direction = 1 allocate blocks in backward direction from the end
1807+ * of disk to the beginning of disk. */
1808+ unsigned int backward:1;
1809+
1810+};
1811+
1812+/* These flags control block allocation/deallocation behavior */
1813+enum reiser4_ba_flags {
1814+ /* do allocations from the reserved (5%) area */
1815+ BA_RESERVED = (1 << 0),
1816+
1817+ /* block allocator can do commit trying to recover free space */
1818+ BA_CAN_COMMIT = (1 << 1),
1819+
1820+ /* set if the operation will be applied to a formatted block */
1821+ BA_FORMATTED = (1 << 2),
1822+
1823+ /* defer actual block freeing until transaction commit */
1824+ BA_DEFER = (1 << 3),
1825+
1826+ /* allocate blocks for permanent fs objects (formatted or unformatted), not
1827+ wandered or log blocks */
1828+ BA_PERMANENT = (1 << 4),
1829+
1830+ /* grab space even if it was disabled */
1831+ BA_FORCE = (1 << 5),
1832+
1833+ /* use default start value for free blocks search. */
1834+ BA_USE_DEFAULT_SEARCH_START = (1 << 6)
1835+};
1836+
1837+typedef enum reiser4_ba_flags reiser4_ba_flags_t;
1838+
1839+extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
1840+extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
1841+extern void update_blocknr_hint_default(const struct super_block *,
1842+ const reiser4_block_nr *);
1843+extern void get_blocknr_hint_default(reiser4_block_nr *);
1844+
1845+extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
1846+
1847+int assign_fake_blocknr_formatted(reiser4_block_nr *);
1848+reiser4_block_nr fake_blocknr_unformatted(int);
1849+
1850+/* free -> grabbed -> fake_allocated -> used */
1851+
1852+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
1853+void all_grabbed2free(void);
1854+void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
1855+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
1856+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
1857+void grabbed2flush_reserved(__u64 count);
1858+int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
1859+ reiser4_block_nr * start,
1860+ reiser4_block_nr * len, reiser4_ba_flags_t flags);
1861+int reiser4_dealloc_blocks(const reiser4_block_nr *,
1862+ const reiser4_block_nr *,
1863+ block_stage_t, reiser4_ba_flags_t flags);
1864+
1865+static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
1866+ reiser4_block_nr * start,
1867+ reiser4_ba_flags_t flags)
1868+{
1869+ reiser4_block_nr one = 1;
1870+ return reiser4_alloc_blocks(hint, start, &one, flags);
1871+}
1872+
1873+static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
1874+ block_stage_t stage,
1875+ reiser4_ba_flags_t flags)
1876+{
1877+ const reiser4_block_nr one = 1;
1878+ return reiser4_dealloc_blocks(block, &one, stage, flags);
1879+}
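
A hedged example of how the single-block wrappers combine with the hint and
flag machinery above (illustration only: error handling is trimmed, the
reiser4_blocknr_hint typedef comes from forward.h, and undo_needed stands in
for whatever later step might fail):

	reiser4_blocknr_hint hint;
	reiser4_block_nr blk;
	int ret;

	reiser4_blocknr_hint_init(&hint);
	hint.block_stage = BLOCK_GRABBED;	/* space was grabbed beforehand */
	ret = reiser4_alloc_block(&hint, &blk, BA_FORMATTED | BA_PERMANENT);
	reiser4_blocknr_hint_done(&hint);

	if (ret == 0 && undo_needed)
		/* the block is now "used"; hand it back to the grabbed stage */
		reiser4_dealloc_block(&blk, BLOCK_GRABBED, BA_FORMATTED);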
1880+
1881+#define reiser4_grab_space_force(count, flags) \
1882+ reiser4_grab_space(count, flags | BA_FORCE)
1883+
1884+extern void grabbed2free_mark(__u64 mark);
1885+extern int reiser4_grab_reserved(struct super_block *,
1886+ __u64, reiser4_ba_flags_t);
1887+extern void reiser4_release_reserved(struct super_block *super);
1888+
1889+/* grabbed -> fake_allocated */
1890+
1891+/* fake_allocated -> used */
1892+
1893+/* used -> fake_allocated -> grabbed -> free */
1894+
1895+extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
1896+
1897+extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
1898+
1899+extern void grabbed2cluster_reserved(int count);
1900+extern void cluster_reserved2grabbed(int count);
1901+extern void cluster_reserved2free(int count);
1902+
1903+extern int reiser4_check_block_counters(const struct super_block *);
1904+
1905+#if REISER4_DEBUG
1906+
1907+extern void reiser4_check_block(const reiser4_block_nr *, int);
1908+
1909+#else
1910+
1911+# define reiser4_check_block(beg, val) noop
1912+
1913+#endif
1914+
1915+extern int reiser4_pre_commit_hook(void);
1916+extern void reiser4_post_commit_hook(void);
1917+extern void reiser4_post_write_back_hook(void);
1918+
1919+#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
1920+
1921+/* Make Linus happy.
1922+ Local variables:
1923+ c-indentation-style: "K&R"
1924+ mode-name: "LC"
1925+ c-basic-offset: 8
1926+ tab-width: 8
1927+ fill-column: 120
1928+ End:
1929+*/
1930diff -urN linux-2.6.24.orig/fs/reiser4/blocknrset.c linux-2.6.24/fs/reiser4/blocknrset.c
1931--- linux-2.6.24.orig/fs/reiser4/blocknrset.c 1970-01-01 03:00:00.000000000 +0300
1932+++ linux-2.6.24/fs/reiser4/blocknrset.c 2008-01-25 11:39:06.892196354 +0300
1933@@ -0,0 +1,368 @@
1934+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1935+
1936+/* This file contains code for various block number sets used by the atom to
1937+ track the deleted set and wandered block mappings. */
1938+
1939+#include "debug.h"
1940+#include "dformat.h"
1941+#include "txnmgr.h"
1942+#include "context.h"
1943+
1944+#include <linux/slab.h>
1945+
1946+/* The data structure for storing unordered block number sets is a list of
1947+   elements, each of which contains an array of single block numbers and/or
1948+   an array of block number pairs. Such an element, called a blocknr_set_entry,
1949+   stores single block numbers from the beginning and pairs (extents) from
1950+   the end of its data field (char data[...]). The ->nr_singles and
1951+   ->nr_pairs fields count the numbers of blocks and extents.
1952+
1953+	 +------------------- blocknr_set_entry->data -------------------+
1954+	 |block1|block2| ... <free space> ... |pair3|pair2|pair1|
1955+	 +----------------------------------------------------------------+
1956+
1957+   When the current blocknr_set_entry is full, a new one is allocated. */
1958+
1959+/* Usage examples: blocknr sets are used in reiser4 for storing the atom's
1960+ * delete set (single blocks and block extents); in that case a blocknr pair
1961+ * represents an extent. The atom's wandered map is also stored as a blocknr
1962+ * set; its pairs represent a (real block) -> (wandered block) mapping. */
1963+
1964+/* Protection: blocknr sets belong to reiser4 atom, and
1965+ * their modifications are performed with the atom lock held */
1966+
1967+/* The total size of a blocknr_set_entry. */
1968+#define BLOCKNR_SET_ENTRY_SIZE 128
1969+
1970+/* The number of block numbers that fit in the blocknr data area. */
1971+#define BLOCKNR_SET_ENTRIES_NUMBER \
1972+ ((BLOCKNR_SET_ENTRY_SIZE - \
1973+ 2 * sizeof (unsigned) - \
1974+ sizeof(struct list_head)) / \
1975+ sizeof(reiser4_block_nr))
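
For concreteness: assuming an LP64 kernel with an 8-byte reiser4_block_nr,
4-byte unsigned and 16-byte struct list_head, this works out to
(128 - 2*4 - 16) / 8 = 13 slots per entry, i.e. up to 13 single block
numbers, or 6 pairs plus one single, in each 128-byte blocknr_set_entry.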
1976+
1977+/* An entry of the blocknr_set */
1978+struct blocknr_set_entry {
1979+ unsigned nr_singles;
1980+ unsigned nr_pairs;
1981+ struct list_head link;
1982+ reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
1983+};
1984+
1985+/* A pair of blocks as recorded in the blocknr_set_entry data. */
1986+struct blocknr_pair {
1987+ reiser4_block_nr a;
1988+ reiser4_block_nr b;
1989+};
1990+
1991+/* Return the number of blocknr slots available in a blocknr_set_entry. */
1992+/* Audited by: green(2002.06.11) */
1993+static unsigned bse_avail(blocknr_set_entry * bse)
1994+{
1995+ unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
1996+
1997+ assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
1998+ cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
1999+
2000+ return BLOCKNR_SET_ENTRIES_NUMBER - used;
2001+}
2002+
2003+/* Initialize a blocknr_set_entry. */
2004+static void bse_init(blocknr_set_entry *bse)
2005+{
2006+ bse->nr_singles = 0;
2007+ bse->nr_pairs = 0;
2008+ INIT_LIST_HEAD(&bse->link);
2009+}
2010+
2011+/* Allocate and initialize a blocknr_set_entry. */
2012+/* Audited by: green(2002.06.11) */
2013+static blocknr_set_entry *bse_alloc(void)
2014+{
2015+ blocknr_set_entry *e;
2016+
2017+ if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
2018+ reiser4_ctx_gfp_mask_get())) == NULL)
2019+ return NULL;
2020+
2021+ bse_init(e);
2022+
2023+ return e;
2024+}
2025+
2026+/* Free a blocknr_set_entry. */
2027+/* Audited by: green(2002.06.11) */
2028+static void bse_free(blocknr_set_entry * bse)
2029+{
2030+ kfree(bse);
2031+}
2032+
2033+/* Add a block number to a blocknr_set_entry */
2034+/* Audited by: green(2002.06.11) */
2035+static void
2036+bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
2037+{
2038+ assert("jmacd-5099", bse_avail(bse) >= 1);
2039+
2040+ bse->entries[bse->nr_singles++] = *block;
2041+}
2042+
2043+/* Get a pair of block numbers */
2044+/* Audited by: green(2002.06.11) */
2045+static inline struct blocknr_pair *bse_get_pair(blocknr_set_entry * bse,
2046+ unsigned pno)
2047+{
2048+ assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2049+
2050+ return (struct blocknr_pair *) (bse->entries +
2051+ BLOCKNR_SET_ENTRIES_NUMBER -
2052+ 2 * (pno + 1));
2053+}
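
With the 13-slot layout computed above, pair 0 thus occupies entries[11..12],
pair 1 entries[9..10], and so on: pairs grow downward from the tail of the
array while singles grow upward from entries[0], meeting in the middle.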
2054+
2055+/* Add a pair of block numbers to a blocknr_set_entry */
2056+/* Audited by: green(2002.06.11) */
2057+static void
2058+bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2059+ const reiser4_block_nr * b)
2060+{
2061+ struct blocknr_pair *pair;
2062+
2063+ assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2064+
2065+ pair = bse_get_pair(bse, bse->nr_pairs++);
2066+
2067+ pair->a = *a;
2068+ pair->b = *b;
2069+}
2070+
2071+/* Add either a block or pair of blocks to the block number set. The first
2072+ blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
2073+ @b is non-NULL a pair is added. The block number set belongs to atom, and
2074+ the call is made with the atom lock held. There may not be enough space in
2075+ the current blocknr_set_entry. If new_bsep points to a non-NULL
2076+ blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2077+ will be set to NULL. If new_bsep contains NULL then the atom lock will be
2078+ released and a new bse will be allocated in new_bsep. E_REPEAT will be
2079+ returned with the atom unlocked for the operation to be tried again. If
2080+ the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
2081+ used during the call, it will be freed automatically. */
2082+static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
2083+ blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2084+ const reiser4_block_nr *b)
2085+{
2086+ blocknr_set_entry *bse;
2087+ unsigned entries_needed;
2088+
2089+ assert("jmacd-5101", a != NULL);
2090+
2091+ entries_needed = (b == NULL) ? 1 : 2;
2092+ if (list_empty(bset) ||
2093+ bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
2094+ /* See if a bse was previously allocated. */
2095+ if (*new_bsep == NULL) {
2096+ spin_unlock_atom(atom);
2097+ *new_bsep = bse_alloc();
2098+ return (*new_bsep != NULL) ? -E_REPEAT :
2099+ RETERR(-ENOMEM);
2100+ }
2101+
2102+ /* Put it on the head of the list. */
2103+ list_add(&((*new_bsep)->link), bset);
2104+
2105+ *new_bsep = NULL;
2106+ }
2107+
2108+ /* Add the single or pair. */
2109+ bse = list_entry(bset->next, blocknr_set_entry, link);
2110+ if (b == NULL) {
2111+ bse_put_single(bse, a);
2112+ } else {
2113+ bse_put_pair(bse, a, b);
2114+ }
2115+
2116+ /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2117+ if (*new_bsep != NULL) {
2118+ bse_free(*new_bsep);
2119+ *new_bsep = NULL;
2120+ }
2121+
2122+ return 0;
2123+}
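
The retry protocol above is easiest to read from the caller's side; compare
the loop in reiser4_dealloc_blocks(). A minimal sketch (assumes @start and
@len are set up by the caller):

	blocknr_set_entry *new_bsep = NULL;
	txn_atom *atom;
	int ret;

	do {
		atom = get_current_atom_locked();
		ret = blocknr_set_add_extent(atom, &atom->delete_set,
					     &new_bsep, &start, &len);
		/* -E_REPEAT: the atom lock was dropped so a new entry could
		   be kmalloc'ed into new_bsep; retake the lock and retry */
	} while (ret == -E_REPEAT);
	if (ret == 0)
		spin_unlock_atom(atom);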
2124+
2125+/* Add an extent to the block set. If the length is 1, it is treated as a
2126+ single block (e.g., reiser4_set_add_block). */
2127+/* Audited by: green(2002.06.11) */
2128+/* Auditor note: Entire call chain cannot hold any spinlocks, because
2129+ kmalloc might schedule. The only exception is atom spinlock, which is
2130+ properly freed. */
2131+int
2132+blocknr_set_add_extent(txn_atom * atom,
2133+ struct list_head * bset,
2134+ blocknr_set_entry ** new_bsep,
2135+ const reiser4_block_nr * start,
2136+ const reiser4_block_nr * len)
2137+{
2138+ assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2139+ return blocknr_set_add(atom, bset, new_bsep, start,
2140+ *len == 1 ? NULL : len);
2141+}
2142+
2143+/* Add a block pair to the block set. It adds exactly a pair, which is checked
2144+ * by an assertion that both arguments are not null.*/
2145+/* Audited by: green(2002.06.11) */
2146+/* Auditor note: Entire call chain cannot hold any spinlocks, because
2147+ kmalloc might schedule. The only exception is atom spinlock, which is
2148+ properly freed. */
2149+int
2150+blocknr_set_add_pair(txn_atom * atom,
2151+ struct list_head * bset,
2152+ blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2153+ const reiser4_block_nr * b)
2154+{
2155+ assert("jmacd-5103", a != NULL && b != NULL);
2156+ return blocknr_set_add(atom, bset, new_bsep, a, b);
2157+}
2158+
2159+/* Initialize a blocknr_set. */
2160+void blocknr_set_init(struct list_head *bset)
2161+{
2162+ INIT_LIST_HEAD(bset);
2163+}
2164+
2165+/* Release the entries of a blocknr_set. */
2166+void blocknr_set_destroy(struct list_head *bset)
2167+{
2168+ blocknr_set_entry *bse;
2169+
2170+ while (!list_empty(bset)) {
2171+ bse = list_entry(bset->next, blocknr_set_entry, link);
2172+ list_del_init(&bse->link);
2173+ bse_free(bse);
2174+ }
2175+}
2176+
2177+/* Merge blocknr_set entries out of @from into @into. */
2178+/* Audited by: green(2002.06.11) */
2179+/* Auditor comments: This merge does not know whether the merged sets contain
2180+   block pairs (as for wandered sets) or extents, so it cannot really merge
2181+   overlapping ranges if there are any. I believe this may lead to some
2182+   blocks being present several times in one blocknr_set. To help debug such
2183+   problems it might help to check for duplicate entries during actual
2184+   processing of this set. Testing this kind of thing right here is also
2185+   complicated by the fact that these sets are not sorted, and going through
2186+   the whole set on each element addition would be a CPU-heavy task */
2187+void blocknr_set_merge(struct list_head * from, struct list_head * into)
2188+{
2189+ blocknr_set_entry *bse_into = NULL;
2190+
2191+ /* If @from is empty, no work to perform. */
2192+ if (list_empty(from))
2193+ return;
2194+ /* If @into is not empty, try merging partial-entries. */
2195+ if (!list_empty(into)) {
2196+
2197+ /* Neither set is empty; pop the front two members and try to combine them. */
2198+ blocknr_set_entry *bse_from;
2199+ unsigned into_avail;
2200+
2201+ bse_into = list_entry(into->next, blocknr_set_entry, link);
2202+ list_del_init(&bse_into->link);
2203+ bse_from = list_entry(from->next, blocknr_set_entry, link);
2204+ list_del_init(&bse_from->link);
2205+
2206+ /* Combine singles. */
2207+ for (into_avail = bse_avail(bse_into);
2208+ into_avail != 0 && bse_from->nr_singles != 0;
2209+ into_avail -= 1) {
2210+ bse_put_single(bse_into,
2211+ &bse_from->entries[--bse_from->
2212+ nr_singles]);
2213+ }
2214+
2215+ /* Combine pairs. */
2216+ for (; into_avail > 1 && bse_from->nr_pairs != 0;
2217+ into_avail -= 2) {
2218+ struct blocknr_pair *pair =
2219+ bse_get_pair(bse_from, --bse_from->nr_pairs);
2220+ bse_put_pair(bse_into, &pair->a, &pair->b);
2221+ }
2222+
2223+ /* If bse_from is empty, delete it now. */
2224+ if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2225+ bse_free(bse_from);
2226+ } else {
2227+ /* Otherwise, bse_into is full or nearly full (e.g.,
2228+ it could have one slot avail and bse_from has one
2229+ pair left). Push it back onto the list. bse_from
2230+ becomes bse_into, which will be the new partial. */
2231+ list_add(&bse_into->link, into);
2232+ bse_into = bse_from;
2233+ }
2234+ }
2235+
2236+ /* Splice lists together. */
2237+ list_splice_init(from, into->prev);
2238+
2239+ /* Add the partial entry back to the head of the list. */
2240+ if (bse_into != NULL)
2241+ list_add(&bse_into->link, into);
2242+}
2243+
2244+/* Iterate over all blocknr set elements. */
2245+int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
2246+ blocknr_set_actor_f actor, void *data, int delete)
2247+{
2248+
2249+ blocknr_set_entry *entry;
2250+
2251+ assert("zam-429", atom != NULL);
2252+ assert("zam-430", atom_is_protected(atom));
2253+ assert("zam-431", bset != 0);
2254+ assert("zam-432", actor != NULL);
2255+
2256+ entry = list_entry(bset->next, blocknr_set_entry, link);
2257+ while (bset != &entry->link) {
2258+ blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2259+ unsigned int i;
2260+ int ret;
2261+
2262+ for (i = 0; i < entry->nr_singles; i++) {
2263+ ret = actor(atom, &entry->entries[i], NULL, data);
2264+
2265+ /* We can't break out of the loop if the delete flag is set. */
2266+ if (ret != 0 && !delete)
2267+ return ret;
2268+ }
2269+
2270+ for (i = 0; i < entry->nr_pairs; i++) {
2271+ struct blocknr_pair *ab;
2272+
2273+ ab = bse_get_pair(entry, i);
2274+
2275+ ret = actor(atom, &ab->a, &ab->b, data);
2276+
2277+ if (ret != 0 && !delete)
2278+ return ret;
2279+ }
2280+
2281+ if (delete) {
2282+ list_del(&entry->link);
2283+ bse_free(entry);
2284+ }
2285+
2286+ entry = tmp;
2287+ }
2288+
2289+ return 0;
2290+}
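
apply_dset() in block_alloc.c is the canonical actor. As a simpler
illustration (a hypothetical helper, not part of the patch), an actor that
counts the blocks covered by a delete-set-style set, where a pair is
(start, length), could look like:

	static int count_blocks_actor(txn_atom *atom UNUSED_ARG,
				      const reiser4_block_nr *a UNUSED_ARG,
				      const reiser4_block_nr *b, void *data)
	{
		__u64 *total = data;

		/* singles arrive with b == NULL; extents carry their
		   length in *b (see blocknr_set_add_extent()) */
		*total += (b == NULL) ? 1 : *b;
		return 0;
	}

invoked as blocknr_set_iterator(atom, &atom->delete_set, count_blocks_actor,
&total, 0) with the atom lock held.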
2291+
2292+/*
2293+ * Local variables:
2294+ * c-indentation-style: "K&R"
2295+ * mode-name: "LC"
2296+ * c-basic-offset: 8
2297+ * tab-width: 8
2298+ * fill-column: 79
2299+ * scroll-step: 1
2300+ * End:
2301+ */
2302diff -urN linux-2.6.24.orig/fs/reiser4/carry.c linux-2.6.24/fs/reiser4/carry.c
2303--- linux-2.6.24.orig/fs/reiser4/carry.c 1970-01-01 03:00:00.000000000 +0300
2304+++ linux-2.6.24/fs/reiser4/carry.c 2008-01-25 11:39:06.896197385 +0300
2305@@ -0,0 +1,1391 @@
2306+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2307+/* Functions to "carry" tree modification(s) upward. */
2308+/* Tree is modified one level at a time. As we modify a level we accumulate a
2309+ set of changes that need to be propagated to the next level. We manage
2310+ node locking such that any searches that collide with carrying are
2311+ restarted, from the root if necessary.
2312+
2313+ Insertion of a new item may result in items being moved among nodes and
2314+ this requires the delimiting key to be updated at the least common parent
2315+ of the nodes modified to preserve search tree invariants. Also, insertion
2316+ may require allocation of a new node. A pointer to the new node has to be
2317+ inserted into some node on the parent level, etc.
2318+
2319+ Tree carrying is meant to be analogous to arithmetic carrying.
2320+
2321+ A carry operation is always associated with some node (&carry_node).
2322+
2323+ Carry process starts with some initial set of operations to be performed
2324+ and an initial set of already locked nodes. Operations are performed one
2325+ by one. Performing each single operation has following possible effects:
2326+
2327+ - content of carry node associated with operation is modified
2328+ - new carry nodes are locked and involved into carry process on this level
2329+ - new carry operations are posted to the next level
2330+
2331+   After all carry operations on this level are done, the process is repeated
2332+   for the accumulated sequence of carry operations for the next level. This
2333+   starts by trying to lock (in left to right order) all carry nodes
2334+   associated with carry operations on the parent level. After this, we decide
2335+   whether more nodes are required on the left of the already locked set. If
2336+   so, all locks taken on the parent level are released, new carry nodes are
2337+   added, and the locking process repeats.
2338+
2339+   It may happen that the balancing process fails owing to an unrecoverable
2340+   error on some of the upper levels of the tree (possible causes are an io
2341+   error, failure to allocate a new node, etc.). In this case we should
2342+   unmount the filesystem, rebooting if it is the root, and possibly advise the use of fsck.
2343+
2344+ USAGE:
2345+
2346+ int some_tree_operation( znode *node, ... )
2347+ {
2348+ // Allocate on a stack pool of carry objects: operations and nodes.
2349+ // Most carry processes will only take objects from here, without
2350+ // dynamic allocation.
2351+
2352+I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans
2353+
2354+ carry_pool pool;
2355+ carry_level lowest_level;
2356+ carry_op *op;
2357+
2358+ init_carry_pool( &pool );
2359+ init_carry_level( &lowest_level, &pool );
2360+
2361+ // operation may be one of:
2362+ // COP_INSERT --- insert new item into node
2363+ // COP_CUT --- remove part of or whole node
2364+ // COP_PASTE --- increase size of item
2365+ // COP_DELETE --- delete pointer from parent node
2366+ // COP_UPDATE --- update delimiting key in least
2367+ // common ancestor of two
2368+
2369+ op = reiser4_post_carry( &lowest_level, operation, node, 0 );
2370+ if( IS_ERR( op ) || ( op == NULL ) ) {
2371+ handle error
2372+ } else {
2373+ // fill in remaining fields in @op, according to carry.h:carry_op
2374+ result = carry( &lowest_level, NULL );
2375+ }
2376+ done_carry_pool( &pool );
2377+ }
2378+
2379+ When you are implementing node plugin method that participates in carry
2380+ (shifting, insertion, deletion, etc.), do the following:
2381+
2382+ int foo_node_method( znode *node, ..., carry_level *todo )
2383+ {
2384+ carry_op *op;
2385+
2386+ ....
2387+
2388+ // note, that last argument to reiser4_post_carry() is non-null
2389+ // here, because @op is to be applied to the parent of @node, rather
2390+ // than to the @node itself as in the previous case.
2391+
2392+ op = node_post_carry( todo, operation, node, 1 );
2393+ // fill in remaining fields in @op, according to carry.h:carry_op
2394+
2395+ ....
2396+
2397+ }
2398+
2399+ BATCHING:
2400+
2401+   One of the main advantages of the level-by-level balancing implemented here
2402+   is the ability to batch updates on a parent level and to perform them more
2403+   efficiently as a result.
2404+
2405+ Description To Be Done (TBD).
2406+
2407+ DIFFICULTIES AND SUBTLE POINTS:
2408+
2409+ 1. complex plumbing is required, because:
2410+
2411+ a. effective allocation through pools is needed
2412+
2413+ b. target of operation is not exactly known when operation is
2414+ posted. This is worked around through bitfields in &carry_node and
2415+ logic in lock_carry_node()
2416+
2417+ c. of interaction with locking code: node should be added into sibling
2418+ list when pointer to it is inserted into its parent, which is some time
2419+ after node was created. Between these moments, node is somewhat in
2420+ suspended state and is only registered in the carry lists
2421+
2422+ 2. whole balancing logic is implemented here, in particular, insertion
2423+ logic is coded in make_space().
2424+
2425+ 3. special cases like insertion (reiser4_add_tree_root()) or deletion
2426+ (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
2427+ (insert_paste()) have to be handled.
2428+
2429+ 4. there is non-trivial interdependency between allocation of new nodes
2430+ and almost everything else. This is mainly due to the (1.c) above. I shall
2431+ write about this later.
2432+
2433+*/
2434+
2435+#include "forward.h"
2436+#include "debug.h"
2437+#include "key.h"
2438+#include "coord.h"
2439+#include "plugin/item/item.h"
2440+#include "plugin/item/extent.h"
2441+#include "plugin/node/node.h"
2442+#include "jnode.h"
2443+#include "znode.h"
2444+#include "tree_mod.h"
2445+#include "tree_walk.h"
2446+#include "block_alloc.h"
2447+#include "pool.h"
2448+#include "tree.h"
2449+#include "carry.h"
2450+#include "carry_ops.h"
2451+#include "super.h"
2452+#include "reiser4.h"
2453+
2454+#include <linux/types.h>
2455+
2456+/* level locking/unlocking */
2457+static int lock_carry_level(carry_level * level);
2458+static void unlock_carry_level(carry_level * level, int failure);
2459+static void done_carry_level(carry_level * level);
2460+static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2461+
2462+int lock_carry_node(carry_level * level, carry_node * node);
2463+int lock_carry_node_tail(carry_node * node);
2464+
2465+/* carry processing proper */
2466+static int carry_on_level(carry_level * doing, carry_level * todo);
2467+
2468+static carry_op *add_op(carry_level * level, pool_ordering order,
2469+ carry_op * reference);
2470+
2471+/* handlers for carry operations. */
2472+
2473+static void fatal_carry_error(carry_level * doing, int ecode);
2474+static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2475+
2476+static void print_level(const char *prefix, carry_level * level);
2477+
2478+#if REISER4_DEBUG
2479+typedef enum {
2480+ CARRY_TODO,
2481+ CARRY_DOING
2482+} carry_queue_state;
2483+static int carry_level_invariant(carry_level * level, carry_queue_state state);
2484+#endif
2485+
2486+/* main entry point for tree balancing.
2487+
2488+ Tree carry performs operations from @doing and while doing so accumulates
2489+ information about operations to be performed on the next level ("carried"
2490+ to the parent level). Carried operations are performed, causing possibly
2491+ more operations to be carried upward etc. carry() takes care about
2492+ locking and pinning znodes while operating on them.
2493+
2494+ For usage, see comment at the top of fs/reiser4/carry.c
2495+
2496+*/
2497+int reiser4_carry(carry_level * doing /* set of carry operations to be
2498+ * performed */ ,
2499+ carry_level * done /* set of nodes, already performed
2500+ * at the previous level.
2501+ * NULL in most cases */)
2502+{
2503+ int result = 0;
2504+ /* queue of new requests */
2505+ carry_level *todo;
2506+ ON_DEBUG(STORE_COUNTERS);
2507+
2508+ assert("nikita-888", doing != NULL);
2509+ BUG_ON(done != NULL);
2510+
2511+ todo = doing + 1;
2512+ init_carry_level(todo, doing->pool);
2513+
2514+ /* queue of requests performed on the previous level */
2515+ done = todo + 1;
2516+ init_carry_level(done, doing->pool);
2517+
2518+ /* iterate until there is nothing more to do */
2519+ while (result == 0 && doing->ops_num > 0) {
2520+ carry_level *tmp;
2521+
2522+ /* at this point @done is locked. */
2523+ /* repeat lock/do/unlock while
2524+
2525+ (1) lock_carry_level() fails due to deadlock avoidance, or
2526+
2527+ (2) carry_on_level() decides that more nodes have to
2528+ be involved.
2529+
2530+ (3) some unexpected error occurred while balancing on the
2531+ upper levels. In this case all changes are rolled back.
2532+
2533+ */
2534+ while (1) {
2535+ result = lock_carry_level(doing);
2536+ if (result == 0) {
2537+ /* perform operations from @doing and
2538+ accumulate new requests in @todo */
2539+ result = carry_on_level(doing, todo);
2540+ if (result == 0)
2541+ break;
2542+ else if (result != -E_REPEAT ||
2543+ !doing->restartable) {
2544+ warning("nikita-1043",
2545+ "Fatal error during carry: %i",
2546+ result);
2547+ print_level("done", done);
2548+ print_level("doing", doing);
2549+ print_level("todo", todo);
2550+ /* do some rough stuff like aborting
2551+ all pending transcrashes and thus
2552+ pushing the tree back to a consistent
2553+ state. Alternatively, just panic.
2554+ */
2555+ fatal_carry_error(doing, result);
2556+ return result;
2557+ }
2558+ } else if (result != -E_REPEAT) {
2559+ fatal_carry_error(doing, result);
2560+ return result;
2561+ }
2562+ unlock_carry_level(doing, 1);
2563+ }
2564+ /* at this point @done can be safely unlocked */
2565+ done_carry_level(done);
2566+
2567+ /* cyclically shift queues */
2568+ tmp = done;
2569+ done = doing;
2570+ doing = todo;
2571+ todo = tmp;
2572+ init_carry_level(todo, doing->pool);
2573+
2574+ /* give other threads chance to run */
2575+ reiser4_preempt_point();
2576+ }
2577+ done_carry_level(done);
2578+
2579+ /* all counters, but x_refs should remain the same. x_refs can change
2580+ owing to transaction manager */
2581+ ON_DEBUG(CHECK_COUNTERS);
2582+ return result;
2583+}
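
Note the pointer arithmetic above: @doing, the local @todo and @done are
assumed to be three carry_levels allocated back-to-back in the same carry
pool; that is exactly what the size assertion in init_carry_pool() below
(size >= sizeof(carry_pool) + 3 * sizeof(carry_level)) provides for.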
2584+
2585+/* perform carry operations on given level.
2586+
2587+ Optimizations proposed by pooh:
2588+
2589+ (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
2590+ required;
2591+
2592+ (2) unlock node if there are no more operations to be performed upon it and
2593+ node didn't add any operation to @todo. This can be implemented by
2594+   attaching to each node two counters: a counter of operations working on this
2595+   node and a counter of operations carried upward from this node.
2596+
2597+*/
2598+static int carry_on_level(carry_level * doing /* queue of carry operations to
2599+ * do on this level */ ,
2600+ carry_level * todo /* queue where new carry
2601+ * operations to be performed on
2602+ * the parent level are
2603+ * accumulated during @doing
2604+ * processing. */ )
2605+{
2606+ int result;
2607+ int (*f) (carry_op *, carry_level *, carry_level *);
2608+ carry_op *op;
2609+ carry_op *tmp_op;
2610+
2611+ assert("nikita-1034", doing != NULL);
2612+ assert("nikita-1035", todo != NULL);
2613+
2614+ /* @doing->nodes are locked. */
2615+
2616+ /* This function can be split into two phases: analysis and modification.
2617+
2618+ Analysis calculates precisely what items should be moved between
2619+ nodes. This information is gathered in some structures attached to
2620+ each carry_node in a @doing queue. Analysis also determines whether
2621+ new nodes are to be allocated etc.
2622+
2623+ After analysis is completed, actual modification is performed. Here
2624+ we can take advantage of "batch modification": if there are several
2625+ operations acting on the same node, modifications can be performed
2626+ more efficiently when batched together.
2627+
2628+ Above is an optimization left for the future.
2629+ */
2630+ /* Important, but delayed optimization: it's possible to batch
2631+ operations together and perform them more efficiently as a
2632+ result. For example, deletion of several neighboring items from a
2633+ node can be converted to a single ->cut() operation.
2634+
2635+ Before processing queue, it should be scanned and "mergeable"
2636+ operations merged.
2637+ */
2638+ result = 0;
2639+ for_all_ops(doing, op, tmp_op) {
2640+ carry_opcode opcode;
2641+
2642+ assert("nikita-1041", op != NULL);
2643+ opcode = op->op;
2644+ assert("nikita-1042", op->op < COP_LAST_OP);
2645+ f = op_dispatch_table[op->op].handler;
2646+ result = f(op, doing, todo);
2647+ /* locking can fail with -E_REPEAT. Any different error is fatal
2648+ and will be handled by fatal_carry_error() sledgehammer.
2649+ */
2650+ if (result != 0)
2651+ break;
2652+ }
2653+ if (result == 0) {
2654+ carry_plugin_info info;
2655+ carry_node *scan;
2656+ carry_node *tmp_scan;
2657+
2658+ info.doing = doing;
2659+ info.todo = todo;
2660+
2661+ assert("nikita-3002",
2662+ carry_level_invariant(doing, CARRY_DOING));
2663+ for_all_nodes(doing, scan, tmp_scan) {
2664+ znode *node;
2665+
2666+ node = reiser4_carry_real(scan);
2667+ assert("nikita-2547", node != NULL);
2668+ if (node_is_empty(node)) {
2669+ result =
2670+ node_plugin_by_node(node)->
2671+ prepare_removal(node, &info);
2672+ if (result != 0)
2673+ break;
2674+ }
2675+ }
2676+ }
2677+ return result;
2678+}
2679+
2680+/* post carry operation
2681+
2682+ This is main function used by external carry clients: node layout plugins
2683+ and tree operations to create new carry operation to be performed on some
2684+ level.
2685+
2686+ New operation will be included in the @level queue. To actually perform it,
2687+   call carry( level, ... ). @node must already be write-locked by the caller. Carry
2688+ manages all its locks by itself, don't worry about this.
2689+
2690+ This function adds operation and node at the end of the queue. It is up to
2691+ caller to guarantee proper ordering of node queue.
2692+
2693+*/
2694+carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
2695+ * is to be posted at */ ,
2696+ carry_opcode op /* opcode of operation */ ,
2697+ znode * node /* node on which this operation
2698+ * will operate */ ,
2699+ int apply_to_parent_p /* whether operation will
2700+ * operate directly on @node
2701+ * or on it parent. */)
2702+{
2703+ carry_op *result;
2704+ carry_node *child;
2705+
2706+ assert("nikita-1046", level != NULL);
2707+ assert("nikita-1788", znode_is_write_locked(node));
2708+
2709+ result = add_op(level, POOLO_LAST, NULL);
2710+ if (IS_ERR(result))
2711+ return result;
2712+ child = reiser4_add_carry(level, POOLO_LAST, NULL);
2713+ if (IS_ERR(child)) {
2714+ reiser4_pool_free(&level->pool->op_pool, &result->header);
2715+ return (carry_op *) child;
2716+ }
2717+ result->node = child;
2718+ result->op = op;
2719+ child->parent = apply_to_parent_p;
2720+ if (ZF_ISSET(node, JNODE_ORPHAN))
2721+ child->left_before = 1;
2722+ child->node = node;
2723+ return result;
2724+}
2725+
2726+/* initialize carry queue */
2727+void init_carry_level(carry_level * level /* level to initialize */ ,
2728+ carry_pool * pool /* pool @level will allocate objects
2729+ * from */ )
2730+{
2731+ assert("nikita-1045", level != NULL);
2732+ assert("nikita-967", pool != NULL);
2733+
2734+ memset(level, 0, sizeof *level);
2735+ level->pool = pool;
2736+
2737+ INIT_LIST_HEAD(&level->nodes);
2738+ INIT_LIST_HEAD(&level->ops);
2739+}
2740+
2741+/* allocate carry pool and initialize pools within queue */
2742+carry_pool *init_carry_pool(int size)
2743+{
2744+ carry_pool *pool;
2745+
2746+ assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
2747+ pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
2748+ if (pool == NULL)
2749+ return ERR_PTR(RETERR(-ENOMEM));
2750+
2751+ reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
2752+ (char *)pool->op);
2753+ reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
2754+ NODES_LOCKED_POOL_SIZE, (char *)pool->node);
2755+ return pool;
2756+}
2757+
2758+/* finish with queue pools */
2759+void done_carry_pool(carry_pool * pool /* pool to destroy */ )
2760+{
2761+ reiser4_done_pool(&pool->op_pool);
2762+ reiser4_done_pool(&pool->node_pool);
2763+ kfree(pool);
2764+}
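
Tying the two helpers above to the USAGE comment at the top of this file, a
hedged allocation pattern (error handling trimmed) looks like:

	carry_pool *pool;
	carry_level *lowest_level;

	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(carry_level));
	if (IS_ERR(pool))
		return PTR_ERR(pool);
	/* the three levels live right behind the pool; reiser4_carry()
	   relies on this adjacency (todo = doing + 1, done = todo + 1) */
	lowest_level = (carry_level *) (pool + 1);
	init_carry_level(lowest_level, pool);
	/* ... post operations with reiser4_post_carry(), then ... */
	/* result = reiser4_carry(lowest_level, NULL); */
	done_carry_pool(pool);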
2765+
2766+/* add new carry node to the @level.
2767+
2768+ Returns pointer to the new carry node allocated from pool. It's up to
2769+   callers to maintain proper order in the @level. The assumption is that if carry
2770+   nodes on one level are already sorted and modifications are performed from
2771+ left to right, carry nodes added on the parent level will be ordered
2772+ automatically. To control ordering use @order and @reference parameters.
2773+
2774+*/
2775+carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
2776+ * node to */ ,
2777+ pool_ordering order /* where to insert:
2778+ * at the beginning of
2779+ * @level,
2780+ * before @reference,
2781+ * after @reference,
2782+ * at the end of @level
2783+ */ ,
2784+ carry_node * reference/* reference node for
2785+ * insertion */)
2786+{
2787+ ON_DEBUG(carry_node * orig_ref = reference);
2788+
2789+ if (order == POOLO_BEFORE) {
2790+ reference = find_left_carry(reference, level);
2791+ if (reference == NULL)
2792+ reference = list_entry(level->nodes.next, carry_node,
2793+ header.level_linkage);
2794+ else
2795+ reference = list_entry(reference->header.level_linkage.next,
2796+ carry_node, header.level_linkage);
2797+ } else if (order == POOLO_AFTER) {
2798+ reference = find_right_carry(reference, level);
2799+ if (reference == NULL)
2800+ reference = list_entry(level->nodes.prev, carry_node,
2801+ header.level_linkage);
2802+ else
2803+ reference = list_entry(reference->header.level_linkage.prev,
2804+ carry_node, header.level_linkage);
2805+ }
2806+ assert("nikita-2209",
2807+ ergo(orig_ref != NULL,
2808+ reiser4_carry_real(reference) ==
2809+ reiser4_carry_real(orig_ref)));
2810+ return reiser4_add_carry(level, order, reference);
2811+}
2812+
2813+carry_node *reiser4_add_carry(carry_level * level /* &carry_level to add node
2814+ * to */ ,
2815+ pool_ordering order /* where to insert: at the
2816+ * beginning of @level, before
2817+ * @reference, after @reference,
2818+ * at the end of @level */ ,
2819+ carry_node * reference /* reference node for
2820+ * insertion */ )
2821+{
2822+ carry_node *result;
2823+
2824+ result =
2825+ (carry_node *) reiser4_add_obj(&level->pool->node_pool,
2826+ &level->nodes,
2827+ order, &reference->header);
2828+ if (!IS_ERR(result) && (result != NULL))
2829+ ++level->nodes_num;
2830+ return result;
2831+}
2832+
2833+/* add new carry operation to the @level.
2834+
2835+ Returns pointer to the new carry operations allocated from pool. It's up to
2836+ callers to maintain proper order in the @level. To control ordering use
2837+ @order and @reference parameters.
2838+
2839+*/
2840+static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
2841+ pool_ordering order /* where to insert: at the beginning of
2842+ * @level, before @reference, after
2843+ * @reference, at the end of @level */ ,
2844+ carry_op *
2845+ reference /* reference node for insertion */ )
2846+{
2847+ carry_op *result;
2848+
2849+ result =
2850+ (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
2851+ order, &reference->header);
2852+ if (!IS_ERR(result) && (result != NULL))
2853+ ++level->ops_num;
2854+ return result;
2855+}
2856+
2857+/* Return node on the right of which @node was created.
2858+
2859+   Each node is created on the right of some existing node (or it is a new
2860+   root, which is a special case not handled here).
2861+
2862+ @node is new node created on some level, but not yet inserted into its
2863+ parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
2864+
2865+*/
2866+static carry_node *find_begetting_brother(carry_node * node /* node to start search
2867+ * from */ ,
2868+ carry_level * kin UNUSED_ARG /* level to
2869+ * scan */ )
2870+{
2871+ carry_node *scan;
2872+
2873+ assert("nikita-1614", node != NULL);
2874+ assert("nikita-1615", kin != NULL);
2875+ assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
2876+ assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
2877+ ZF_ISSET(reiser4_carry_real(node),
2878+ JNODE_ORPHAN)));
2879+ for (scan = node;;
2880+ scan = list_entry(scan->header.level_linkage.prev, carry_node,
2881+ header.level_linkage)) {
2882+ assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
2883+ if ((scan->node != node->node) &&
2884+ !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
2885+ assert("nikita-1618", reiser4_carry_real(scan) != NULL);
2886+ break;
2887+ }
2888+ }
2889+ return scan;
2890+}
2891+
2892+static cmp_t
2893+carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
2894+{
2895+ assert("nikita-2199", n1 != NULL);
2896+ assert("nikita-2200", n2 != NULL);
2897+
2898+ if (n1 == n2)
2899+ return EQUAL_TO;
2900+ while (1) {
2901+ n1 = carry_node_next(n1);
2902+ if (carry_node_end(level, n1))
2903+ return GREATER_THAN;
2904+ if (n1 == n2)
2905+ return LESS_THAN;
2906+ }
2907+ impossible("nikita-2201", "End of level reached");
2908+}
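
That is, ordering between two carry nodes is defined purely by their position
in the level's node list: n1 is LESS_THAN n2 iff n2 is reachable from n1 by
walking right. The linear scan is acceptable because a carry level holds only
a handful of nodes (the node pool is sized by NODES_LOCKED_POOL_SIZE).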
2909+
2910+carry_node *find_carry_node(carry_level * level, const znode * node)
2911+{
2912+ carry_node *scan;
2913+ carry_node *tmp_scan;
2914+
2915+ assert("nikita-2202", level != NULL);
2916+ assert("nikita-2203", node != NULL);
2917+
2918+ for_all_nodes(level, scan, tmp_scan) {
2919+ if (reiser4_carry_real(scan) == node)
2920+ return scan;
2921+ }
2922+ return NULL;
2923+}
2924+
2925+znode *reiser4_carry_real(const carry_node * node)
2926+{
2927+ assert("nikita-3061", node != NULL);
2928+
2929+ return node->lock_handle.node;
2930+}
2931+
2932+carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
2933+ const znode * node)
2934+{
2935+ carry_node *base;
2936+ carry_node *scan;
2937+ carry_node *tmp_scan;
2938+ carry_node *proj;
2939+
2940+ base = find_carry_node(doing, node);
2941+ assert("nikita-2204", base != NULL);
2942+
2943+ for_all_nodes(todo, scan, tmp_scan) {
2944+ proj = find_carry_node(doing, scan->node);
2945+ assert("nikita-2205", proj != NULL);
2946+ if (carry_node_cmp(doing, proj, base) != LESS_THAN)
2947+ break;
2948+ }
2949+ return scan;
2950+}
2951+
2952+static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
2953+ znode * node)
2954+{
2955+ carry_node *reference;
2956+
2957+ assert("nikita-2994", doing != NULL);
2958+ assert("nikita-2995", todo != NULL);
2959+ assert("nikita-2996", node != NULL);
2960+
2961+ reference = insert_carry_node(doing, todo, node);
2962+ assert("nikita-2997", reference != NULL);
2963+
2964+ return reiser4_add_carry(todo, POOLO_BEFORE, reference);
2965+}
2966+
2967+/* like reiser4_post_carry(), but designed to be called from node plugin methods.
2968+ This function is different from reiser4_post_carry() in that it finds proper
2969+ place to insert node in the queue. */
2970+carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
2971+ * passed down to node
2972+ * plugin */ ,
2973+ carry_opcode op /* opcode of operation */ ,
2974+ znode * node /* node on which this
2975+ * operation will operate */ ,
2976+ int apply_to_parent_p /* whether operation will
2977+ * operate directly on @node
2978+ * or on it parent. */ )
2979+{
2980+ carry_op *result;
2981+ carry_node *child;
2982+
2983+ assert("nikita-2207", info != NULL);
2984+ assert("nikita-2208", info->todo != NULL);
2985+
2986+ if (info->doing == NULL)
2987+ return reiser4_post_carry(info->todo, op, node,
2988+ apply_to_parent_p);
2989+
2990+ result = add_op(info->todo, POOLO_LAST, NULL);
2991+ if (IS_ERR(result))
2992+ return result;
2993+ child = add_carry_atplace(info->doing, info->todo, node);
2994+ if (IS_ERR(child)) {
2995+ reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
2996+ return (carry_op *) child;
2997+ }
2998+ result->node = child;
2999+ result->op = op;
3000+ child->parent = apply_to_parent_p;
3001+ if (ZF_ISSET(node, JNODE_ORPHAN))
3002+ child->left_before = 1;
3003+ child->node = node;
3004+ return result;
3005+}
3006+
3007+/* lock all carry nodes in @level */
3008+static int lock_carry_level(carry_level * level /* level to lock */ )
3009+{
3010+ int result;
3011+ carry_node *node;
3012+ carry_node *tmp_node;
3013+
3014+ assert("nikita-881", level != NULL);
3015+ assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
3016+
3017+ /* lock nodes from left to right */
3018+ result = 0;
3019+ for_all_nodes(level, node, tmp_node) {
3020+ result = lock_carry_node(level, node);
3021+ if (result != 0)
3022+ break;
3023+ }
3024+ return result;
3025+}
3026+
3027+/* Synchronize delimiting keys between @node and its left neighbor.
3028+
3029+ To reduce contention on dk key and simplify carry code, we synchronize
3030+ delimiting keys only when carry ultimately leaves tree level (carrying
3031+ changes upward) and unlocks nodes at this level.
3032+
3033+ This function first finds left neighbor of @node and then updates left
3034+   neighbor's right delimiting key to coincide with the least key in @node.
3035+
3036+*/
3037+
3038+ON_DEBUG(extern atomic_t delim_key_version;
3039+ )
3040+
3041+static void sync_dkeys(znode * spot /* node to update */ )
3042+{
3043+ reiser4_key pivot;
3044+ reiser4_tree *tree;
3045+
3046+ assert("nikita-1610", spot != NULL);
3047+ assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3048+
3049+ tree = znode_get_tree(spot);
3050+ read_lock_tree(tree);
3051+ write_lock_dk(tree);
3052+
3053+ assert("nikita-2192", znode_is_loaded(spot));
3054+
3055+ /* sync left delimiting key of @spot with key in its leftmost item */
3056+ if (node_is_empty(spot))
3057+ pivot = *znode_get_rd_key(spot);
3058+ else
3059+ leftmost_key_in_node(spot, &pivot);
3060+
3061+ znode_set_ld_key(spot, &pivot);
3062+
3063+ /* there can be sequence of empty nodes pending removal on the left of
3064+ @spot. Scan them and update their left and right delimiting keys to
3065+ match left delimiting key of @spot. Also, update right delimiting
3066+ key of first non-empty left neighbor.
3067+ */
3068+ while (1) {
3069+ if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3070+ break;
3071+
3072+ spot = spot->left;
3073+ if (spot == NULL)
3074+ break;
3075+
3076+ znode_set_rd_key(spot, &pivot);
3077+ /* don't sink into the domain of another balancing */
3078+ if (!znode_is_write_locked(spot))
3079+ break;
3080+ if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3081+ znode_set_ld_key(spot, &pivot);
3082+ else
3083+ break;
3084+ }
3085+
3086+ write_unlock_dk(tree);
3087+ read_unlock_tree(tree);
3088+}
3089+
3090+/* unlock all carry nodes in @level */
3091+static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3092+ int failure /* true if unlocking owing to
3093+ * failure */ )
3094+{
3095+ carry_node *node;
3096+ carry_node *tmp_node;
3097+
3098+ assert("nikita-889", level != NULL);
3099+
3100+ if (!failure) {
3101+ znode *spot;
3102+
3103+ spot = NULL;
3104+ /* update delimiting keys */
3105+ for_all_nodes(level, node, tmp_node) {
3106+ if (reiser4_carry_real(node) != spot) {
3107+ spot = reiser4_carry_real(node);
3108+ sync_dkeys(spot);
3109+ }
3110+ }
3111+ }
3112+
3113+ /* nodes can be unlocked in arbitrary order. In preemptible
3114+ environment it's better to unlock in reverse order of locking,
3115+ though.
3116+ */
3117+ for_all_nodes_back(level, node, tmp_node) {
3118+ /* all allocated nodes should be already linked to their
3119+ parents at this moment. */
3120+ assert("nikita-1631",
3121+ ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
3122+ JNODE_ORPHAN)));
3123+ ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
3124+ unlock_carry_node(level, node, failure);
3125+ }
3126+ level->new_root = NULL;
3127+}
3128+
3129+/* finish with @level
3130+
3131+ Unlock nodes and release all allocated resources */
3132+static void done_carry_level(carry_level * level /* level to finish */ )
3133+{
3134+ carry_node *node;
3135+ carry_node *tmp_node;
3136+ carry_op *op;
3137+ carry_op *tmp_op;
3138+
3139+ assert("nikita-1076", level != NULL);
3140+
3141+ unlock_carry_level(level, 0);
3142+ for_all_nodes(level, node, tmp_node) {
3143+ assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3144+ assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3145+ reiser4_pool_free(&level->pool->node_pool, &node->header);
3146+ }
3147+ for_all_ops(level, op, tmp_op)
3148+ reiser4_pool_free(&level->pool->op_pool, &op->header);
3149+}
3150+
3151+/* helper function to complete locking of carry node
3152+
3153+   Finish locking of a carry node. There are several ways in which a new carry
3154+   node can be added into a carry level and locked. The normal way is through
3155+   lock_carry_node(), but it also happens from find_{left|right}_neighbor(). This
3156+   function factors out the common final part of all locking scenarios. It
3157+   assumes that @node->lock_handle is the lock handle for the lock just taken and
3158+ fills ->real_node from this lock handle.
3159+
3160+*/
3161+int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3162+{
3163+ assert("nikita-1052", node != NULL);
3164+ assert("nikita-1187", reiser4_carry_real(node) != NULL);
3165+ assert("nikita-1188", !node->unlock);
3166+
3167+ node->unlock = 1;
3168+ /* Load node content into memory and install node plugin by
3169+ looking at the node header.
3170+
3171+ Most of the time this call is cheap because the node is
3172+ already in memory.
3173+
3174+ Corresponding zrelse() is in unlock_carry_node()
3175+ */
3176+ return zload(reiser4_carry_real(node));
3177+}
3178+
3179+/* lock carry node
3180+
3181+ "Resolve" node to real znode, lock it and mark as locked.
3182+ This requires recursive locking of znodes.
3183+
3184+   When an operation is posted to the parent level, the node it will be applied
3185+   to is not yet known. For example, when shifting data between two nodes, the
3186+   delimiting key has to be updated in the parent or parents of the nodes
3187+   involved. But their parents are not yet locked and, moreover, said nodes
3188+   can be reparented by concurrent balancing.
3189+
3190+ To work around this, carry operation is applied to special "carry node"
3191+ rather than to the znode itself. Carry node consists of some "base" or
3192+ "reference" znode and flags indicating how to get to the target of carry
3193+ operation (->real_node field of carry_node) from base.
3194+
3195+*/
3196+int lock_carry_node(carry_level * level /* level @node is in */ ,
3197+ carry_node * node /* node to lock */ )
3198+{
3199+ int result;
3200+ znode *reference_point;
3201+ lock_handle lh;
3202+ lock_handle tmp_lh;
3203+ reiser4_tree *tree;
3204+
3205+ assert("nikita-887", level != NULL);
3206+ assert("nikita-882", node != NULL);
3207+
3208+ result = 0;
3209+ reference_point = node->node;
3210+ init_lh(&lh);
3211+ init_lh(&tmp_lh);
3212+ if (node->left_before) {
3213+ /* handling of new nodes, allocated on the previous level:
3214+
3215+ some carry ops were probably posted from the new node, but
3216+ this node neither has parent pointer set, nor is
3217+ connected. This will be done in ->create_hook() for
3218+ internal item.
3219+
3220+ Nonetheless, the parent of the new node has to be locked. To do
3221+ this, first go to the "left" in the carry order. This
3222+ depends on the decision to always allocate new node on the
3223+ right of existing one.
3224+
3225+ Loop handles case when multiple nodes, all orphans, were
3226+ inserted.
3227+
3228+ Strictly speaking, taking tree lock is not necessary here,
3229+ because all nodes scanned by loop in
3230+ find_begetting_brother() are write-locked by this thread,
3231+ and thus, their sibling linkage cannot change.
3232+
3233+ */
3234+ tree = znode_get_tree(reference_point);
3235+ read_lock_tree(tree);
3236+ reference_point = find_begetting_brother(node, level)->node;
3237+ read_unlock_tree(tree);
3238+ assert("nikita-1186", reference_point != NULL);
3239+ }
3240+ if (node->parent && (result == 0)) {
3241+ result =
3242+ reiser4_get_parent(&tmp_lh, reference_point,
3243+ ZNODE_WRITE_LOCK);
3244+ if (result != 0) {
3245+ ; /* nothing */
3246+ } else if (znode_get_level(tmp_lh.node) == 0) {
3247+ assert("nikita-1347", znode_above_root(tmp_lh.node));
3248+ result = add_new_root(level, node, tmp_lh.node);
3249+ if (result == 0) {
3250+ reference_point = level->new_root;
3251+ move_lh(&lh, &node->lock_handle);
3252+ }
3253+ } else if ((level->new_root != NULL)
3254+ && (level->new_root !=
3255+ znode_parent_nolock(reference_point))) {
3256+ /* parent of node exists, but this level already
3257+ created a different new root, so */
3258+ warning("nikita-1109",
3259+ /* it should be "radicis", but tradition is
3260+ tradition. do banshees read latin? */
3261+ "hodie natus est radici frater");
3262+ result = -EIO;
3263+ } else {
3264+ move_lh(&lh, &tmp_lh);
3265+ reference_point = lh.node;
3266+ }
3267+ }
3268+ if (node->left && (result == 0)) {
3269+ assert("nikita-1183", node->parent);
3270+ assert("nikita-883", reference_point != NULL);
3271+ result =
3272+ reiser4_get_left_neighbor(&tmp_lh, reference_point,
3273+ ZNODE_WRITE_LOCK,
3274+ GN_CAN_USE_UPPER_LEVELS);
3275+ if (result == 0) {
3276+ done_lh(&lh);
3277+ move_lh(&lh, &tmp_lh);
3278+ reference_point = lh.node;
3279+ }
3280+ }
3281+ if (!node->parent && !node->left && !node->left_before) {
3282+ result =
3283+ longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3284+ ZNODE_LOCK_HIPRI);
3285+ }
3286+ if (result == 0) {
3287+ move_lh(&node->lock_handle, &lh);
3288+ result = lock_carry_node_tail(node);
3289+ }
3290+ done_lh(&tmp_lh);
3291+ done_lh(&lh);
3292+ return result;
3293+}
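
In short, the flag bits resolve the reference point step by step:
->left_before first redirects it to the begetting brother of a still-orphan
node, ->parent then climbs to the parent (possibly creating a new root),
->left moves one node to the left on that level, and a node with none of the
flags set is simply write-locked directly.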
3294+
3295+/* release a lock on &carry_node.
3296+
3297+   Release, if necessary, the lock on @node. This operation is the pair of
3298+   lock_carry_node() and is idempotent: you can call it more than once on the
3299+   same node.
3300+
3301+*/
3302+static void
3303+unlock_carry_node(carry_level * level,
3304+ carry_node * node /* node to be released */ ,
3305+ int failure /* true if node is unlocked due
3306+ * to some error */ )
3307+{
3308+ znode *real_node;
3309+
3310+ assert("nikita-884", node != NULL);
3311+
3312+ real_node = reiser4_carry_real(node);
3313+ /* pair to zload() in lock_carry_node_tail() */
3314+ zrelse(real_node);
3315+ if (node->unlock && (real_node != NULL)) {
3316+ assert("nikita-899", real_node == node->lock_handle.node);
3317+ longterm_unlock_znode(&node->lock_handle);
3318+ }
3319+ if (failure) {
3320+ if (node->deallocate && (real_node != NULL)) {
3321+ /* free node in bitmap
3322+
3323+ Prepare node for removal. Last zput() will finish
3324+ with it.
3325+ */
3326+ ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3327+ }
3328+ if (node->free) {
3329+ assert("nikita-2177",
3330+ list_empty_careful(&node->lock_handle.locks_link));
3331+ assert("nikita-2112",
3332+ list_empty_careful(&node->lock_handle.owners_link));
3333+ reiser4_pool_free(&level->pool->node_pool,
3334+ &node->header);
3335+ }
3336+ }
3337+}
3338+
3339+/* fatal_carry_error() - all-catching error handling function
3340+
3341+   It is possible that carry faces an unrecoverable error, like inability to
3342+   insert a pointer at the internal level. Our simple solution is just to panic
3343+   in this situation. More sophisticated things, like an attempt to remount the
3344+   file-system as read-only, can be implemented without much difficulty.
3345+
3346+   It is believed that:
3347+
3348+   1. instead of panicking, all current transactions can be aborted, rolling
3349+   the system back to a consistent state.
3350+
3351+Umm, if you simply panic without doing anything more at all, then all current
3352+transactions are aborted and the system is rolled back to a consistent state,
3353+by virtue of the design of the transactional mechanism. Well, wait, let's be
3354+precise. If an internal node is corrupted on disk due to hardware failure,
3355+then there may be no consistent state that can be rolled back to, so instead
3356+we should say that it will roll back the transactions, which, barring other
3357+factors, means rolling back to a consistent state.
3358+
3359+# Nikita: there is a subtle difference between panic and aborting
3360+# transactions: the machine doesn't reboot. Processes aren't killed. Processes
3361+# not using reiser4 (not that we care about such processes), or using other
3362+# reiser4 mounts (about which we do care), will simply continue to run. With
3363+# some luck, even an application using the aborted file system can survive: it
3364+# will get some error, like EBADF, from each file descriptor on the failed file
3365+# system, but applications that do care about fault tolerance will cope with
3366+# this (squid will).
3367+
3368+It would be a nice feature though to support rollback without rebooting
3369+followed by remount, but this can wait for later versions.
3370+
3371+ 2. once isolated transactions will be implemented it will be possible to
3372+ roll back offending transaction.
3373+
3374+Point 2 adds code complexity of uncertain value (it implies that a broken tree should be kept in operation), so
3375+we must think about it more before deciding if it should be done. -Hans
3376+
3377+*/
3378+static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
3379+ * where
3380+ * unrecoverable
3381+ * error
3382+ * occurred */ ,
3383+ int ecode /* error code */ )
3384+{
3385+ assert("nikita-1230", doing != NULL);
3386+ assert("nikita-1231", ecode < 0);
3387+
3388+ reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3389+}
3390+
3391+/* add new root to the tree
3392+
3393+   This function itself only manages changes in carry structures and delegates
3394+   all hard work (allocation of a znode for the new root, changes of parent and
3395+   sibling pointers) to reiser4_add_tree_root().
3396+
3397+ Locking: old tree root is locked by carry at this point. Fake znode is also
3398+ locked.
3399+
3400+*/
3401+static int add_new_root(carry_level * level /* carry level in context of which
3402+ * operation is performed */ ,
3403+ carry_node * node /* carry node for existing root */ ,
3404+ znode * fake /* "fake" znode already locked by
3405+ * us */ )
3406+{
3407+ int result;
3408+
3409+ assert("nikita-1104", level != NULL);
3410+ assert("nikita-1105", node != NULL);
3411+
3412+ assert("nikita-1403", znode_is_write_locked(node->node));
3413+ assert("nikita-1404", znode_is_write_locked(fake));
3414+
3415+ /* trying to create new root. */
3416+ /* @node is root and it's already locked by us. This
3417+ means that nobody else can be trying to add/remove
3418+ tree root right now.
3419+ */
3420+ if (level->new_root == NULL)
3421+ level->new_root = reiser4_add_tree_root(node->node, fake);
3422+ if (!IS_ERR(level->new_root)) {
3423+ assert("nikita-1210", znode_is_root(level->new_root));
3424+ node->deallocate = 1;
3425+ result =
3426+ longterm_lock_znode(&node->lock_handle, level->new_root,
3427+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3428+ if (result == 0)
3429+ zput(level->new_root);
3430+ } else {
3431+ result = PTR_ERR(level->new_root);
3432+ level->new_root = NULL;
3433+ }
3434+ return result;
3435+}
3436+
3437+/* allocate a new znode and add to the @todo level the operation
3438+   that inserts the pointer to it into the parent node
3439+
3440+ Allocate new znode, add it into carry queue and post into @todo queue
3441+ request to add pointer to new node into its parent.
3442+
3443+   This is a carry-related routine that calls reiser4_new_node() to allocate the
3444+   new node.
3445+*/
3446+carry_node *add_new_znode(znode * brother /* existing left neighbor of new
3447+ * node */ ,
3448+ carry_node * ref /* carry node after which new
3449+ * carry node is to be inserted
3450+ * into queue. This affects
3451+ * locking. */ ,
3452+ carry_level * doing /* carry queue where new node is
3453+ * to be added */ ,
3454+ carry_level * todo /* carry queue where COP_INSERT
3455+ * operation to add pointer to
3456+					 * new node will be added */ )
3457+{
3458+ carry_node *fresh;
3459+ znode *new_znode;
3460+ carry_op *add_pointer;
3461+ carry_plugin_info info;
3462+
3463+ assert("nikita-1048", brother != NULL);
3464+ assert("nikita-1049", todo != NULL);
3465+
3466+	/* There are a lot of possible variations here: to what parent
3467+	   the new node will be attached and where. For simplicity, always
3468+ do the following:
3469+
3470+ (1) new node and @brother will have the same parent.
3471+
3472+ (2) new node is added on the right of @brother
3473+
3474+ */
3475+
3476+ fresh = reiser4_add_carry_skip(doing,
3477+ ref ? POOLO_AFTER : POOLO_LAST, ref);
3478+ if (IS_ERR(fresh))
3479+ return fresh;
3480+
3481+ fresh->deallocate = 1;
3482+ fresh->free = 1;
3483+
3484+ new_znode = reiser4_new_node(brother, znode_get_level(brother));
3485+ if (IS_ERR(new_znode))
3486+ /* @fresh will be deallocated automatically by error
3487+ handling code in the caller. */
3488+ return (carry_node *) new_znode;
3489+
3490+	/* reiser4_new_node() returned a znode with x_count of 1. The caller
3491+	   has to decrease it; make_space() does. */
3492+
3493+ ZF_SET(new_znode, JNODE_ORPHAN);
3494+ fresh->node = new_znode;
3495+
3496+ while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
3497+ ref = carry_node_prev(ref);
3498+ assert("nikita-1606", !carry_node_end(doing, ref));
3499+ }
3500+
3501+ info.todo = todo;
3502+ info.doing = doing;
3503+ add_pointer = node_post_carry(&info, COP_INSERT,
3504+ reiser4_carry_real(ref), 1);
3505+ if (IS_ERR(add_pointer)) {
3506+ /* no need to deallocate @new_znode here: it will be
3507+ deallocated during carry error handling. */
3508+ return (carry_node *) add_pointer;
3509+ }
3510+
3511+ add_pointer->u.insert.type = COPT_CHILD;
3512+ add_pointer->u.insert.child = fresh;
3513+ add_pointer->u.insert.brother = brother;
3514+	/* initially the new node spans an empty key range */
3515+ write_lock_dk(znode_get_tree(brother));
3516+ znode_set_ld_key(new_znode,
3517+ znode_set_rd_key(new_znode,
3518+ znode_get_rd_key(brother)));
3519+ write_unlock_dk(znode_get_tree(brother));
3520+ return fresh;
3521+}
3522+
3523+/* DEBUGGING FUNCTIONS.
3524+
3525+   We should probably leave these enabled even when
3526+   debugging is turned off, to print dumps on errors.
3527+*/
3528+#if REISER4_DEBUG
3529+static int carry_level_invariant(carry_level * level, carry_queue_state state)
3530+{
3531+ carry_node *node;
3532+ carry_node *tmp_node;
3533+
3534+ if (level == NULL)
3535+ return 0;
3536+
3537+ if (level->track_type != 0 &&
3538+ level->track_type != CARRY_TRACK_NODE &&
3539+ level->track_type != CARRY_TRACK_CHANGE)
3540+ return 0;
3541+
3542+ /* check that nodes are in ascending order */
3543+ for_all_nodes(level, node, tmp_node) {
3544+ znode *left;
3545+ znode *right;
3546+
3547+ reiser4_key lkey;
3548+ reiser4_key rkey;
3549+
3550+ if (node != carry_node_front(level)) {
3551+ if (state == CARRY_TODO) {
3552+ right = node->node;
3553+ left = carry_node_prev(node)->node;
3554+ } else {
3555+ right = reiser4_carry_real(node);
3556+ left = reiser4_carry_real(carry_node_prev(node));
3557+ }
3558+ if (right == NULL || left == NULL)
3559+ continue;
3560+ if (node_is_empty(right) || node_is_empty(left))
3561+ continue;
3562+ if (!keyle(leftmost_key_in_node(left, &lkey),
3563+ leftmost_key_in_node(right, &rkey))) {
3564+ warning("", "wrong key order");
3565+ return 0;
3566+ }
3567+ }
3568+ }
3569+ return 1;
3570+}
3571+#endif
3572+
3573+/* get symbolic name for boolean */
3574+static const char *tf(int boolean /* truth value */ )
3575+{
3576+ return boolean ? "t" : "f";
3577+}
3578+
3579+/* symbolic name for carry operation */
3580+static const char *carry_op_name(carry_opcode op /* carry opcode */ )
3581+{
3582+ switch (op) {
3583+ case COP_INSERT:
3584+ return "COP_INSERT";
3585+ case COP_DELETE:
3586+ return "COP_DELETE";
3587+ case COP_CUT:
3588+ return "COP_CUT";
3589+ case COP_PASTE:
3590+ return "COP_PASTE";
3591+ case COP_UPDATE:
3592+ return "COP_UPDATE";
3593+ case COP_EXTENT:
3594+ return "COP_EXTENT";
3595+ case COP_INSERT_FLOW:
3596+ return "COP_INSERT_FLOW";
3597+ default:{
3598+ /* not mt safe, but who cares? */
3599+			static char buf[24];
3600+
3601+ sprintf(buf, "unknown op: %x", op);
3602+ return buf;
3603+ }
3604+ }
3605+}
3606+
3607+/* dump information about carry node */
3608+static void print_carry(const char *prefix /* prefix to print */ ,
3609+ carry_node * node /* node to print */ )
3610+{
3611+ if (node == NULL) {
3612+ printk("%s: null\n", prefix);
3613+ return;
3614+ }
3615+ printk
3616+ ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
3617+ prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
3618+ tf(node->free), tf(node->deallocate));
3619+}
3620+
3621+/* dump information about carry operation */
3622+static void print_op(const char *prefix /* prefix to print */ ,
3623+ carry_op * op /* operation to print */ )
3624+{
3625+ if (op == NULL) {
3626+ printk("%s: null\n", prefix);
3627+ return;
3628+ }
3629+ printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
3630+ print_carry("\tnode", op->node);
3631+ switch (op->op) {
3632+ case COP_INSERT:
3633+ case COP_PASTE:
3634+ print_coord("\tcoord",
3635+ op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
3636+ reiser4_print_key("\tkey",
3637+ op->u.insert.d ? op->u.insert.d->key : NULL);
3638+ print_carry("\tchild", op->u.insert.child);
3639+ break;
3640+ case COP_DELETE:
3641+ print_carry("\tchild", op->u.delete.child);
3642+ break;
3643+ case COP_CUT:
3644+ if (op->u.cut_or_kill.is_cut) {
3645+ print_coord("\tfrom",
3646+ op->u.cut_or_kill.u.kill->params.from, 0);
3647+ print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
3648+ 0);
3649+ } else {
3650+ print_coord("\tfrom",
3651+ op->u.cut_or_kill.u.cut->params.from, 0);
3652+ print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
3653+ 0);
3654+ }
3655+ break;
3656+ case COP_UPDATE:
3657+ print_carry("\tleft", op->u.update.left);
3658+ break;
3659+ default:
3660+ /* do nothing */
3661+ break;
3662+ }
3663+}
3664+
3665+/* dump information about all nodes and operations in a @level */
3666+static void print_level(const char *prefix /* prefix to print */ ,
3667+ carry_level * level /* level to print */ )
3668+{
3669+ carry_node *node;
3670+ carry_node *tmp_node;
3671+ carry_op *op;
3672+ carry_op *tmp_op;
3673+
3674+ if (level == NULL) {
3675+ printk("%s: null\n", prefix);
3676+ return;
3677+ }
3678+ printk("%s: %p, restartable: %s\n",
3679+ prefix, level, tf(level->restartable));
3680+
3681+ for_all_nodes(level, node, tmp_node)
3682+ print_carry("\tcarry node", node);
3683+ for_all_ops(level, op, tmp_op)
3684+ print_op("\tcarry op", op);
3685+}
3686+
3687+/* Make Linus happy.
3688+ Local variables:
3689+ c-indentation-style: "K&R"
3690+ mode-name: "LC"
3691+ c-basic-offset: 8
3692+ tab-width: 8
3693+ fill-column: 120
3694+ scroll-step: 1
3695+ End:
3696+*/
3697diff -urN linux-2.6.24.orig/fs/reiser4/carry.h linux-2.6.24/fs/reiser4/carry.h
3698--- linux-2.6.24.orig/fs/reiser4/carry.h 1970-01-01 03:00:00.000000000 +0300
3699+++ linux-2.6.24/fs/reiser4/carry.h 2008-01-25 11:39:06.896197385 +0300
3700@@ -0,0 +1,442 @@
3701+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3702+
3703+/* Functions and data types to "carry" tree modification(s) upward.
3704+ See fs/reiser4/carry.c for details. */
3705+
3706+#if !defined( __FS_REISER4_CARRY_H__ )
3707+#define __FS_REISER4_CARRY_H__
3708+
3709+#include "forward.h"
3710+#include "debug.h"
3711+#include "pool.h"
3712+#include "znode.h"
3713+
3714+#include <linux/types.h>
3715+
3716+/* &carry_node - "location" of carry node.
3717+
3718+ "location" of node that is involved or going to be involved into
3719+ carry process. Node where operation will be carried to on the
3720+ parent level cannot be recorded explicitly. Operation will be carried
3721+ usually to the parent of some node (where changes are performed at
3722+ the current level) or, to the left neighbor of its parent. But while
3723+ modifications are performed at the current level, parent may
3724+ change. So, we have to allow some indirection (or, positevly,
3725+ flexibility) in locating carry nodes.
3726+
3727+*/
3728+typedef struct carry_node {
3729+ /* pool linkage */
3730+ struct reiser4_pool_header header;
3731+
3732+ /* base node from which real_node is calculated. See
3733+ fs/reiser4/carry.c:lock_carry_node(). */
3734+ znode *node;
3735+
3736+ /* how to get ->real_node */
3737+ /* to get ->real_node obtain parent of ->node */
3738+ __u32 parent:1;
3739+ /* to get ->real_node obtain left neighbor of parent of
3740+ ->node */
3741+ __u32 left:1;
3742+ __u32 left_before:1;
3743+
3744+ /* locking */
3745+
3746+ /* this node was locked by carry process and should be
3747+ unlocked when carry leaves a level */
3748+ __u32 unlock:1;
3749+
3750+ /* disk block for this node was allocated by carry process and
3751+ should be deallocated when carry leaves a level */
3752+ __u32 deallocate:1;
3753+ /* this carry node was allocated by carry process and should be
3754+ freed when carry leaves a level */
3755+ __u32 free:1;
3756+
3757+ /* type of lock we want to take on this node */
3758+ lock_handle lock_handle;
3759+} carry_node;
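
To make the ->parent/->left indirection above concrete, here is a hedged sketch of how lock_carry_node() conceptually resolves the base ->node plus the flag bits into the znode that actually gets locked. This is illustration only, not the actual reiser4_carry_real(): the real code in carry.c takes long-term locks, handles races and new roots, and records the result in ->lock_handle; znode_parent() and the ->left sibling pointer are assumed valid under the tree lock here.

	/* conceptual resolution only; not the actual reiser4 code path */
	static znode *resolve_carry_node_sketch(const carry_node *cn)
	{
		znode *real = cn->node;

		if (cn->parent)		/* operate on the parent of ->node... */
			real = znode_parent(real);
		if (cn->left)		/* ...or on that parent's left neighbor */
			real = real->left;
		return real;
	}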
3760+
3761+/* &carry_opcode - elementary operations that can be carried upward
3762+
3763+ Operations that carry() can handle. This list is supposed to be
3764+ expanded.
3765+
3766+ Each carry operation (cop) is handled by appropriate function defined
3767+ in fs/reiser4/carry.c. For example COP_INSERT is handled by
3768+ fs/reiser4/carry.c:carry_insert() etc. These functions in turn
3769+ call plugins of nodes affected by operation to modify nodes' content
3770+ and to gather operations to be performed on the next level.
3771+
3772+*/
3773+typedef enum {
3774+ /* insert new item into node. */
3775+ COP_INSERT,
3776+ /* delete pointer from parent node */
3777+ COP_DELETE,
3778+ /* remove part of or whole node. */
3779+ COP_CUT,
3780+ /* increase size of item. */
3781+ COP_PASTE,
3782+ /* insert extent (that is sequence of unformatted nodes). */
3783+ COP_EXTENT,
3784+ /* update delimiting key in least common ancestor of two
3785+ nodes. This is performed when items are moved between two
3786+ nodes.
3787+ */
3788+ COP_UPDATE,
3789+ /* insert flow */
3790+ COP_INSERT_FLOW,
3791+ COP_LAST_OP,
3792+} carry_opcode;
3793+
3794+#define CARRY_FLOW_NEW_NODES_LIMIT 20
3795+
3796+/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
3797+ item is determined. */
3798+typedef enum {
3799+ /* target item is one containing pointer to the ->child node */
3800+ COPT_CHILD,
3801+ /* target item is given explicitly by @coord */
3802+ COPT_ITEM_DATA,
3803+ /* target item is given by key */
3804+ COPT_KEY,
3805+ /* see insert_paste_common() for more comments on this. */
3806+ COPT_PASTE_RESTARTED,
3807+} cop_insert_pos_type;
3808+
3809+/* flags to cut and delete */
3810+typedef enum {
3811+ /* don't kill node even if it became completely empty as results of
3812+ * cut. This is needed for eottl handling. See carry_extent() for
3813+ * details. */
3814+ DELETE_RETAIN_EMPTY = (1 << 0)
3815+} cop_delete_flag;
3816+
3817+/*
3818+ * carry() implements "lock handle tracking" feature.
3819+ *
3820+ * Callers supply carry with the node where the initial operation is to be
3821+ * performed and a lock handle on this node. Trying to optimize node
3822+ * utilization, carry may actually move the insertion point to a different
3823+ * node. Callers expect that the lock handle will be transferred to the new node also.
3824+ *
3825+ */
3826+typedef enum {
3827+ /* transfer lock handle along with insertion point */
3828+ CARRY_TRACK_CHANGE = 1,
3829+ /* acquire new lock handle to the node where insertion point is. This
3830+ * is used when carry() client doesn't initially possess lock handle
3831+ * on the insertion point node, for example, by extent insertion
3832+ * code. See carry_extent(). */
3833+ CARRY_TRACK_NODE = 2
3834+} carry_track_type;
3835+
3836+/* data supplied to COP_{INSERT|PASTE} by callers */
3837+typedef struct carry_insert_data {
3838+ /* position where new item is to be inserted */
3839+ coord_t *coord;
3840+ /* new item description */
3841+ reiser4_item_data *data;
3842+ /* key of new item */
3843+ const reiser4_key *key;
3844+} carry_insert_data;
3845+
3846+/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
3847+struct cut_kill_params {
3848+ /* coord where cut starts (inclusive) */
3849+ coord_t *from;
3850+ /* coord where cut stops (inclusive, this item/unit will also be
3851+ * cut) */
3852+ coord_t *to;
3853+ /* starting key. This is necessary when item and unit pos don't
3854+	 * uniquely identify what portion of the tree to remove. For example, this
3855+ * indicates what portion of extent unit will be affected. */
3856+ const reiser4_key *from_key;
3857+ /* exclusive stop key */
3858+ const reiser4_key *to_key;
3859+ /* if this is not NULL, smallest actually removed key is stored
3860+ * here. */
3861+ reiser4_key *smallest_removed;
3862+ /* kill_node_content() is called for file truncate */
3863+ int truncate;
3864+};
3865+
3866+struct carry_cut_data {
3867+ struct cut_kill_params params;
3868+};
3869+
3870+struct carry_kill_data {
3871+ struct cut_kill_params params;
3872+ /* parameter to be passed to the ->kill_hook() method of item
3873+ * plugin */
3874+ /*void *iplug_params; *//* FIXME: unused currently */
3875+ /* if not NULL---inode whose items are being removed. This is needed
3876+ * for ->kill_hook() of extent item to update VM structures when
3877+ * removing pages. */
3878+ struct inode *inode;
3879+ /* sibling list maintenance is complicated by existence of eottl. When
3880+ * eottl whose left and right neighbors are formatted leaves is
3881+ * removed, one has to connect said leaves in the sibling list. This
3882+ * cannot be done when extent removal is just started as locking rules
3883+ * require sibling list update to happen atomically with removal of
3884+ * extent item. Therefore: 1. pointers to left and right neighbors
3885+ * have to be passed down to the ->kill_hook() of extent item, and
3886+ * 2. said neighbors have to be locked. */
3887+ lock_handle *left;
3888+ lock_handle *right;
3889+ /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
3890+ unsigned flags;
3891+ char *buf;
3892+};
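
As a hedged illustration of the shared parameter block above, this sketch fills carry_cut_data for a plain cut; the from/to coords and keys are hypothetical caller locals, and posting the COP_CUT operation itself is elided:

	struct carry_cut_data cdata;

	cdata.params.from = &from_coord;	/* first unit to cut (inclusive) */
	cdata.params.to = &to_coord;		/* last unit to cut (inclusive) */
	cdata.params.from_key = &from_key;	/* disambiguates partial units */
	cdata.params.to_key = &to_key;		/* exclusive stop key */
	cdata.params.smallest_removed = NULL;	/* caller doesn't need it back */
	cdata.params.truncate = 0;		/* not part of a file truncate */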
3893+
3894+/* &carry_tree_op - operation to "carry" upward.
3895+
3896+   Description of an operation we want to "carry" to the upper level of
3897+   a tree: e.g., when we insert something and there is not enough space
3898+   we allocate a new node and "carry" the operation of inserting a
3899+   pointer to the new node to the upper level; on removal of an empty node,
3900+   we carry up the operation of removing the appropriate entry from the parent.
3901+
3902+   There are two types of carry ops: when adding or deleting a node, the
3903+   node at the parent level where the appropriate modification has to be
3904+   performed is known in advance. When shifting items between nodes
3905+   (split, merge), the delimiting key should be changed in the least common
3906+   parent of the nodes involved, which is not known in advance.
3907+
3908+   For operations of the first type we store in &carry_op a pointer to
3909+   the &carry_node at the parent level. For operations of the second
3910+   type we store the &carry_nodes of the parents of the left and right
3911+   nodes modified and keep track of them upward until they coincide.
3912+
3913+*/
3914+typedef struct carry_op {
3915+ /* pool linkage */
3916+ struct reiser4_pool_header header;
3917+ carry_opcode op;
3918+ /* node on which operation is to be performed:
3919+
3920+ for insert, paste: node where new item is to be inserted
3921+
3922+ for delete: node where pointer is to be deleted
3923+
3924+ for cut: node to cut from
3925+
3926+ for update: node where delimiting key is to be modified
3927+
3928+ for modify: parent of modified node
3929+
3930+ */
3931+ carry_node *node;
3932+ union {
3933+ struct {
3934+ /* (sub-)type of insertion/paste. Taken from
3935+ cop_insert_pos_type. */
3936+ __u8 type;
3937+ /* various operation flags. Taken from
3938+ cop_insert_flag. */
3939+ __u8 flags;
3940+ carry_insert_data *d;
3941+ carry_node *child;
3942+ znode *brother;
3943+ } insert, paste, extent;
3944+
3945+ struct {
3946+ int is_cut;
3947+ union {
3948+ carry_kill_data *kill;
3949+ carry_cut_data *cut;
3950+ } u;
3951+ } cut_or_kill;
3952+
3953+ struct {
3954+ carry_node *left;
3955+ } update;
3956+ struct {
3957+ /* changed child */
3958+ carry_node *child;
3959+ /* bitmask of changes. See &cop_modify_flag */
3960+ __u32 flag;
3961+ } modify;
3962+ struct {
3963+ /* flags to deletion operation. Are taken from
3964+ cop_delete_flag */
3965+ __u32 flags;
3966+ /* child to delete from parent. If this is
3967+ NULL, delete op->node. */
3968+ carry_node *child;
3969+ } delete;
3970+ struct {
3971+ /* various operation flags. Taken from
3972+ cop_insert_flag. */
3973+ __u32 flags;
3974+ flow_t *flow;
3975+ coord_t *insert_point;
3976+ reiser4_item_data *data;
3977+			/* flow insertion is limited by the number of new blocks
3978+			   added in that operation which receive no data other
3979+			   than part of the flow. This limit is set by the macro
3980+			   CARRY_FLOW_NEW_NODES_LIMIT. This field stores the number
3981+			   of nodes already added during one carry_flow */
3982+ int new_nodes;
3983+ } insert_flow;
3984+ } u;
3985+} carry_op;
3986+
3987+/* &carry_op_pool - preallocated pool of carry operations, and nodes */
3988+typedef struct carry_pool {
3989+ carry_op op[CARRIES_POOL_SIZE];
3990+ struct reiser4_pool op_pool;
3991+ carry_node node[NODES_LOCKED_POOL_SIZE];
3992+ struct reiser4_pool node_pool;
3993+} carry_pool;
3994+
3995+/* &carry_tree_level - carry process on given level
3996+
3997+ Description of balancing process on the given level.
3998+
3999+   No need for locking here, as carry_tree_level is essentially a
4000+   per-thread thing (for now).
4001+
4002+*/
4003+struct carry_level {
4004+ /* this level may be restarted */
4005+ __u32 restartable:1;
4006+ /* list of carry nodes on this level, ordered by key order */
4007+ struct list_head nodes;
4008+ struct list_head ops;
4009+ /* pool where new objects are allocated from */
4010+ carry_pool *pool;
4011+ int ops_num;
4012+ int nodes_num;
4013+ /* new root created on this level, if any */
4014+ znode *new_root;
4015+	/* This is set by the caller (insert_by_key(), reiser4_resize_item(), etc.)
4016+ when they want ->tracked to automagically wander to the node where
4017+ insertion point moved after insert or paste.
4018+ */
4019+ carry_track_type track_type;
4020+ /* lock handle supplied by user that we are tracking. See
4021+ above. */
4022+ lock_handle *tracked;
4023+};
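
A hedged sketch of how a carry() caller might arm the lock-handle tracking fields above; init_carry_pool()/init_carry_level() are declared later in this header, and the on-stack carry_level and the lh handle are for illustration only:

	carry_level doing;
	lock_handle lh;		/* caller's handle on the insertion node */

	init_carry_level(&doing, pool);		/* pool from init_carry_pool() */
	doing.track_type = CARRY_TRACK_CHANGE;	/* follow a moving insertion point */
	doing.tracked = &lh;	/* carry re-locks lh if the point moves */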
4024+
4025+/* information carry passes to plugin methods that may add new operations to
4026+ the @todo queue */
4027+struct carry_plugin_info {
4028+ carry_level *doing;
4029+ carry_level *todo;
4030+};
4031+
4032+int reiser4_carry(carry_level * doing, carry_level * done);
4033+
4034+carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
4035+ carry_node * reference);
4036+carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
4037+ carry_node * reference);
4038+
4039+extern carry_node *insert_carry_node(carry_level * doing,
4040+ carry_level * todo, const znode * node);
4041+
4042+extern carry_pool *init_carry_pool(int);
4043+extern void done_carry_pool(carry_pool * pool);
4044+
4045+extern void init_carry_level(carry_level * level, carry_pool * pool);
4046+
4047+extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
4048+ znode * node, int apply_to_parent);
4049+extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4050+ znode * node, int apply_to_parent_p);
4051+
4052+carry_node *add_new_znode(znode * brother, carry_node * reference,
4053+ carry_level * doing, carry_level * todo);
4054+
4055+carry_node *find_carry_node(carry_level * level, const znode * node);
4056+
4057+extern znode *reiser4_carry_real(const carry_node * node);
4058+
4059+/* helper macros to iterate over carry queues */
4060+
4061+#define carry_node_next( node ) \
4062+ list_entry((node)->header.level_linkage.next, carry_node, \
4063+ header.level_linkage)
4064+
4065+#define carry_node_prev( node ) \
4066+ list_entry((node)->header.level_linkage.prev, carry_node, \
4067+ header.level_linkage)
4068+
4069+#define carry_node_front( level ) \
4070+ list_entry((level)->nodes.next, carry_node, header.level_linkage)
4071+
4072+#define carry_node_back( level ) \
4073+ list_entry((level)->nodes.prev, carry_node, header.level_linkage)
4074+
4075+#define carry_node_end( level, node ) \
4076+ (&(level)->nodes == &(node)->header.level_linkage)
4077+
4078+/* macro to iterate over all operations in a @level */
4079+#define for_all_ops( level /* carry level (of type carry_level *) */, \
4080+ op /* pointer to carry operation, modified by loop (of \
4081+ * type carry_op *) */, \
4082+ tmp /* pointer to carry operation (of type carry_op *), \
4083+ * used to make iterator stable in the face of \
4084+ * deletions from the level */ ) \
4085+for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \
4086+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \
4087+ &op->header.level_linkage != &level->ops; \
4088+ op = tmp, \
4089+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
4090+
4091+#if 0
4092+for( op = ( carry_op * ) pool_level_list_front( &level -> ops ), \
4093+ tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ; \
4094+ ! pool_level_list_end( &level -> ops, &op -> header ) ; \
4095+ op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) )
4096+#endif
4097+
4098+/* macro to iterate over all nodes in a @level */
4099+#define for_all_nodes( level /* carry level (of type carry_level *) */, \
4100+ node /* pointer to carry node, modified by loop (of \
4101+ * type carry_node *) */, \
4102+ tmp /* pointer to carry node (of type carry_node *), \
4103+		     * used to make iterator stable in the face of \
4104+ * deletions from the level */ ) \
4105+for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \
4106+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \
4107+ &node->header.level_linkage != &level->nodes; \
4108+ node = tmp, \
4109+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
4110+
4111+#if 0
4112+for( node = carry_node_front( level ), \
4113+ tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ; \
4114+ node = tmp, tmp = carry_node_next( node ) )
4115+#endif
4116+
4117+/* macro to iterate over all nodes in a @level in reverse order
4118+
4119+   This is used because nodes are unlocked in the reverse order of locking */
4120+#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \
4121+ node /* pointer to carry node, modified by loop \
4122+ * (of type carry_node *) */, \
4123+ tmp /* pointer to carry node (of type carry_node \
4124+ * *), used to make iterator stable in the \
4125+ * face of deletions from the level */ ) \
4126+for( node = carry_node_back( level ), \
4127+ tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \
4128+ node = tmp, tmp = carry_node_prev( node ) )
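
As a hedged example of why this reverse iterator exists: the unlock path in carry.c walks a level backwards so nodes are released in the reverse of locking order. A sketch follows (unlock_carry_node() is static to carry.c, so its visibility here is assumed):

	static void unlock_carry_level_sketch(carry_level *level, int failure)
	{
		carry_node *node;
		carry_node *tmp;

		/* release in reverse order of locking */
		for_all_nodes_back(level, node, tmp)
			unlock_carry_node(level, node, failure);
	}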
4129+
4130+/* __FS_REISER4_CARRY_H__ */
4131+#endif
4132+
4133+/* Make Linus happy.
4134+ Local variables:
4135+ c-indentation-style: "K&R"
4136+ mode-name: "LC"
4137+ c-basic-offset: 8
4138+ tab-width: 8
4139+ fill-column: 120
4140+ scroll-step: 1
4141+ End:
4142+*/
4143diff -urN linux-2.6.24.orig/fs/reiser4/carry_ops.c linux-2.6.24/fs/reiser4/carry_ops.c
4144--- linux-2.6.24.orig/fs/reiser4/carry_ops.c 1970-01-01 03:00:00.000000000 +0300
4145+++ linux-2.6.24/fs/reiser4/carry_ops.c 2008-01-25 11:39:06.900198415 +0300
4146@@ -0,0 +1,2131 @@
4147+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4148+
4149+/* implementation of carry operations */
4150+
4151+#include "forward.h"
4152+#include "debug.h"
4153+#include "key.h"
4154+#include "coord.h"
4155+#include "plugin/item/item.h"
4156+#include "plugin/node/node.h"
4157+#include "jnode.h"
4158+#include "znode.h"
4159+#include "block_alloc.h"
4160+#include "tree_walk.h"
4161+#include "pool.h"
4162+#include "tree_mod.h"
4163+#include "carry.h"
4164+#include "carry_ops.h"
4165+#include "tree.h"
4166+#include "super.h"
4167+#include "reiser4.h"
4168+
4169+#include <linux/types.h>
4170+#include <linux/err.h>
4171+
4172+static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4173+ carry_level * doing, carry_level * todo,
4174+ unsigned int including_insert_coord_p);
4175+
4176+extern int lock_carry_node(carry_level * level, carry_node * node);
4177+extern int lock_carry_node_tail(carry_node * node);
4178+
4179+/* find left neighbor of a carry node
4180+
4181+ Look for left neighbor of @node and add it to the @doing queue. See
4182+ comments in the body.
4183+
4184+*/
4185+static carry_node *find_left_neighbor(carry_op * op /* node to find left
4186+ * neighbor of */ ,
4187+ carry_level * doing /* level to scan */ )
4188+{
4189+ int result;
4190+ carry_node *node;
4191+ carry_node *left;
4192+ int flags;
4193+ reiser4_tree *tree;
4194+
4195+ node = op->node;
4196+
4197+ tree = current_tree;
4198+ read_lock_tree(tree);
4199+ /* first, check whether left neighbor is already in a @doing queue */
4200+ if (reiser4_carry_real(node)->left != NULL) {
4201+ /* NOTE: there is locking subtlety here. Look into
4202+ * find_right_neighbor() for more info */
4203+ if (find_carry_node(doing,
4204+ reiser4_carry_real(node)->left) != NULL) {
4205+ read_unlock_tree(tree);
4206+ left = node;
4207+ do {
4208+ left = list_entry(left->header.level_linkage.prev,
4209+ carry_node, header.level_linkage);
4210+ assert("nikita-3408", !carry_node_end(doing,
4211+ left));
4212+ } while (reiser4_carry_real(left) ==
4213+ reiser4_carry_real(node));
4214+ return left;
4215+ }
4216+ }
4217+ read_unlock_tree(tree);
4218+
4219+ left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
4220+ if (IS_ERR(left))
4221+ return left;
4222+
4223+ left->node = node->node;
4224+ left->free = 1;
4225+
4226+ flags = GN_TRY_LOCK;
4227+	if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4228+ flags |= GN_NO_ALLOC;
4229+
4230+ /* then, feeling lucky, peek left neighbor in the cache. */
4231+ result = reiser4_get_left_neighbor(&left->lock_handle,
4232+ reiser4_carry_real(node),
4233+ ZNODE_WRITE_LOCK, flags);
4234+ if (result == 0) {
4235+ /* ok, node found and locked. */
4236+ result = lock_carry_node_tail(left);
4237+ if (result != 0)
4238+ left = ERR_PTR(result);
4239+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4240+ /* node is leftmost node in a tree, or neighbor wasn't in
4241+ cache, or there is an extent on the left. */
4242+ reiser4_pool_free(&doing->pool->node_pool, &left->header);
4243+ left = NULL;
4244+ } else if (doing->restartable) {
4245+ /* if left neighbor is locked, and level is restartable, add
4246+ new node to @doing and restart. */
4247+ assert("nikita-913", node->parent != 0);
4248+ assert("nikita-914", node->node != NULL);
4249+ left->left = 1;
4250+ left->free = 0;
4251+ left = ERR_PTR(-E_REPEAT);
4252+ } else {
4253+ /* left neighbor is locked, level cannot be restarted. Just
4254+ ignore left neighbor. */
4255+ reiser4_pool_free(&doing->pool->node_pool, &left->header);
4256+ left = NULL;
4257+ }
4258+ return left;
4259+}
4260+
4261+/* find right neighbor of a carry node
4262+
4263+ Look for right neighbor of @node and add it to the @doing queue. See
4264+ comments in the body.
4265+
4266+*/
4267+static carry_node *find_right_neighbor(carry_op * op /* node to find right
4268+ * neighbor of */ ,
4269+ carry_level * doing /* level to scan */ )
4270+{
4271+ int result;
4272+ carry_node *node;
4273+ carry_node *right;
4274+ lock_handle lh;
4275+ int flags;
4276+ reiser4_tree *tree;
4277+
4278+ init_lh(&lh);
4279+
4280+ node = op->node;
4281+
4282+ tree = current_tree;
4283+ read_lock_tree(tree);
4284+ /* first, check whether right neighbor is already in a @doing queue */
4285+ if (reiser4_carry_real(node)->right != NULL) {
4286+ /*
4287+		 * Tree lock is taken here anyway because, even if the _outcome_
4288+		 * of (find_carry_node() != NULL) doesn't depend on
4289+		 * concurrent updates to ->right, find_carry_node() cannot
4290+		 * work with a second argument of NULL. Hence, the following
4291+		 * comment is of historic importance only.
4292+ *
4293+ * Subtle:
4294+ *
4295+ * Q: why don't we need tree lock here, looking for the right
4296+ * neighbor?
4297+ *
4298+ * A: even if value of node->real_node->right were changed
4299+ * during find_carry_node() execution, outcome of execution
4300+ * wouldn't change, because (in short) other thread cannot add
4301+ * elements to the @doing, and if node->real_node->right
4302+ * already was in @doing, value of node->real_node->right
4303+ * couldn't change, because node cannot be inserted between
4304+ * locked neighbors.
4305+ */
4306+ if (find_carry_node(doing,
4307+ reiser4_carry_real(node)->right) != NULL) {
4308+ read_unlock_tree(tree);
4309+ /*
4310+ * What we are doing here (this is also applicable to
4311+ * the find_left_neighbor()).
4312+ *
4313+ * tree_walk.c code requires that insertion of a
4314+ * pointer to a child, modification of parent pointer
4315+ * in the child, and insertion of the child into
4316+ * sibling list are atomic (see
4317+ * plugin/item/internal.c:create_hook_internal()).
4318+ *
4319+ * carry allocates new node long before pointer to it
4320+ * is inserted into parent and, actually, long before
4321+ * parent is even known. Such allocated-but-orphaned
4322+ * nodes are only trackable through carry level lists.
4323+ *
4324+ * Situation that is handled here is following: @node
4325+ * has valid ->right pointer, but there is
4326+ * allocated-but-orphaned node in the carry queue that
4327+ * is logically between @node and @node->right. Here
4328+ * we are searching for it. Critical point is that
4329+ * this is only possible if @node->right is also in
4330+ * the carry queue (this is checked above), because
4331+ * this is the only way new orphaned node could be
4332+ * inserted between them (before inserting new node,
4333+ * make_space() first tries to shift to the right, so,
4334+ * right neighbor will be locked and queued).
4335+ *
4336+ */
4337+ right = node;
4338+ do {
4339+ right = list_entry(right->header.level_linkage.next,
4340+ carry_node, header.level_linkage);
4341+ assert("nikita-3408", !carry_node_end(doing,
4342+ right));
4343+ } while (reiser4_carry_real(right) ==
4344+ reiser4_carry_real(node));
4345+ return right;
4346+ }
4347+ }
4348+ read_unlock_tree(tree);
4349+
4350+ flags = GN_CAN_USE_UPPER_LEVELS;
4351+	if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
4352+		flags |= GN_NO_ALLOC;
4353+
4354+ /* then, try to lock right neighbor */
4355+ init_lh(&lh);
4356+ result = reiser4_get_right_neighbor(&lh,
4357+ reiser4_carry_real(node),
4358+ ZNODE_WRITE_LOCK, flags);
4359+ if (result == 0) {
4360+ /* ok, node found and locked. */
4361+ right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
4362+ if (!IS_ERR(right)) {
4363+ right->node = lh.node;
4364+ move_lh(&right->lock_handle, &lh);
4365+ right->free = 1;
4366+ result = lock_carry_node_tail(right);
4367+ if (result != 0)
4368+ right = ERR_PTR(result);
4369+ }
4370+ } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4371+ /* node is rightmost node in a tree, or neighbor wasn't in
4372+ cache, or there is an extent on the right. */
4373+ right = NULL;
4374+ } else
4375+ right = ERR_PTR(result);
4376+ done_lh(&lh);
4377+ return right;
4378+}
4379+
4380+/* how much free space in a @node is needed for @op
4381+
4382+ How much space in @node is required for completion of @op, where @op is
4383+ insert or paste operation.
4384+*/
4385+static unsigned int space_needed_for_op(znode * node /* znode data are
4386+ * inserted or
4387+ * pasted in */ ,
4388+ carry_op * op /* carry
4389+ operation */ )
4390+{
4391+ assert("nikita-919", op != NULL);
4392+
4393+ switch (op->op) {
4394+ default:
4395+ impossible("nikita-1701", "Wrong opcode");
4396+ case COP_INSERT:
4397+ return space_needed(node, NULL, op->u.insert.d->data, 1);
4398+ case COP_PASTE:
4399+ return space_needed(node, op->u.insert.d->coord,
4400+ op->u.insert.d->data, 0);
4401+ }
4402+}
4403+
4404+/* how much space in @node is required to insert or paste @data at
4405+ @coord. */
4406+unsigned int space_needed(const znode * node /* node data are inserted or
4407+ * pasted in */ ,
4408+ const coord_t * coord /* coord where data are
4409+ * inserted or pasted
4410+ * at */ ,
4411+ const reiser4_item_data * data /* data to insert or
4412+ * paste */ ,
4413+ int insertion /* non-0 is inserting, 0---paste */ )
4414+{
4415+ int result;
4416+ item_plugin *iplug;
4417+
4418+ assert("nikita-917", node != NULL);
4419+ assert("nikita-918", node_plugin_by_node(node) != NULL);
4420+ assert("vs-230", !insertion || (coord == NULL));
4421+
4422+ result = 0;
4423+ iplug = data->iplug;
4424+ if (iplug->b.estimate != NULL) {
4425+ /* ask item plugin how much space is needed to insert this
4426+ item */
4427+ result += iplug->b.estimate(insertion ? NULL : coord, data);
4428+ } else {
4429+ /* reasonable default */
4430+ result += data->length;
4431+ }
4432+ if (insertion) {
4433+ node_plugin *nplug;
4434+
4435+ nplug = node->nplug;
4436+ /* and add node overhead */
4437+ if (nplug->item_overhead != NULL) {
4438+ result += nplug->item_overhead(node, NULL);
4439+ }
4440+ }
4441+ return result;
4442+}
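
A worked example of the arithmetic above, with hypothetical numbers: if the item plugin has no ->estimate method and the node plugin charges, say, 8 bytes of item overhead, inserting a 100-byte item needs 100 + 8 = 108 bytes of free space, while pasting the same 100 bytes into an existing item needs only 100, since no new item header is created.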
4443+
4444+/* find &coord in parent where pointer to new child is to be stored. */
4445+static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to
4446+ * insert pointer to new
4447+ * child */ )
4448+{
4449+ int result;
4450+ znode *node;
4451+ znode *child;
4452+
4453+ assert("nikita-941", op != NULL);
4454+ assert("nikita-942", op->op == COP_INSERT);
4455+
4456+ node = reiser4_carry_real(op->node);
4457+ assert("nikita-943", node != NULL);
4458+ assert("nikita-944", node_plugin_by_node(node) != NULL);
4459+
4460+ child = reiser4_carry_real(op->u.insert.child);
4461+ result =
4462+ find_new_child_ptr(node, child, op->u.insert.brother,
4463+ op->u.insert.d->coord);
4464+
4465+ build_child_ptr_data(child, op->u.insert.d->data);
4466+ return result;
4467+}
4468+
4469+/* additional amount of free space in @node required to complete @op */
4470+static int free_space_shortage(znode * node /* node to check */ ,
4471+ carry_op * op /* operation being performed */ )
4472+{
4473+ assert("nikita-1061", node != NULL);
4474+ assert("nikita-1062", op != NULL);
4475+
4476+ switch (op->op) {
4477+ default:
4478+ impossible("nikita-1702", "Wrong opcode");
4479+ case COP_INSERT:
4480+ case COP_PASTE:
4481+ return space_needed_for_op(node, op) - znode_free_space(node);
4482+ case COP_EXTENT:
4483+		/* when inserting an extent, shift data around until the
4484+		   insertion point is at the very edge of the node. */
4485+ if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4486+ return +1;
4487+ else
4488+ return -1;
4489+ }
4490+}
4491+
4492+/* helper function: update node pointer in operation after insertion
4493+ point was probably shifted into @target. */
4494+static znode *sync_op(carry_op * op, carry_node * target)
4495+{
4496+ znode *insertion_node;
4497+
4498+ /* reget node from coord: shift might move insertion coord to
4499+ the neighbor */
4500+ insertion_node = op->u.insert.d->coord->node;
4501+ /* if insertion point was actually moved into new node,
4502+ update carry node pointer in operation. */
4503+ if (insertion_node != reiser4_carry_real(op->node)) {
4504+ op->node = target;
4505+ assert("nikita-2540",
4506+ reiser4_carry_real(target) == insertion_node);
4507+ }
4508+ assert("nikita-2541",
4509+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4510+ return insertion_node;
4511+}
4512+
4513+/*
4514+ * complete make_space() call: update tracked lock handle if necessary. See
4515+ * comments for fs/reiser4/carry.h:carry_track_type
4516+ */
4517+static int
4518+make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
4519+{
4520+ int result;
4521+ carry_track_type tracking;
4522+ znode *node;
4523+
4524+ tracking = doing->track_type;
4525+ node = op->u.insert.d->coord->node;
4526+
4527+ if (tracking == CARRY_TRACK_NODE ||
4528+ (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
4529+ /* inserting or pasting into node different from
4530+ original. Update lock handle supplied by caller. */
4531+ assert("nikita-1417", doing->tracked != NULL);
4532+ done_lh(doing->tracked);
4533+ init_lh(doing->tracked);
4534+ result = longterm_lock_znode(doing->tracked, node,
4535+ ZNODE_WRITE_LOCK,
4536+ ZNODE_LOCK_HIPRI);
4537+ } else
4538+ result = 0;
4539+ return result;
4540+}
4541+
4542+/* This is the insertion policy function. It shifts data to the left and right
4543+ neighbors of insertion coord and allocates new nodes until there is enough
4544+ free space to complete @op.
4545+
4546+ See comments in the body.
4547+
4548+ Assumes that the node format favors insertions at the right end of the node
4549+ as node40 does.
4550+
4551+   See carry_flow() for details about flow insertion.
4552+*/
4553+static int make_space(carry_op * op /* carry operation, insert or paste */ ,
4554+ carry_level * doing /* current carry queue */ ,
4555+ carry_level * todo /* carry queue on the parent level */ )
4556+{
4557+ znode *node;
4558+ int result;
4559+ int not_enough_space;
4560+ int blk_alloc;
4561+ znode *orig_node;
4562+ __u32 flags;
4563+
4564+ coord_t *coord;
4565+
4566+ assert("nikita-890", op != NULL);
4567+ assert("nikita-891", todo != NULL);
4568+ assert("nikita-892",
4569+ op->op == COP_INSERT ||
4570+ op->op == COP_PASTE || op->op == COP_EXTENT);
4571+ assert("nikita-1607",
4572+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4573+
4574+ flags = op->u.insert.flags;
4575+
4576+	/* NOTE: a new node can only be allocated after checking the left
4577+	 * and right neighbors. This is necessary for the proper work of
4578+	 * find_{left,right}_neighbor(). */
4579+ assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
4580+ flags & COPI_DONT_SHIFT_LEFT));
4581+ assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
4582+ flags & COPI_DONT_SHIFT_RIGHT));
4583+
4584+ coord = op->u.insert.d->coord;
4585+ orig_node = node = coord->node;
4586+
4587+ assert("nikita-908", node != NULL);
4588+ assert("nikita-909", node_plugin_by_node(node) != NULL);
4589+
4590+ result = 0;
4591+ /* If there is not enough space in a node, try to shift something to
4592+ the left neighbor. This is a bit tricky, as locking to the left is
4593+ low priority. This is handled by restart logic in carry().
4594+ */
4595+ not_enough_space = free_space_shortage(node, op);
4596+ if (not_enough_space <= 0)
4597+ /* it is possible that carry was called when there actually
4598+ was enough space in the node. For example, when inserting
4599+ leftmost item so that delimiting keys have to be updated.
4600+ */
4601+ return make_space_tail(op, doing, orig_node);
4602+ if (!(flags & COPI_DONT_SHIFT_LEFT)) {
4603+ carry_node *left;
4604+ /* make note in statistics of an attempt to move
4605+ something into the left neighbor */
4606+ left = find_left_neighbor(op, doing);
4607+ if (unlikely(IS_ERR(left))) {
4608+ if (PTR_ERR(left) == -E_REPEAT)
4609+ return -E_REPEAT;
4610+ else {
4611+ /* some error other than restart request
4612+ occurred. This shouldn't happen. Issue a
4613+				   warning and continue as if the left neighbor
4614+				   didn't exist.
4615+ */
4616+ warning("nikita-924",
4617+ "Error accessing left neighbor: %li",
4618+ PTR_ERR(left));
4619+ }
4620+ } else if (left != NULL) {
4621+
4622+ /* shift everything possible on the left of and
4623+ including insertion coord into the left neighbor */
4624+ result = carry_shift_data(LEFT_SIDE, coord,
4625+ reiser4_carry_real(left),
4626+ doing, todo,
4627+ flags & COPI_GO_LEFT);
4628+
4629+ /* reget node from coord: shift_left() might move
4630+ insertion coord to the left neighbor */
4631+ node = sync_op(op, left);
4632+
4633+ not_enough_space = free_space_shortage(node, op);
4634+ /* There is not enough free space in @node, but
4635+			   maybe there is enough free space in
4636+			   @left. Various balancing decisions are valid here.
4637+			   The same holds for shifting to the right.
4638+ */
4639+ }
4640+ }
4641+ /* If there still is not enough space, shift to the right */
4642+ if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
4643+ carry_node *right;
4644+
4645+ right = find_right_neighbor(op, doing);
4646+ if (IS_ERR(right)) {
4647+ warning("nikita-1065",
4648+ "Error accessing right neighbor: %li",
4649+ PTR_ERR(right));
4650+ } else if (right != NULL) {
4651+ /* node containing insertion point, and its right
4652+ neighbor node are write locked by now.
4653+
4654+ shift everything possible on the right of but
4655+ excluding insertion coord into the right neighbor
4656+ */
4657+ result = carry_shift_data(RIGHT_SIDE, coord,
4658+ reiser4_carry_real(right),
4659+ doing, todo,
4660+ flags & COPI_GO_RIGHT);
4661+ /* reget node from coord: shift_right() might move
4662+ insertion coord to the right neighbor */
4663+ node = sync_op(op, right);
4664+ not_enough_space = free_space_shortage(node, op);
4665+ }
4666+ }
4667+ /* If there is still not enough space, allocate new node(s).
4668+
4669+ We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
4670+ the carry operation flags (currently this is needed during flush
4671+ only).
4672+ */
4673+ for (blk_alloc = 0;
4674+ not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
4675+ !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
4676+ carry_node *fresh; /* new node we are allocating */
4677+ coord_t coord_shadow; /* remembered insertion point before
4678+ * shifting data into new node */
4679+ carry_node *node_shadow; /* remembered insertion node before
4680+ * shifting */
4681+ unsigned int gointo; /* whether insertion point should move
4682+ * into newly allocated node */
4683+
4684+ /* allocate new node on the right of @node. Znode and disk
4685+ fake block number for new node are allocated.
4686+
4687+ add_new_znode() posts carry operation COP_INSERT with
4688+ COPT_CHILD option to the parent level to add
4689+ pointer to newly created node to its parent.
4690+
4691+ Subtle point: if several new nodes are required to complete
4692+ insertion operation at this level, they will be inserted
4693+ into their parents in the order of creation, which means
4694+		   that @node will be a valid "cookie" at the time of insertion.
4695+
4696+ */
4697+ fresh = add_new_znode(node, op->node, doing, todo);
4698+ if (IS_ERR(fresh))
4699+ return PTR_ERR(fresh);
4700+
4701+ /* Try to shift into new node. */
4702+ result = lock_carry_node(doing, fresh);
4703+ zput(reiser4_carry_real(fresh));
4704+ if (result != 0) {
4705+ warning("nikita-947",
4706+ "Cannot lock new node: %i", result);
4707+ return result;
4708+ }
4709+
4710+ /* both nodes are write locked by now.
4711+
4712+ shift everything possible on the right of and
4713+ including insertion coord into the right neighbor.
4714+ */
4715+ coord_dup(&coord_shadow, op->u.insert.d->coord);
4716+ node_shadow = op->node;
4717+ /* move insertion point into newly created node if:
4718+
4719+ . insertion point is rightmost in the source node, or
4720+ . this is not the first node we are allocating in a row.
4721+ */
4722+ gointo =
4723+ (blk_alloc > 0) ||
4724+ coord_is_after_rightmost(op->u.insert.d->coord);
4725+
4726+ if (gointo &&
4727+ op->op == COP_PASTE &&
4728+ coord_is_existing_item(op->u.insert.d->coord) &&
4729+ is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
4730+ /* paste into solid (atomic) item, which can contain
4731+			   only one unit, so we need to shift it right, to where
4732+			   the insertion point is supposed to be */
4733+
4734+ assert("edward-1444", op->u.insert.d->data->iplug ==
4735+ item_plugin_by_id(STATIC_STAT_DATA_ID));
4736+ assert("edward-1445",
4737+ op->u.insert.d->data->length >
4738+ node_plugin_by_node(coord->node)->free_space
4739+ (coord->node));
4740+
4741+ op->u.insert.d->coord->between = BEFORE_UNIT;
4742+ }
4743+
4744+ result = carry_shift_data(RIGHT_SIDE, coord,
4745+ reiser4_carry_real(fresh),
4746+ doing, todo, gointo);
4747+ /* if insertion point was actually moved into new node,
4748+ update carry node pointer in operation. */
4749+ node = sync_op(op, fresh);
4750+ not_enough_space = free_space_shortage(node, op);
4751+ if ((not_enough_space > 0) && (node != coord_shadow.node)) {
4752+			/* there is not enough free space in the new node. Shift
4753+ insertion point back to the @shadow_node so that
4754+ next new node would be inserted between
4755+ @shadow_node and @fresh.
4756+ */
4757+ coord_normalize(&coord_shadow);
4758+ coord_dup(coord, &coord_shadow);
4759+ node = coord->node;
4760+ op->node = node_shadow;
4761+ if (1 || (flags & COPI_STEP_BACK)) {
4762+ /* still not enough space?! Maybe there is
4763+ enough space in the source node (i.e., node
4764+ data are moved from) now.
4765+ */
4766+ not_enough_space =
4767+ free_space_shortage(node, op);
4768+ }
4769+ }
4770+ }
4771+ if (not_enough_space > 0) {
4772+ if (!(flags & COPI_DONT_ALLOCATE))
4773+ warning("nikita-948", "Cannot insert new item");
4774+ result = -E_NODE_FULL;
4775+ }
4776+ assert("nikita-1622", ergo(result == 0,
4777+ reiser4_carry_real(op->node) == coord->node));
4778+ assert("nikita-2616", coord == op->u.insert.d->coord);
4779+ if (result == 0)
4780+ result = make_space_tail(op, doing, orig_node);
4781+ return result;
4782+}
4783+
4784+/* insert_paste_common() - common part of insert and paste operations
4785+
4786+ This function performs common part of COP_INSERT and COP_PASTE.
4787+
4788+ There are two ways in which insertion/paste can be requested:
4789+
4790+ . by directly supplying reiser4_item_data. In this case, op ->
4791+ u.insert.type is set to COPT_ITEM_DATA.
4792+
4793+   . by supplying a pointer to the child which is to be inserted into the
4794+   parent. In this case op -> u.insert.type == COPT_CHILD.
4795+
4796+   . by supplying the key of the new item/unit. This is currently only used
4797+   during extent insertion.
4798+
4799+   This is required because, when a new node is allocated, we don't know at what
4800+   position the pointer to it is to be stored in the parent. Actually, we don't
4801+   even know what its parent will be, because the parent can be re-balanced
4802+   concurrently and the new node re-parented, and because the parent can be full
4803+   and the pointer to the new node will go into some other node.
4804+
4805+ insert_paste_common() resolves pointer to child node into position in the
4806+ parent by calling find_new_child_coord(), that fills
4807+ reiser4_item_data. After this, insertion/paste proceeds uniformly.
4808+
4809+ Another complication is with finding free space during pasting. It may
4810+ happen that while shifting items to the neighbors and newly allocated
4811+ nodes, insertion coord can no longer be in the item we wanted to paste
4812+ into. At this point, paste becomes (morphs) into insert. Moreover free
4813+ space analysis has to be repeated, because amount of space required for
4814+ insertion is different from that of paste (item header overhead, etc).
4815+
4816+ This function "unifies" different insertion modes (by resolving child
4817+ pointer or key into insertion coord), and then calls make_space() to free
4818+ enough space in the node by shifting data to the left and right and by
4819+ allocating new nodes if necessary. Carry operation knows amount of space
4820+ required for its completion. After enough free space is obtained, caller of
4821+ this function (carry_{insert,paste,etc.}) performs actual insertion/paste
4822+ by calling item plugin method.
4823+
4824+*/
4825+static int insert_paste_common(carry_op * op /* carry operation being
4826+ * performed */ ,
4827+ carry_level * doing /* current carry level */ ,
4828+ carry_level * todo /* next carry level */ ,
4829+ carry_insert_data * cdata /* pointer to
4830+ * cdata */ ,
4831+ coord_t * coord /* insertion/paste coord */ ,
4832+ reiser4_item_data * data /* data to be
4833+ * inserted/pasted */ )
4834+{
4835+ assert("nikita-981", op != NULL);
4836+ assert("nikita-980", todo != NULL);
4837+ assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
4838+ || (op->op == COP_EXTENT));
4839+
4840+ if (op->u.insert.type == COPT_PASTE_RESTARTED) {
4841+ /* nothing to do. Fall through to make_space(). */
4842+ ;
4843+ } else if (op->u.insert.type == COPT_KEY) {
4844+ node_search_result intra_node;
4845+ znode *node;
4846+		/* The problem with doing batching at the lowest level is that
4847+		   operations here are given by coords where the modification is
4848+		   to be performed, and one modification can invalidate the coords
4849+		   of all following operations.
4850+
4851+ So, we are implementing yet another type for operation that
4852+ will use (the only) "locator" stable across shifting of
4853+ data between nodes, etc.: key (COPT_KEY).
4854+
4855+ This clause resolves key to the coord in the node.
4856+
4857+		   But the node can change as well. Probably some pieces have to be
4858+		   added to lock_carry_node() to lock a node by its key.
4859+
4860+ */
4861+ /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
4862+ if you need something else. */
4863+ op->u.insert.d->coord = coord;
4864+ node = reiser4_carry_real(op->node);
4865+ intra_node = node_plugin_by_node(node)->lookup
4866+ (node, op->u.insert.d->key, FIND_EXACT,
4867+ op->u.insert.d->coord);
4868+ if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
4869+ warning("nikita-1715", "Intra node lookup failure: %i",
4870+ intra_node);
4871+ return intra_node;
4872+ }
4873+ } else if (op->u.insert.type == COPT_CHILD) {
4874+		/* if we are asked to insert a pointer to the child into an
4875+		   internal node, first convert the pointer to the child into a
4876+		   coord within the parent node.
4877+ */
4878+ znode *child;
4879+ int result;
4880+
4881+ op->u.insert.d = cdata;
4882+ op->u.insert.d->coord = coord;
4883+ op->u.insert.d->data = data;
4884+ op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4885+ result = find_new_child_coord(op);
4886+ child = reiser4_carry_real(op->u.insert.child);
4887+ if (result != NS_NOT_FOUND) {
4888+ warning("nikita-993",
4889+ "Cannot find a place for child pointer: %i",
4890+ result);
4891+ return result;
4892+ }
4893+		/* This only happens when we did multiple insertions at
4894+		   the previous level, trying to insert a single item, and
4895+		   it so happened that insertion of pointers to all the new
4896+		   nodes before this one had already caused the parent node to
4897+		   split (maybe several times).
4898+
4899+		   I am going to come up with a better solution.
4900+
4901+ You are not expected to understand this.
4902+ -- v6root/usr/sys/ken/slp.c
4903+
4904+ Basically, what happens here is the following: carry came
4905+ to the parent level and is about to insert internal item
4906+ pointing to the child node that it just inserted in the
4907+ level below. Position where internal item is to be inserted
4908+ was found by find_new_child_coord() above, but node of the
4909+ current carry operation (that is, parent node of child
4910+ inserted on the previous level), was determined earlier in
4911+ the lock_carry_level/lock_carry_node. It could so happen
4912+		   that other carry operations already performed on the parent
4913+		   level have split the parent node, so that the insertion point
4914+		   moved into another node. Handle this by creating a new carry
4915+		   node for the insertion point if necessary.
4916+ */
4917+ if (reiser4_carry_real(op->node) !=
4918+ op->u.insert.d->coord->node) {
4919+ pool_ordering direction;
4920+ znode *z1;
4921+ znode *z2;
4922+ reiser4_key k1;
4923+ reiser4_key k2;
4924+
4925+ /*
4926+ * determine in what direction insertion point
4927+ * moved. Do this by comparing delimiting keys.
4928+ */
4929+ z1 = op->u.insert.d->coord->node;
4930+ z2 = reiser4_carry_real(op->node);
4931+ if (keyle(leftmost_key_in_node(z1, &k1),
4932+ leftmost_key_in_node(z2, &k2)))
4933+ /* insertion point moved to the left */
4934+ direction = POOLO_BEFORE;
4935+ else
4936+ /* insertion point moved to the right */
4937+ direction = POOLO_AFTER;
4938+
4939+ op->node = reiser4_add_carry_skip(doing,
4940+ direction, op->node);
4941+ if (IS_ERR(op->node))
4942+ return PTR_ERR(op->node);
4943+ op->node->node = op->u.insert.d->coord->node;
4944+ op->node->free = 1;
4945+ result = lock_carry_node(doing, op->node);
4946+ if (result != 0)
4947+ return result;
4948+ }
4949+
4950+ /*
4951+		 * set up the key of the item being inserted: we are inserting an
4952+		 * internal item and its key is (by the very definition of a
4953+		 * search tree) the leftmost key in the child node.
4954+ */
4955+ write_lock_dk(znode_get_tree(child));
4956+ op->u.insert.d->key = leftmost_key_in_node(child,
4957+ znode_get_ld_key(child));
4958+ write_unlock_dk(znode_get_tree(child));
4959+ op->u.insert.d->data->arg = op->u.insert.brother;
4960+ } else {
4961+ assert("vs-243", op->u.insert.d->coord != NULL);
4962+ op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4963+ }
4964+
4965+ /* find free space. */
4966+ return make_space(op, doing, todo);
4967+}
4968+
4969+/* handle carry COP_INSERT operation.
4970+
4971+ Insert new item into node. New item can be given in one of two ways:
4972+
4973+ - by passing &tree_coord and &reiser4_item_data as part of @op. This is
4974+ only applicable at the leaf/twig level.
4975+
4976+ - by passing a pointer to the child node that is to be inserted by
4977+ this operation.
4978+
4979+*/
4980+static int carry_insert(carry_op * op /* operation to perform */ ,
4981+ carry_level * doing /* queue of operations @op
4982+ * is part of */ ,
4983+ carry_level * todo /* queue where new operations
4984+ * are accumulated */ )
4985+{
4986+ znode *node;
4987+ carry_insert_data cdata;
4988+ coord_t coord;
4989+ reiser4_item_data data;
4990+ carry_plugin_info info;
4991+ int result;
4992+
4993+ assert("nikita-1036", op != NULL);
4994+ assert("nikita-1037", todo != NULL);
4995+ assert("nikita-1038", op->op == COP_INSERT);
4996+
4997+ coord_init_zero(&coord);
4998+
4999+ /* perform common functionality of insert and paste. */
5000+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5001+ if (result != 0)
5002+ return result;
5003+
5004+ node = op->u.insert.d->coord->node;
5005+ assert("nikita-1039", node != NULL);
5006+ assert("nikita-1040", node_plugin_by_node(node) != NULL);
5007+
5008+ assert("nikita-949",
5009+ space_needed_for_op(node, op) <= znode_free_space(node));
5010+
5011+ /* ask node layout to create new item. */
5012+ info.doing = doing;
5013+ info.todo = todo;
5014+ result = node_plugin_by_node(node)->create_item
5015+ (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
5016+ &info);
5017+ doing->restartable = 0;
5018+ znode_make_dirty(node);
5019+
5020+ return result;
5021+}
5022+
5023+/*
5024+ * Flow insertion code. COP_INSERT_FLOW is a special tree operation that is
5025+ * supplied with a "flow" (that is, a stream of data) and inserts it into the
5026+ * tree by slicing it into multiple items.
5027+ */
5028+
5029+#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
5030+#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
5031+#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
5032+
5033+static size_t item_data_overhead(carry_op * op)
5034+{
5035+ if (flow_insert_data(op)->iplug->b.estimate == NULL)
5036+ return 0;
5037+ return (flow_insert_data(op)->iplug->b.
5038+ estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
5039+ flow_insert_data(op)->length);
5040+}
5041+
5042+/* FIXME-VS: this is called several times during one make_flow_for_insertion
5043+ and it will always return the same result. Some optimization could be made
5044+ by calculating this value once at the beginning and passing it around. That
5045+ would reduce some flexibility for future changes.
5046+*/
5047+static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
5048+static size_t flow_insertion_overhead(carry_op * op)
5049+{
5050+ znode *node;
5051+ size_t insertion_overhead;
5052+
5053+ node = flow_insert_point(op)->node;
5054+ insertion_overhead = 0;
5055+ if (node->nplug->item_overhead &&
5056+ !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
5057+ flow_insert_data(op)))
5058+ insertion_overhead =
5059+ node->nplug->item_overhead(node, NULL) +
5060+ item_data_overhead(op);
5061+ return insertion_overhead;
5062+}
5063+
5064+/* how many bytes of the flow fit into the node */
5065+static int what_can_fit_into_node(carry_op * op)
5066+{
5067+ size_t free, overhead;
5068+
5069+ overhead = flow_insertion_overhead(op);
5070+ free = znode_free_space(flow_insert_point(op)->node);
5071+ if (free <= overhead)
5072+ return 0;
5073+ free -= overhead;
5074+	/* FIXME: flow->length is loff_t only to avoid overflow in case of an expanding truncate */
5075+ if (free < op->u.insert_flow.flow->length)
5076+ return free;
5077+ return (int)op->u.insert_flow.flow->length;
5078+}
5079+
5080+/* in make_space_for_flow_insertion we need to check either whether the whole
5081+   flow fits into a node or whether a minimal fraction of the flow does */
5082+static int enough_space_for_whole_flow(carry_op * op)
5083+{
5084+ return (unsigned)what_can_fit_into_node(op) ==
5085+ op->u.insert_flow.flow->length;
5086+}
5087+
5088+#define MIN_FLOW_FRACTION 1
5089+static int enough_space_for_min_flow_fraction(carry_op * op)
5090+{
5091+ assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5092+
5093+ return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
5094+}
5095+
5096+/* this returns 0 if the left neighbor was obtained successfully, everything
5097+   up to and including the insertion point was shifted into it, and the left
5098+   neighbor still has some free space for a minimal fraction of the flow */
5099+static int
5100+make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5101+{
5102+ carry_node *left;
5103+ znode *orig;
5104+
5105+ left = find_left_neighbor(op, doing);
5106+ if (unlikely(IS_ERR(left))) {
5107+ warning("vs-899",
5108+ "make_space_by_shift_left: "
5109+ "error accessing left neighbor: %li", PTR_ERR(left));
5110+ return 1;
5111+ }
5112+ if (left == NULL)
5113+ /* left neighbor either does not exist or is unformatted
5114+ node */
5115+ return 1;
5116+
5117+ orig = flow_insert_point(op)->node;
5118+	/* try to shift the content of node @orig from its head up to and
5119+	   including the insertion point into the left neighbor */
5120+ carry_shift_data(LEFT_SIDE, flow_insert_point(op),
5121+ reiser4_carry_real(left), doing, todo,
5122+ 1 /* including insert point */);
5123+ if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
5124+ /* insertion point did not move */
5125+ return 1;
5126+ }
5127+
5128+ /* insertion point is set after last item in the node */
5129+ assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5130+
5131+ if (!enough_space_for_min_flow_fraction(op)) {
5132+ /* insertion point node does not have enough free space to put
5133+ even minimal portion of flow into it, therefore, move
5134+ insertion point back to orig node (before first item) */
5135+ coord_init_before_first_item(flow_insert_point(op), orig);
5136+ return 1;
5137+ }
5138+
5139+ /* part of flow is to be written to the end of node */
5140+ op->node = left;
5141+ return 0;
5142+}
5143+
5144+/* this returns 0 if the right neighbor was obtained successfully, everything
5145+   to the right of the insertion point was shifted to it, and the node got
5146+   enough free space for a minimal fraction of the flow */
5147+static int
5148+make_space_by_shift_right(carry_op * op, carry_level * doing,
5149+ carry_level * todo)
5150+{
5151+ carry_node *right;
5152+
5153+ right = find_right_neighbor(op, doing);
5154+ if (unlikely(IS_ERR(right))) {
5155+ warning("nikita-1065", "shift_right_excluding_insert_point: "
5156+ "error accessing right neighbor: %li", PTR_ERR(right));
5157+ return 1;
5158+ }
5159+ if (right) {
5160+ /* shift everything possible on the right of but excluding
5161+ insertion coord into the right neighbor */
5162+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5163+ reiser4_carry_real(right), doing, todo,
5164+ 0 /* not including insert point */);
5165+ } else {
5166+ /* right neighbor either does not exist or is unformatted
5167+ node */
5168+ ;
5169+ }
5170+ if (coord_is_after_rightmost(flow_insert_point(op))) {
5171+ if (enough_space_for_min_flow_fraction(op)) {
5172+ /* part of flow is to be written to the end of node */
5173+ return 0;
5174+ }
5175+ }
5176+
5177+ /* new node is to be added if insert point node did not get enough
5178+ space for whole flow */
5179+ return 1;
5180+}
5181+
5182+/* this returns 0 when insert coord is set at the node end and fraction of flow
5183+ fits into that node */
5184+static int
5185+make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5186+{
5187+ int result;
5188+ znode *node;
5189+ carry_node *new;
5190+
5191+ node = flow_insert_point(op)->node;
5192+
5193+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5194+ return RETERR(-E_NODE_FULL);
5195+ /* add new node after insert point node */
5196+ new = add_new_znode(node, op->node, doing, todo);
5197+ if (unlikely(IS_ERR(new))) {
5198+ return PTR_ERR(new);
5199+ }
5200+ result = lock_carry_node(doing, new);
5201+ zput(reiser4_carry_real(new));
5202+ if (unlikely(result)) {
5203+ return result;
5204+ }
5205+ op->u.insert_flow.new_nodes++;
5206+ if (!coord_is_after_rightmost(flow_insert_point(op))) {
5207+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5208+ reiser4_carry_real(new), doing, todo,
5209+ 0 /* not including insert point */);
5210+ assert("vs-901",
5211+ coord_is_after_rightmost(flow_insert_point(op)));
5212+
5213+ if (enough_space_for_min_flow_fraction(op)) {
5214+ return 0;
5215+ }
5216+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5217+ return RETERR(-E_NODE_FULL);
5218+
5219+ /* add one more new node */
5220+ new = add_new_znode(node, op->node, doing, todo);
5221+ if (unlikely(IS_ERR(new))) {
5222+ return PTR_ERR(new);
5223+ }
5224+ result = lock_carry_node(doing, new);
5225+ zput(reiser4_carry_real(new));
5226+ if (unlikely(result)) {
5227+ return result;
5228+ }
5229+ op->u.insert_flow.new_nodes++;
5230+ }
5231+
5232+ /* move insertion point to new node */
5233+ coord_init_before_first_item(flow_insert_point(op),
5234+ reiser4_carry_real(new));
5235+ op->node = new;
5236+ return 0;
5237+}
5238+
5239+static int
5240+make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5241+ carry_level * todo)
5242+{
5243+ __u32 flags = op->u.insert_flow.flags;
5244+
5245+ if (enough_space_for_whole_flow(op)) {
5246+ /* whole flow fits into insert point node */
5247+ return 0;
5248+ }
5249+
5250+ if (!(flags & COPI_DONT_SHIFT_LEFT)
5251+ && (make_space_by_shift_left(op, doing, todo) == 0)) {
5252+ /* insert point is shifted to left neighbor of original insert
5253+ point node and is set after last unit in that node. It has
5254+ enough space to fit at least minimal fraction of flow. */
5255+ return 0;
5256+ }
5257+
5258+ if (enough_space_for_whole_flow(op)) {
5259+ /* whole flow fits into insert point node */
5260+ return 0;
5261+ }
5262+
5263+ if (!(flags & COPI_DONT_SHIFT_RIGHT)
5264+ && (make_space_by_shift_right(op, doing, todo) == 0)) {
5265+ /* insert point is still set to the same node, but there is
5266+ nothing to the right of insert point. */
5267+ return 0;
5268+ }
5269+
5270+ if (enough_space_for_whole_flow(op)) {
5271+ /* whole flow fits into insert point node */
5272+ return 0;
5273+ }
5274+
5275+ return make_space_by_new_nodes(op, doing, todo);
5276+}
5277+
5278+/* implements COP_INSERT_FLOW operation */
5279+static int
5280+carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5281+{
5282+ int result;
5283+ flow_t *f;
5284+ coord_t *insert_point;
5285+ node_plugin *nplug;
5286+ carry_plugin_info info;
5287+ znode *orig_node;
5288+ lock_handle *orig_lh;
5289+
5290+ f = op->u.insert_flow.flow;
5291+ result = 0;
5292+
5293+ /* carry system needs this to work */
5294+ info.doing = doing;
5295+ info.todo = todo;
5296+
5297+ orig_node = flow_insert_point(op)->node;
5298+ orig_lh = doing->tracked;
5299+
5300+ while (f->length) {
5301+ result = make_space_for_flow_insertion(op, doing, todo);
5302+ if (result)
5303+ break;
5304+
5305+ insert_point = flow_insert_point(op);
5306+ nplug = node_plugin_by_node(insert_point->node);
5307+
5308+ /* compose item data for insertion/pasting */
5309+ flow_insert_data(op)->data = f->data;
5310+ flow_insert_data(op)->length = what_can_fit_into_node(op);
5311+
5312+ if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5313+ /* insert point is set to item of file we are writing to and we have to append to it */
5314+ assert("vs-903", insert_point->between == AFTER_UNIT);
5315+ nplug->change_item_size(insert_point,
5316+ flow_insert_data(op)->length);
5317+ flow_insert_data(op)->iplug->b.paste(insert_point,
5318+ flow_insert_data
5319+ (op), &info);
5320+ } else {
5321+ /* new item must be inserted */
5322+ pos_in_node_t new_pos;
5323+ flow_insert_data(op)->length += item_data_overhead(op);
5324+
5325+ /* FIXME-VS: this is because node40_create_item changes
5326+ insert_point for obscure reasons */
5327+ switch (insert_point->between) {
5328+ case AFTER_ITEM:
5329+ new_pos = insert_point->item_pos + 1;
5330+ break;
5331+ case EMPTY_NODE:
5332+ new_pos = 0;
5333+ break;
5334+ case BEFORE_ITEM:
5335+ assert("vs-905", insert_point->item_pos == 0);
5336+ new_pos = 0;
5337+ break;
5338+ default:
5339+ impossible("vs-906",
5340+ "carry_insert_flow: invalid coord");
5341+ new_pos = 0;
5342+ break;
5343+ }
5344+
5345+ nplug->create_item(insert_point, &f->key,
5346+ flow_insert_data(op), &info);
5347+ coord_set_item_pos(insert_point, new_pos);
5348+ }
5349+ coord_init_after_item_end(insert_point);
5350+ doing->restartable = 0;
5351+ znode_make_dirty(insert_point->node);
5352+
5353+ move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5354+ }
5355+
5356+ if (orig_node != flow_insert_point(op)->node) {
5357+ /* move lock to new insert point */
5358+ done_lh(orig_lh);
5359+ init_lh(orig_lh);
5360+ result =
5361+ longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5362+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5363+ }
5364+
5365+ return result;
5366+}
5367+
5368+/* implements COP_DELETE operation
5369+
5370+   Remove pointer to @op -> u.delete.child from its parent.
5371+
5372+   This function also handles killing of the tree root if the last pointer
5373+   from it was removed. This is complicated by our handling of the "twig"
5374+   level: a root at the twig level is never killed.
5375+
5376+*/
5377+static int carry_delete(carry_op * op /* operation to be performed */ ,
5378+ carry_level * doing UNUSED_ARG /* current carry
5379+ * level */ ,
5380+ carry_level * todo /* next carry level */ )
5381+{
5382+ int result;
5383+ coord_t coord;
5384+ coord_t coord2;
5385+ znode *parent;
5386+ znode *child;
5387+ carry_plugin_info info;
5388+ reiser4_tree *tree;
5389+
5390+ /*
5391+ * This operation is called to delete internal item pointing to the
5392+ * child node that was removed by carry from the tree on the previous
5393+ * tree level.
5394+ */
5395+
5396+ assert("nikita-893", op != NULL);
5397+ assert("nikita-894", todo != NULL);
5398+ assert("nikita-895", op->op == COP_DELETE);
5399+
5400+ coord_init_zero(&coord);
5401+ coord_init_zero(&coord2);
5402+
5403+ parent = reiser4_carry_real(op->node);
5404+ child = op->u.delete.child ?
5405+ reiser4_carry_real(op->u.delete.child) : op->node->node;
5406+ tree = znode_get_tree(child);
5407+ read_lock_tree(tree);
5408+
5409+ /*
5410+ * @parent was determined when carry entered parent level
5411+ * (lock_carry_level/lock_carry_node). Since then, actual parent of
5412+ * @child node could change due to other carry operations performed on
5413+ * the parent level. Check for this.
5414+ */
5415+
5416+ if (znode_parent(child) != parent) {
5417+ /* NOTE-NIKITA add stat counter for this. */
5418+ parent = znode_parent(child);
5419+ assert("nikita-2581", find_carry_node(doing, parent));
5420+ }
5421+ read_unlock_tree(tree);
5422+
5423+ assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5424+
5425+	/* Twig level horrors: the tree should be of height at least 2. So,
5426+	   the last pointer from the root at the twig level is preserved even
5427+	   if the child is empty. This is ugly, but that is how it was architected.
5428+ */
5429+
5430+ if (znode_is_root(parent) &&
5431+ znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5432+ node_num_items(parent) == 1) {
5433+ /* Delimiting key manipulations. */
5434+ write_lock_dk(tree);
5435+ znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
5436+ znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
5437+ ZF_SET(child, JNODE_DKSET);
5438+ write_unlock_dk(tree);
5439+
5440+ /* @child escaped imminent death! */
5441+ ZF_CLR(child, JNODE_HEARD_BANSHEE);
5442+ return 0;
5443+ }
5444+
5445+ /* convert child pointer to the coord_t */
5446+ result = find_child_ptr(parent, child, &coord);
5447+ if (result != NS_FOUND) {
5448+ warning("nikita-994", "Cannot find child pointer: %i", result);
5449+ print_coord_content("coord", &coord);
5450+ return result;
5451+ }
5452+
5453+ coord_dup(&coord2, &coord);
5454+ info.doing = doing;
5455+ info.todo = todo;
5456+ {
5457+ /*
5458+ * Actually kill internal item: prepare structure with
5459+ * arguments for ->cut_and_kill() method...
5460+ */
5461+
5462+ struct carry_kill_data kdata;
5463+ kdata.params.from = &coord;
5464+ kdata.params.to = &coord2;
5465+ kdata.params.from_key = NULL;
5466+ kdata.params.to_key = NULL;
5467+ kdata.params.smallest_removed = NULL;
5468+ kdata.params.truncate = 1;
5469+ kdata.flags = op->u.delete.flags;
5470+ kdata.inode = NULL;
5471+ kdata.left = NULL;
5472+ kdata.right = NULL;
5473+ kdata.buf = NULL;
5474+ /* ... and call it. */
5475+ result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5476+ &info);
5477+ }
5478+ doing->restartable = 0;
5479+
5480+ /* check whether root should be killed violently */
5481+ if (znode_is_root(parent) &&
5482+ /* don't kill roots at and lower than twig level */
5483+ znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5484+ node_num_items(parent) == 1) {
5485+ result = reiser4_kill_tree_root(coord.node);
5486+ }
5487+
5488+ return result < 0 ? : 0;
5489+}
5490+
5491+/* implements COP_CUT operation
5492+
5493+   Cuts part or the whole content of a node.
5494+
5495+*/
5496+static int carry_cut(carry_op * op /* operation to be performed */ ,
5497+ carry_level * doing /* current carry level */ ,
5498+ carry_level * todo /* next carry level */ )
5499+{
5500+ int result;
5501+ carry_plugin_info info;
5502+ node_plugin *nplug;
5503+
5504+ assert("nikita-896", op != NULL);
5505+ assert("nikita-897", todo != NULL);
5506+ assert("nikita-898", op->op == COP_CUT);
5507+
5508+ info.doing = doing;
5509+ info.todo = todo;
5510+
5511+ nplug = node_plugin_by_node(reiser4_carry_real(op->node));
5512+ if (op->u.cut_or_kill.is_cut)
5513+ result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5514+ else
5515+ result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5516+
5517+ doing->restartable = 0;
5518+ return result < 0 ? : 0;
5519+}
5520+
5521+/* helper function for carry_paste(): returns true if @op can be continued as
5522+ paste */
5523+static int
5524+can_paste(coord_t * icoord, const reiser4_key * key,
5525+ const reiser4_item_data * data)
5526+{
5527+ coord_t circa;
5528+ item_plugin *new_iplug;
5529+ item_plugin *old_iplug;
5530+ int result = 0; /* to keep gcc shut */
5531+
5532+ assert("", icoord->between != AT_UNIT);
5533+
5534+ /* obviously, one cannot paste when node is empty---there is nothing
5535+ to paste into. */
5536+ if (node_is_empty(icoord->node))
5537+ return 0;
5538+ /* if insertion point is at the middle of the item, then paste */
5539+ if (!coord_is_between_items(icoord))
5540+ return 1;
5541+ coord_dup(&circa, icoord);
5542+ circa.between = AT_UNIT;
5543+
5544+ old_iplug = item_plugin_by_coord(&circa);
5545+ new_iplug = data->iplug;
5546+
5547+ /* check whether we can paste to the item @icoord is "at" when we
5548+ ignore ->between field */
5549+ if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
5550+ result = 1;
5551+ } else if (icoord->between == BEFORE_UNIT
5552+ || icoord->between == BEFORE_ITEM) {
5553+ /* otherwise, try to glue to the item at the left, if any */
5554+ coord_dup(&circa, icoord);
5555+ if (coord_set_to_left(&circa)) {
5556+ result = 0;
5557+ coord_init_before_item(icoord);
5558+ } else {
5559+ old_iplug = item_plugin_by_coord(&circa);
5560+ result = (old_iplug == new_iplug)
5561+ && item_can_contain_key(icoord, key, data);
5562+ if (result) {
5563+ coord_dup(icoord, &circa);
5564+ icoord->between = AFTER_UNIT;
5565+ }
5566+ }
5567+ } else if (icoord->between == AFTER_UNIT
5568+ || icoord->between == AFTER_ITEM) {
5569+ coord_dup(&circa, icoord);
5570+ /* otherwise, try to glue to the item at the right, if any */
5571+ if (coord_set_to_right(&circa)) {
5572+ result = 0;
5573+ coord_init_after_item(icoord);
5574+ } else {
5575+ int (*cck) (const coord_t *, const reiser4_key *,
5576+ const reiser4_item_data *);
5577+
5578+ old_iplug = item_plugin_by_coord(&circa);
5579+
5580+ cck = old_iplug->b.can_contain_key;
5581+ if (cck == NULL)
5582+ /* item doesn't define ->can_contain_key
5583+ method? So it is not expandable. */
5584+ result = 0;
5585+ else {
5586+ result = (old_iplug == new_iplug)
5587+ && cck(&circa /*icoord */ , key, data);
5588+ if (result) {
5589+ coord_dup(icoord, &circa);
5590+ icoord->between = BEFORE_UNIT;
5591+ }
5592+ }
5593+ }
5594+ } else
5595+ impossible("nikita-2513", "Nothing works");
5596+ if (result) {
5597+ if (icoord->between == BEFORE_ITEM) {
5598+ assert("vs-912", icoord->unit_pos == 0);
5599+ icoord->between = BEFORE_UNIT;
5600+ } else if (icoord->between == AFTER_ITEM) {
5601+ coord_init_after_item_end(icoord);
5602+ }
5603+ }
5604+ return result;
5605+}
5606+
5607+/* implements COP_PASTE operation
5608+
5609+   Paste data into an existing item. This is complicated by the fact that
5610+   after we shifted something to the left or right neighbors trying to free
5611+   some space, the item we were supposed to paste into can be in a different
5612+   node than the insertion coord. If so, we are no longer doing a paste, but
5613+   an insert. See comments in insert_paste_common().
5614+
5615+*/
5616+static int carry_paste(carry_op * op /* operation to be performed */ ,
5617+ carry_level * doing UNUSED_ARG /* current carry
5618+ * level */ ,
5619+ carry_level * todo /* next carry level */ )
5620+{
5621+ znode *node;
5622+ carry_insert_data cdata;
5623+ coord_t dcoord;
5624+ reiser4_item_data data;
5625+ int result;
5626+ int real_size;
5627+ item_plugin *iplug;
5628+ carry_plugin_info info;
5629+ coord_t *coord;
5630+
5631+ assert("nikita-982", op != NULL);
5632+ assert("nikita-983", todo != NULL);
5633+ assert("nikita-984", op->op == COP_PASTE);
5634+
5635+ coord_init_zero(&dcoord);
5636+
5637+ result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
5638+ if (result != 0)
5639+ return result;
5640+
5641+ coord = op->u.insert.d->coord;
5642+
5643+	/* handle the case when op -> u.insert.coord doesn't point to an item
5644+	   of the required type. Restart as insert. */
5645+ if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
5646+ op->op = COP_INSERT;
5647+ op->u.insert.type = COPT_PASTE_RESTARTED;
5648+ result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
5649+
5650+ return result;
5651+ }
5652+
5653+ node = coord->node;
5654+ iplug = item_plugin_by_coord(coord);
5655+ assert("nikita-992", iplug != NULL);
5656+
5657+ assert("nikita-985", node != NULL);
5658+ assert("nikita-986", node_plugin_by_node(node) != NULL);
5659+
5660+ assert("nikita-987",
5661+ space_needed_for_op(node, op) <= znode_free_space(node));
5662+
5663+ assert("nikita-1286", coord_is_existing_item(coord));
5664+
5665+ /*
5666+	 * if the item is expanded as a result of this operation, we should
5667+	 * first change the item size, then call the ->b.paste item method. If
5668+	 * the item is shrunk, it should be done the other way around: first
5669+	 * call the ->b.paste method, then reduce the item size.
5670+ */
5671+
5672+ real_size = space_needed_for_op(node, op);
5673+ if (real_size > 0)
5674+ node->nplug->change_item_size(coord, real_size);
5675+
5676+ doing->restartable = 0;
5677+ info.doing = doing;
5678+ info.todo = todo;
5679+
5680+ result = iplug->b.paste(coord, op->u.insert.d->data, &info);
5681+
5682+ if (real_size < 0)
5683+ node->nplug->change_item_size(coord, real_size);
5684+
5685+ /* if we pasted at the beginning of the item, update item's key. */
5686+ if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
5687+ node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
5688+
5689+ znode_make_dirty(node);
5690+ return result;
5691+}
5692+
5693+/* handle carry COP_EXTENT operation. */
5694+static int carry_extent(carry_op * op /* operation to perform */ ,
5695+ carry_level * doing /* queue of operations @op
5696+ * is part of */ ,
5697+ carry_level * todo /* queue where new operations
5698+ * are accumulated */ )
5699+{
5700+ znode *node;
5701+ carry_insert_data cdata;
5702+ coord_t coord;
5703+ reiser4_item_data data;
5704+ carry_op *delete_dummy;
5705+ carry_op *insert_extent;
5706+ int result;
5707+ carry_plugin_info info;
5708+
5709+ assert("nikita-1751", op != NULL);
5710+ assert("nikita-1752", todo != NULL);
5711+ assert("nikita-1753", op->op == COP_EXTENT);
5712+
5713+ /* extent insertion overview:
5714+
5715+	   extents live on the TWIG LEVEL, which is the level one above the
5716+	   leaf one. This complicates extent insertion logic somewhat: it may
5717+	   happen (and is going to happen all the time) that in logical key
5718+	   ordering an extent has to be placed between items I1 and I2, located
5719+	   at the leaf level, but I1 and I2 are in the same formatted leaf
5720+	   node N1. To insert an extent one has to
5721+
5722+	   (1) reach node N1 and shift data between N1, its neighbors and
5723+	   possibly newly allocated nodes until I1 and I2 fall into different
5724+	   nodes. Since I1 and I2 are still neighboring items in logical key
5725+	   order, they will necessarily be the utmost items in their respective
5726+	   nodes.
5727+
5728+	   (2) After this, the new extent item is inserted into a node on the
5729+	   twig level.
5730+
5731+	   Fortunately this process can reuse almost all code from the standard
5732+	   insertion procedure (viz. make_space() and insert_paste_common()),
5733+	   due to the following observation: make_space() only shifts data up
5734+	   to and excluding or including the insertion point. It never
5735+	   "over-moves" through the insertion point. Thus, one can use
5736+	   make_space() to perform step (1). All that is required for this is
5737+	   to instruct free_space_shortage() to keep make_space() shifting data
5738+	   until the insertion point is at the node border.
5739+
5740+ */
5741+
5742+ /* perform common functionality of insert and paste. */
5743+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5744+ if (result != 0)
5745+ return result;
5746+
5747+ node = op->u.extent.d->coord->node;
5748+ assert("nikita-1754", node != NULL);
5749+ assert("nikita-1755", node_plugin_by_node(node) != NULL);
5750+ assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
5751+
5752+ /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
5753+ extent fits between items. */
5754+
5755+ info.doing = doing;
5756+ info.todo = todo;
5757+
5758+	/* there is another complication due to the placement of extents on
5759+	   the twig level: extents are "rigid" in the sense that the key-range
5760+	   occupied by an extent cannot grow indefinitely to the right as it
5761+	   can for the formatted leaf nodes. Because of this, when search finds
5762+	   two adjacent extents on the twig level, it has to "drill" to the
5763+	   leaf level, creating a new node. Here we are removing this node.
5764+ */
5765+ if (node_is_empty(node)) {
5766+ delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
5767+ if (IS_ERR(delete_dummy))
5768+ return PTR_ERR(delete_dummy);
5769+ delete_dummy->u.delete.child = NULL;
5770+ delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
5771+ ZF_SET(node, JNODE_HEARD_BANSHEE);
5772+ }
5773+
5774+ /* proceed with inserting extent item into parent. We are definitely
5775+ inserting rather than pasting if we get that far. */
5776+ insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
5777+ if (IS_ERR(insert_extent))
5778+ /* @delete_dummy will be automatically destroyed on the level
5779+ exiting */
5780+ return PTR_ERR(insert_extent);
5781+ /* NOTE-NIKITA insertion by key is simplest option here. Another
5782+ possibility is to insert on the left or right of already existing
5783+ item.
5784+ */
5785+ insert_extent->u.insert.type = COPT_KEY;
5786+ insert_extent->u.insert.d = op->u.extent.d;
5787+ assert("nikita-1719", op->u.extent.d->key != NULL);
5788+ insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
5789+ insert_extent->u.insert.flags =
5790+ znode_get_tree(node)->carry.new_extent_flags;
5791+
5792+ /*
5793+ * if carry was asked to track lock handle we should actually track
5794+ * lock handle on the twig node rather than on the leaf where
5795+ * operation was started from. Transfer tracked lock handle.
5796+ */
5797+ if (doing->track_type) {
5798+ assert("nikita-3242", doing->tracked != NULL);
5799+ assert("nikita-3244", todo->tracked == NULL);
5800+ todo->tracked = doing->tracked;
5801+ todo->track_type = CARRY_TRACK_NODE;
5802+ doing->tracked = NULL;
5803+ doing->track_type = 0;
5804+ }
5805+
5806+ return 0;
5807+}
5808+
5809+/* update key in @parent between pointers to @left and @right.
5810+
5811+ Find coords of @left and @right and update delimiting key between them.
5812+ This is helper function called by carry_update(). Finds position of
5813+ internal item involved. Updates item key. Updates delimiting keys of child
5814+ nodes involved.
5815+*/
5816+static int update_delimiting_key(znode * parent /* node key is updated
5817+ * in */ ,
5818+ znode * left /* child of @parent */ ,
5819+ znode * right /* child of @parent */ ,
5820+ carry_level * doing /* current carry
5821+ * level */ ,
5822+ carry_level * todo /* parent carry
5823+ * level */ ,
5824+ const char **error_msg /* place to
5825+ * store error
5826+ * message */ )
5827+{
5828+ coord_t left_pos;
5829+ coord_t right_pos;
5830+ int result;
5831+ reiser4_key ldkey;
5832+ carry_plugin_info info;
5833+
5834+ assert("nikita-1177", right != NULL);
5835+	/* find position of the right child in the parent */
5836+ result = find_child_ptr(parent, right, &right_pos);
5837+ if (result != NS_FOUND) {
5838+ *error_msg = "Cannot find position of right child";
5839+ return result;
5840+ }
5841+
5842+ if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
5843+ /* find position of the left child in a parent */
5844+ result = find_child_ptr(parent, left, &left_pos);
5845+ if (result != NS_FOUND) {
5846+ *error_msg = "Cannot find position of left child";
5847+ return result;
5848+ }
5849+ assert("nikita-1355", left_pos.node != NULL);
5850+ } else
5851+ left_pos.node = NULL;
5852+
5853+ /* check that they are separated by exactly one key and are basically
5854+ sane */
5855+ if (REISER4_DEBUG) {
5856+ if ((left_pos.node != NULL)
5857+ && !coord_is_existing_unit(&left_pos)) {
5858+ *error_msg = "Left child is bastard";
5859+ return RETERR(-EIO);
5860+ }
5861+ if (!coord_is_existing_unit(&right_pos)) {
5862+ *error_msg = "Right child is bastard";
5863+ return RETERR(-EIO);
5864+ }
5865+ if (left_pos.node != NULL &&
5866+ !coord_are_neighbors(&left_pos, &right_pos)) {
5867+ *error_msg = "Children are not direct siblings";
5868+ return RETERR(-EIO);
5869+ }
5870+ }
5871+ *error_msg = NULL;
5872+
5873+ info.doing = doing;
5874+ info.todo = todo;
5875+
5876+ /*
5877+ * If child node is not empty, new key of internal item is a key of
5878+ * leftmost item in the child node. If the child is empty, take its
5879+ * right delimiting key as a new key of the internal item. Precise key
5880+ * in the latter case is not important per se, because the child (and
5881+ * the internal item) are going to be killed shortly anyway, but we
5882+ * have to preserve correct order of keys in the parent node.
5883+ */
5884+
5885+ if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
5886+ leftmost_key_in_node(right, &ldkey);
5887+ else {
5888+ read_lock_dk(znode_get_tree(parent));
5889+ ldkey = *znode_get_rd_key(right);
5890+ read_unlock_dk(znode_get_tree(parent));
5891+ }
5892+ node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
5893+ doing->restartable = 0;
5894+ znode_make_dirty(parent);
5895+ return 0;
5896+}
5897+
5898+/* implements COP_UPDATE operation
5899+
5900+ Update delimiting keys.
5901+
5902+*/
5903+static int carry_update(carry_op * op /* operation to be performed */ ,
5904+ carry_level * doing /* current carry level */ ,
5905+ carry_level * todo /* next carry level */ )
5906+{
5907+ int result;
5908+ carry_node *missing UNUSED_ARG;
5909+ znode *left;
5910+ znode *right;
5911+ carry_node *lchild;
5912+ carry_node *rchild;
5913+ const char *error_msg;
5914+ reiser4_tree *tree;
5915+
5916+ /*
5917+	 * This operation is called to update the key of an internal item. This
5918+	 * is necessary when carry shifted or cut data on the child
5919+	 * level. Arguments of this operation are:
5920+ *
5921+ * @right --- child node. Operation should update key of internal
5922+ * item pointing to @right.
5923+ *
5924+ * @left --- left neighbor of @right. This parameter is optional.
5925+ */
5926+
5927+ assert("nikita-902", op != NULL);
5928+ assert("nikita-903", todo != NULL);
5929+ assert("nikita-904", op->op == COP_UPDATE);
5930+
5931+ lchild = op->u.update.left;
5932+ rchild = op->node;
5933+
5934+ if (lchild != NULL) {
5935+ assert("nikita-1001", lchild->parent);
5936+ assert("nikita-1003", !lchild->left);
5937+ left = reiser4_carry_real(lchild);
5938+ } else
5939+ left = NULL;
5940+
5941+ tree = znode_get_tree(rchild->node);
5942+ read_lock_tree(tree);
5943+ right = znode_parent(rchild->node);
5944+ read_unlock_tree(tree);
5945+
5946+ if (right != NULL) {
5947+ result = update_delimiting_key(right,
5948+ lchild ? lchild->node : NULL,
5949+ rchild->node,
5950+ doing, todo, &error_msg);
5951+ } else {
5952+ error_msg = "Cannot find node to update key in";
5953+ result = RETERR(-EIO);
5954+ }
5955+ /* operation will be reposted to the next level by the
5956+ ->update_item_key() method of node plugin, if necessary. */
5957+
5958+ if (result != 0) {
5959+ warning("nikita-999", "Error updating delimiting key: %s (%i)",
5960+ error_msg ? : "", result);
5961+ }
5962+ return result;
5963+}
5964+
5965+/* move items from @node during carry */
5966+static int carry_shift_data(sideof side /* in what direction to move data */ ,
5967+ coord_t * insert_coord /* coord where new item
5968+ * is to be inserted */ ,
5969+ znode * node /* node which data are moved from */ ,
5970+ carry_level * doing /* active carry queue */ ,
5971+ carry_level * todo /* carry queue where new
5972+ * operations are to be put
5973+ * in */ ,
5974+ unsigned int including_insert_coord_p /* true if
5975+ * @insertion_coord
5976+ * can be moved */ )
5977+{
5978+ int result;
5979+ znode *source;
5980+ carry_plugin_info info;
5981+ node_plugin *nplug;
5982+
5983+ source = insert_coord->node;
5984+
5985+ info.doing = doing;
5986+ info.todo = todo;
5987+
5988+ nplug = node_plugin_by_node(node);
5989+ result = nplug->shift(insert_coord, node,
5990+ (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
5991+ (int)including_insert_coord_p, &info);
5992+ /* the only error ->shift() method of node plugin can return is
5993+ -ENOMEM due to carry node/operation allocation. */
5994+ assert("nikita-915", result >= 0 || result == -ENOMEM);
5995+ if (result > 0) {
5996+ /*
5997+ * if some number of bytes was actually shifted, mark nodes
5998+ * dirty, and carry level as non-restartable.
5999+ */
6000+ doing->restartable = 0;
6001+ znode_make_dirty(source);
6002+ znode_make_dirty(node);
6003+ }
6004+
6005+ assert("nikita-2077", coord_check(insert_coord));
6006+ return 0;
6007+}
6008+
6009+typedef carry_node *(*carry_iterator) (carry_node * node);
6010+static carry_node *find_dir_carry(carry_node * node, carry_level * level,
6011+ carry_iterator iterator);
6012+
6013+static carry_node *pool_level_list_prev(carry_node *node)
6014+{
6015+ return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
6016+}
6017+
6018+/* look for the left neighbor of given carry node in a carry queue.
6019+
6020+ This is used by find_left_neighbor(), but I am not sure that this
6021+ really gives any advantage. More statistics required.
6022+
6023+*/
6024+carry_node *find_left_carry(carry_node * node /* node to find left neighbor
6025+ * of */ ,
6026+ carry_level * level /* level to scan */ )
6027+{
6028+ return find_dir_carry(node, level,
6029+ (carry_iterator) pool_level_list_prev);
6030+}
6031+
6032+static carry_node *pool_level_list_next(carry_node *node)
6033+{
6034+ return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
6035+}
6036+
6037+/* look for the right neighbor of given carry node in a
6038+ carry queue.
6039+
6040+ This is used by find_right_neighbor(), but I am not sure that this
6041+ really gives any advantage. More statistics required.
6042+
6043+*/
6044+carry_node *find_right_carry(carry_node * node /* node to find right neighbor
6045+ * of */ ,
6046+ carry_level * level /* level to scan */ )
6047+{
6048+ return find_dir_carry(node, level,
6049+ (carry_iterator) pool_level_list_next);
6050+}
6051+
6052+/* look for the left or right neighbor of given carry node in a carry
6053+ queue.
6054+
6055+ Helper function used by find_{left|right}_carry().
6056+*/
6057+static carry_node *find_dir_carry(carry_node * node /* node to start scanning
6058+ * from */ ,
6059+ carry_level * level /* level to scan */ ,
6060+ carry_iterator iterator /* operation to
6061+ * move to the next
6062+ * node */ )
6063+{
6064+ carry_node *neighbor;
6065+
6066+ assert("nikita-1059", node != NULL);
6067+ assert("nikita-1060", level != NULL);
6068+
6069+	/* scan the list of carry nodes on this level dir-ward, skipping all
6070+ carry nodes referencing the same znode. */
6071+ neighbor = node;
6072+ while (1) {
6073+ neighbor = iterator(neighbor);
6074+ if (carry_node_end(level, neighbor))
6075+ /* list head is reached */
6076+ return NULL;
6077+ if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
6078+ return neighbor;
6079+ }
6080+}
6081+
6082+/*
6083+ * Memory reservation estimation.
6084+ *
6085+ * The carry process proceeds through tree levels upwards. Carry assumes that
6086+ * it takes the tree in a consistent state (e.g., that search tree invariants
6087+ * hold), and leaves the tree consistent after it finishes. This means that
6088+ * when some error occurs carry cannot simply return if there are pending
6089+ * carry operations. A generic solution for this problem is carry-undo, either
6090+ * as a transaction manager feature (requiring checkpoints and isolation), or
6091+ * through some carry-specific mechanism.
6092+ *
6093+ * Our current approach is to panic if carry hits an error while the tree is
6094+ * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6095+ * this, a "memory reservation" mechanism was added.
6096+ *
6097+ * Memory reservation is implemented by perthread-pages.diff patch from
6098+ * core-patches. Its API is defined in <linux/gfp.h>
6099+ *
6100+ * int perthread_pages_reserve(int nrpages, gfp_t gfp);
6101+ * void perthread_pages_release(int nrpages);
6102+ * int perthread_pages_count(void);
6103+ *
6104+ * carry estimates its worst case memory requirements at entry, reserves
6105+ * enough memory, and releases unused pages before returning.
6106+ *
6107+ * The code below estimates worst case memory requirements for a given carry
6108+ * queue. This is done by summing worst case memory requirements for each
6109+ * operation in the queue.
6110+ *
6111+ */
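/*
 * Editorial sketch, not part of the original patch: how the reservation
 * API described above is meant to wrap a carry pass. The names
 * example_carry_reserved(), total_estimate() and carry() are hypothetical
 * stand-ins for the real entry point, for the loop that sums the
 * per-operation ->estimate() handlers from op_dispatch_table below, and
 * for the actual balancing pass, whatever it is named.
 */
#if 0	/* illustration only */
static int example_carry_reserved(carry_level *doing, carry_level *todo)
{
	int nrpages = total_estimate(doing);	/* worst case for the queue */
	int result;

	/* reserve before touching the tree, so that allocations made while
	   the tree is inconsistent cannot fail with -ENOMEM */
	if (perthread_pages_reserve(nrpages, GFP_KERNEL) != 0)
		return RETERR(-ENOMEM);
	result = carry(doing, todo);
	/* release whatever remains of this thread's reservation
	   (simplified: assumes no reservation was made by a caller) */
	perthread_pages_release(perthread_pages_count());
	return result;
}
#endif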
6112+
6113+/*
6114+ * Memory requirements of many operations depend on the tree
6115+ * height. For example, item insertion requires a new node to be inserted at
6116+ * each tree level in the worst case. What tree height should be used for
6117+ * estimation? Current tree height is wrong, because tree height can change
6118+ * between the time when estimation was done and the time when the operation
6119+ * is actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
6120+ * is also not desirable, because it would lead to huge over-estimation
6121+ * all the time. A plausible solution is "capped tree height": if the current
6122+ * tree height is less than some TREE_HEIGHT_CAP constant, capped tree height is
6123+ * TREE_HEIGHT_CAP, otherwise it's the current tree height. The idea behind
6124+ * this is that if tree height is TREE_HEIGHT_CAP or larger, it's extremely
6125+ * unlikely to be increased even more during a short interval of time.
6126+ */
6127+#define TREE_HEIGHT_CAP (5)
6128+
6129+/* return capped tree height for the @tree. See comment above. */
6130+static int cap_tree_height(reiser4_tree * tree)
6131+{
6132+ return max_t(int, tree->height, TREE_HEIGHT_CAP);
6133+}
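/* Editorial example: with TREE_HEIGHT_CAP == 5, a tree of height 3 is
   estimated as if it had height 5, while a tree of height 7 is estimated
   with its real height 7 (max_t picks the larger of the two). */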
6134+
6135+/* return capped tree height for the current tree. */
6136+static int capped_height(void)
6137+{
6138+ return cap_tree_height(current_tree);
6139+}
6140+
6141+/* return number of pages required to store given number of bytes */
6142+static int bytes_to_pages(int bytes)
6143+{
6144+ return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6145+}
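/* Editorial example, assuming 4 KiB pages (PAGE_CACHE_SHIFT == 12):
   bytes_to_pages(0) == 0, bytes_to_pages(1) == 1,
   bytes_to_pages(4096) == 1, bytes_to_pages(4097) == 2;
   that is, a round-up division by the page size. */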
6146+
6147+/* how many pages are required to allocate znodes during item insertion. */
6148+static int carry_estimate_znodes(void)
6149+{
6150+ /*
6151+	 * Note that we have a problem here: there is no way to
6152+ * reserve pages specifically for the given slab. This means that
6153+ * these pages can be hijacked for some other end.
6154+ */
6155+
6156+	/* in the worst case we need 3 new znodes on each tree level */
6157+ return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6158+}
6159+
6160+/*
6161+ * how many pages are required to load bitmaps. One bitmap per level.
6162+ */
6163+static int carry_estimate_bitmaps(void)
6164+{
6165+ if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6166+ int bytes;
6167+
6168+		bytes = capped_height() * (0 +	/* bnode should be added, but it is private to
6169+ * bitmap.c, skip for now. */
6170+ 2 * sizeof(jnode)); /* working and commit jnodes */
6171+ return bytes_to_pages(bytes) + 2; /* and their contents */
6172+ } else
6173+ /* bitmaps were pre-loaded during mount */
6174+ return 0;
6175+}
6176+
6177+/* worst case item insertion memory requirements */
6178+static int carry_estimate_insert(carry_op * op, carry_level * level)
6179+{
6180+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6181+ capped_height() + /* new block on each level */
6182+ 1 + /* and possibly extra new block at the leaf level */
6183+ 3; /* loading of leaves into memory */
6184+}
6185+
6186+/* worst case item deletion memory requirements */
6187+static int carry_estimate_delete(carry_op * op, carry_level * level)
6188+{
6189+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6190+ 3; /* loading of leaves into memory */
6191+}
6192+
6193+/* worst case tree cut memory requirements */
6194+static int carry_estimate_cut(carry_op * op, carry_level * level)
6195+{
6196+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6197+ 3; /* loading of leaves into memory */
6198+}
6199+
6200+/* worst case memory requirements of pasting into item */
6201+static int carry_estimate_paste(carry_op * op, carry_level * level)
6202+{
6203+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6204+ capped_height() + /* new block on each level */
6205+ 1 + /* and possibly extra new block at the leaf level */
6206+ 3; /* loading of leaves into memory */
6207+}
6208+
6209+/* worst case memory requirements of extent insertion */
6210+static int carry_estimate_extent(carry_op * op, carry_level * level)
6211+{
6212+ return carry_estimate_insert(op, level) + /* insert extent */
6213+ carry_estimate_delete(op, level); /* kill leaf */
6214+}
6215+
6216+/* worst case memory requirements of key update */
6217+static int carry_estimate_update(carry_op * op, carry_level * level)
6218+{
6219+ return 0;
6220+}
6221+
6222+/* worst case memory requirements of flow insertion */
6223+static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6224+{
6225+ int newnodes;
6226+
6227+ newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6228+ CARRY_FLOW_NEW_NODES_LIMIT);
6229+ /*
6230+ * roughly estimate insert_flow as a sequence of insertions.
6231+ */
6232+ return newnodes * carry_estimate_insert(op, level);
6233+}
6234+
6235+/* This is the dispatch table for carry operations. It can be trivially
6236+   abstracted into a useful plugin: a tunable balancing policy is a good
6237+   thing. */
6238+carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6239+ [COP_INSERT] = {
6240+ .handler = carry_insert,
6241+ .estimate = carry_estimate_insert}
6242+ ,
6243+ [COP_DELETE] = {
6244+ .handler = carry_delete,
6245+ .estimate = carry_estimate_delete}
6246+ ,
6247+ [COP_CUT] = {
6248+ .handler = carry_cut,
6249+ .estimate = carry_estimate_cut}
6250+ ,
6251+ [COP_PASTE] = {
6252+ .handler = carry_paste,
6253+ .estimate = carry_estimate_paste}
6254+ ,
6255+ [COP_EXTENT] = {
6256+ .handler = carry_extent,
6257+ .estimate = carry_estimate_extent}
6258+ ,
6259+ [COP_UPDATE] = {
6260+ .handler = carry_update,
6261+ .estimate = carry_estimate_update}
6262+ ,
6263+ [COP_INSERT_FLOW] = {
6264+ .handler = carry_insert_flow,
6265+ .estimate = carry_estimate_insert_flow}
6266+};
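/* Editorial note: carry dispatches through this table generically, e.g.

	result = op_dispatch_table[op->op].handler(op, doing, todo);

   which is exactly the call carry_paste() above makes when it restarts
   a failed paste as a COP_INSERT. */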
6267+
6268+/* Make Linus happy.
6269+ Local variables:
6270+ c-indentation-style: "K&R"
6271+ mode-name: "LC"
6272+ c-basic-offset: 8
6273+ tab-width: 8
6274+ fill-column: 120
6275+ scroll-step: 1
6276+ End:
6277+*/
6278diff -urN linux-2.6.24.orig/fs/reiser4/carry_ops.h linux-2.6.24/fs/reiser4/carry_ops.h
6279--- linux-2.6.24.orig/fs/reiser4/carry_ops.h 1970-01-01 03:00:00.000000000 +0300
6280+++ linux-2.6.24/fs/reiser4/carry_ops.h 2008-01-25 11:39:06.904199446 +0300
6281@@ -0,0 +1,42 @@
6282+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6283+
6284+/* implementation of carry operations. See carry_ops.c for details. */
6285+
6286+#if !defined( __CARRY_OPS_H__ )
6287+#define __CARRY_OPS_H__
6288+
6289+#include "forward.h"
6290+#include "znode.h"
6291+#include "carry.h"
6292+
6293+/* carry operation handlers */
6294+typedef struct carry_op_handler {
6295+ /* perform operation */
6296+ int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6297+ /* estimate memory requirements for @op */
6298+ int (*estimate) (carry_op * op, carry_level * level);
6299+} carry_op_handler;
6300+
6301+/* This is the dispatch table for carry operations. It can be trivially
6302+   abstracted into a useful plugin: a tunable balancing policy is a good
6303+   thing. */
6304+extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6305+
6306+unsigned int space_needed(const znode * node, const coord_t * coord,
6307+ const reiser4_item_data * data, int inserting);
6308+extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6309+extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6310+
6311+/* __CARRY_OPS_H__ */
6312+#endif
6313+
6314+/* Make Linus happy.
6315+ Local variables:
6316+ c-indentation-style: "K&R"
6317+ mode-name: "LC"
6318+ c-basic-offset: 8
6319+ tab-width: 8
6320+ fill-column: 120
6321+ scroll-step: 1
6322+ End:
6323+*/
6324diff -urN linux-2.6.24.orig/fs/reiser4/context.c linux-2.6.24/fs/reiser4/context.c
6325--- linux-2.6.24.orig/fs/reiser4/context.c 1970-01-01 03:00:00.000000000 +0300
6326+++ linux-2.6.24/fs/reiser4/context.c 2008-01-25 11:39:06.904199446 +0300
6327@@ -0,0 +1,288 @@
6328+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6329+
6330+/* Manipulation of reiser4_context */
6331+
6332+/*
6333+ * global context used during a system call. A variable of this type is
6334+ * allocated on the stack at the beginning of the reiser4 part of the system
6335+ * call and a pointer to it is stored in current->fs_context. This allows us
6336+ * to avoid passing pointers to the current transaction and current lockstack
6337+ * (both in one-to-one mapping with threads) all over the call chain.
6338+ *
6339+ * It's kind of like those global variables the prof used to tell you not to
6340+ * use in CS1, except thread specific. ;-) Nikita, this was a good idea.
6341+ *
6342+ * In some situations it is desirable to have the ability to enter
6343+ * reiser4_context more than once for the same thread (nested contexts). For
6344+ * example, there are some functions that can be called either directly from
6345+ * VFS/VM or from an already active reiser4 context (->writepage, for example).
6346+ *
6347+ * In such situations "child" context acts like dummy: all activity is
6348+ * actually performed in the top level context, and get_current_context()
6349+ * always returns top level context.
6350+ * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
6351+ * nested anyway.
6352+ *
6353+ * Note that there is an important difference between the way reiser4 uses
6354+ * ->fs_context and the way other file systems use it. Other file systems
6355+ * (ext3 and reiserfs) use ->fs_context only for the duration of a _transaction_
6356+ * (this is why ->fs_context was initially called ->journal_info). This means
6357+ * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
6358+ * to the file system, they assume that some transaction is already underway,
6359+ * and usually bail out, because starting a nested transaction would most
6360+ * likely lead to deadlock. This gives false positives with reiser4, because we
6361+ * set ->fs_context before starting transaction.
6362+ */
6363+
6364+#include "debug.h"
6365+#include "super.h"
6366+#include "context.h"
6367+
6368+#include <linux/writeback.h> /* balance_dirty_pages() */
6369+#include <linux/hardirq.h>
6370+
6371+static void _reiser4_init_context(reiser4_context * context,
6372+ struct super_block *super)
6373+{
6374+ memset(context, 0, sizeof(*context));
6375+
6376+ context->super = super;
6377+ context->magic = context_magic;
6378+ context->outer = current->journal_info;
6379+ current->journal_info = (void *)context;
6380+ context->nr_children = 0;
6381+ context->gfp_mask = GFP_KERNEL;
6382+
6383+ init_lock_stack(&context->stack);
6384+
6385+ reiser4_txn_begin(context);
6386+
6387+ /* initialize head of tap list */
6388+ INIT_LIST_HEAD(&context->taps);
6389+#if REISER4_DEBUG
6390+ context->task = current;
6391+#endif
6392+ grab_space_enable();
6393+}
6394+
6395+/* initialize context and bind it to the current thread
6396+
6397+ This function should be called at the beginning of reiser4 part of
6398+ syscall.
6399+*/
6400+reiser4_context * reiser4_init_context(struct super_block * super)
6401+{
6402+ reiser4_context *context;
6403+
6404+ assert("nikita-2662", !in_interrupt() && !in_irq());
6405+ assert("nikita-3357", super != NULL);
6406+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6407+
6408+ context = get_current_context_check();
6409+ if (context && context->super == super) {
6410+ context = (reiser4_context *) current->journal_info;
6411+ context->nr_children++;
6412+ return context;
6413+ }
6414+
6415+ context = kmalloc(sizeof(*context), GFP_KERNEL);
6416+ if (context == NULL)
6417+ return ERR_PTR(RETERR(-ENOMEM));
6418+
6419+ _reiser4_init_context(context, super);
6420+ return context;
6421+}
6422+
6423+/* this is used in scan_mgr which is called with spinlock held and in
6424+ reiser4_fill_super magic */
6425+void init_stack_context(reiser4_context *context, struct super_block *super)
6426+{
6427+ assert("nikita-2662", !in_interrupt() && !in_irq());
6428+ assert("nikita-3357", super != NULL);
6429+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6430+ assert("vs-12", !is_in_reiser4_context());
6431+
6432+ _reiser4_init_context(context, super);
6433+ context->on_stack = 1;
6434+ return;
6435+}
6436+
6437+/* cast lock stack embedded into reiser4 context up to its container */
6438+reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6439+{
6440+ return container_of(owner, reiser4_context, stack);
6441+}
6442+
6443+/* true if there is already _any_ reiser4 context for the current thread */
6444+int is_in_reiser4_context(void)
6445+{
6446+ reiser4_context *ctx;
6447+
6448+ ctx = current->journal_info;
6449+ return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6450+}
6451+
6452+/*
6453+ * call balance dirty pages for the current context.
6454+ *
6455+ * File system is expected to call balance_dirty_pages_ratelimited() whenever
6456+ * it dirties a page. reiser4 does this for unformatted nodes (that is, during
6457+ * write---this covers vast majority of all dirty traffic), but we cannot do
6458+ * this immediately when formatted node is dirtied, because long term lock is
6459+ * usually held at that time. To work around this, dirtying of formatted node
6460+ * simply increases ->nr_marked_dirty counter in the current reiser4
6461+ * context. When we are about to leave this context,
6462+ * balance_dirty_pages_ratelimited() is called, if necessary.
6463+ *
6464+ * This introduces another problem: sometimes we do not want to run
6465+ * balance_dirty_pages_ratelimited() when leaving a context, for example
6466+ * because some important lock (like ->i_mutex on the parent directory) is
6467+ * held. To achieve this, ->nobalance flag can be set in the current context.
6468+ */
6469+static void balance_dirty_pages_at(reiser4_context *context)
6470+{
6471+ reiser4_super_info_data *sbinfo = get_super_private(context->super);
6472+
6473+ /*
6474+ * call balance_dirty_pages_ratelimited() to process formatted nodes
6475+ * dirtied during this system call. Do that only if we are not in mount
6476+ * and there were nodes dirtied in this context and we are not in
6477+ * writepage (to avoid deadlock) and not in pdflush
6478+ */
6479+ if (sbinfo != NULL && sbinfo->fake != NULL &&
6480+ context->nr_marked_dirty != 0 &&
6481+ !(current->flags & PF_MEMALLOC) &&
6482+ !current_is_pdflush())
6483+ balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
6484+}
6485+
6486+/* release resources associated with context.
6487+
6488+ This function should be called at the end of "session" with reiser4,
6489+ typically just before leaving reiser4 driver back to VFS.
6490+
6491+   This is a good place to put some debugging consistency checks, such as
6492+   checking that the thread released all locks and closed the transcrash, etc.
6493+
6494+*/
6495+static void reiser4_done_context(reiser4_context * context /* context being released */ )
6496+{
6497+ assert("nikita-860", context != NULL);
6498+ assert("nikita-859", context->magic == context_magic);
6499+ assert("vs-646", (reiser4_context *) current->journal_info == context);
6500+ assert("zam-686", !in_interrupt() && !in_irq());
6501+
6502+ /* only do anything when leaving top-level reiser4 context. All nested
6503+ * contexts are just dummies. */
6504+ if (context->nr_children == 0) {
6505+ assert("jmacd-673", context->trans == NULL);
6506+ assert("jmacd-1002", lock_stack_isclean(&context->stack));
6507+ assert("nikita-1936", reiser4_no_counters_are_held());
6508+ assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
6509+ assert("zam-1004", ergo(get_super_private(context->super),
6510+ get_super_private(context->super)->delete_mutex_owner !=
6511+ current));
6512+
6513+ /* release all grabbed but as yet unused blocks */
6514+ if (context->grabbed_blocks != 0)
6515+ all_grabbed2free();
6516+
6517+ /*
6518+ * synchronize against longterm_unlock_znode():
6519+ * wake_up_requestor() wakes up requestors without holding
6520+ * zlock (otherwise they will immediately bump into that lock
6521+ * after wake up on another CPU). To work around (rare)
6522+ * situation where requestor has been woken up asynchronously
6523+ * and managed to run until completion (and destroy its
6524+ * context and lock stack) before wake_up_requestor() called
6525+ * wake_up() on it, wake_up_requestor() synchronizes on the lock
6526+ * stack spin lock. It has actually been observed that the spin
6527+ * lock _was_ locked at this point, because
6528+ * wake_up_requestor() took an interrupt.
6529+ */
6530+ spin_lock_stack(&context->stack);
6531+ spin_unlock_stack(&context->stack);
6532+
6533+ assert("zam-684", context->nr_children == 0);
6534+ /* restore original ->fs_context value */
6535+ current->journal_info = context->outer;
6536+ if (context->on_stack == 0)
6537+ kfree(context);
6538+ } else {
6539+ context->nr_children--;
6540+#if REISER4_DEBUG
6541+ assert("zam-685", context->nr_children >= 0);
6542+#endif
6543+ }
6544+}
6545+
6546+/*
6547+ * Exit the reiser4 context. Call balance_dirty_pages_at() if necessary. Close
6548+ * the transaction. Call reiser4_done_context() to do context-related book-keeping.
6549+ */
6550+void reiser4_exit_context(reiser4_context * context)
6551+{
6552+ assert("nikita-3021", reiser4_schedulable());
6553+
6554+ if (context->nr_children == 0) {
6555+ if (!context->nobalance) {
6556+ reiser4_txn_restart(context);
6557+ balance_dirty_pages_at(context);
6558+ }
6559+
6560+ /* if the filesystem is mounted with -o sync or -o dirsync - commit
6561+ the transaction. FIXME: TXNH_DONT_COMMIT is used to avoid
6562+ committing on exit_context when an inode semaphore is held and
6563+ to have ktxnmgrd do the commit instead, for better
6564+ concurrent filesystem access. But when one mounts with -o
6565+ sync, one cares more about reliability than about
6566+ performance. So, for now we have this simple mount -o sync
6567+ support. */
6568+ if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
6569+ txn_atom *atom;
6570+
6571+ atom = get_current_atom_locked_nocheck();
6572+ if (atom) {
6573+ atom->flags |= ATOM_FORCE_COMMIT;
6574+ context->trans->flags &= ~TXNH_DONT_COMMIT;
6575+ spin_unlock_atom(atom);
6576+ }
6577+ }
6578+ reiser4_txn_end(context);
6579+ }
6580+ reiser4_done_context(context);
6581+}
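+
+/*
+ * Editor's usage sketch (not part of the original patch): the canonical
+ * reiser4 "session" that brackets a VFS entry point. The IS_ERR()/PTR_ERR()
+ * error convention for reiser4_init_context() is an assumption here.
+ *
+ *	reiser4_context *ctx;
+ *
+ *	ctx = reiser4_init_context(inode->i_sb);
+ *	if (IS_ERR(ctx))
+ *		return PTR_ERR(ctx);
+ *	... operate on the tree ...
+ *	reiser4_exit_context(ctx);
+ */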
6582+
6583+void reiser4_ctx_gfp_mask_set(void)
6584+{
6585+ reiser4_context *ctx;
6586+
6587+ ctx = get_current_context();
6588+ if (ctx->entd == 0 &&
6589+ list_empty(&ctx->stack.locks) &&
6590+ ctx->trans->atom == NULL)
6591+ ctx->gfp_mask = GFP_KERNEL;
6592+ else
6593+ ctx->gfp_mask = GFP_NOFS;
6594+}
6595+
6596+void reiser4_ctx_gfp_mask_force (gfp_t mask)
6597+{
6598+ reiser4_context *ctx;
6599+ ctx = get_current_context();
6600+
6601+ assert("edward-1454", ctx != NULL);
6602+
6603+ ctx->gfp_mask = mask;
6604+}
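+
+/*
+ * Editor's sketch (not part of the original patch): an entry point that may
+ * run with an atom already open could pin the allocation mask up front:
+ *
+ *	reiser4_ctx_gfp_mask_force(GFP_NOFS);
+ */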
6605+
6606+/*
6607+ * Local variables:
6608+ * c-indentation-style: "K&R"
6609+ * mode-name: "LC"
6610+ * c-basic-offset: 8
6611+ * tab-width: 8
6612+ * fill-column: 120
6613+ * scroll-step: 1
6614+ * End:
6615+ */
6616diff -urN linux-2.6.24.orig/fs/reiser4/context.h linux-2.6.24/fs/reiser4/context.h
6617--- linux-2.6.24.orig/fs/reiser4/context.h 1970-01-01 03:00:00.000000000 +0300
6618+++ linux-2.6.24/fs/reiser4/context.h 2008-01-25 11:39:06.904199446 +0300
6619@@ -0,0 +1,228 @@
6620+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
6621+ * reiser4/README */
6622+
6623+/* Reiser4 context. See context.c for details. */
6624+
6625+#if !defined( __REISER4_CONTEXT_H__ )
6626+#define __REISER4_CONTEXT_H__
6627+
6628+#include "forward.h"
6629+#include "debug.h"
6630+#include "dformat.h"
6631+#include "tap.h"
6632+#include "lock.h"
6633+
6634+#include <linux/types.h> /* for __u?? */
6635+#include <linux/fs.h> /* for struct super_block */
6636+#include <linux/spinlock.h>
6637+#include <linux/sched.h> /* for struct task_struct */
6638+
6639+/* reiser4 per-thread context */
6640+struct reiser4_context {
6641+ /* magic constant. For identification of reiser4 contexts. */
6642+ __u32 magic;
6643+
6644+ /* current lock stack. See lock.[ch]. This is where list of all
6645+ locks taken by current thread is kept. This is also used in
6646+ deadlock detection. */
6647+ lock_stack stack;
6648+
6649+ /* current transcrash. */
6650+ txn_handle *trans;
6651+ /* transaction handle embedded into reiser4_context. ->trans points
6652+ * here by default. */
6653+ txn_handle trans_in_ctx;
6654+
6655+ /* super block we are working with. To get the current tree
6656+ use &get_super_private (reiser4_get_current_sb ())->tree. */
6657+ struct super_block *super;
6658+
6659+ /* parent fs activation */
6660+ struct fs_activation *outer;
6661+
6662+ /* per-thread grabbed (for further allocation) blocks counter */
6663+ reiser4_block_nr grabbed_blocks;
6664+
6665+ /* list of taps currently monitored. See tap.c */
6666+ struct list_head taps;
6667+
6668+ /* grabbing space is enabled */
6669+ unsigned int grab_enabled:1;
6670+ /* should be set when we are writing dirty nodes to disk in jnode_flush or
6671+ * reiser4_write_logs() */
6672+ unsigned int writeout_mode:1;
6673+ /* true, if current thread is an ent thread */
6674+ unsigned int entd:1;
6675+ /* true, if balance_dirty_pages() should not be run when leaving this
6676+ * context. This is used to avoid a lengthy balance_dirty_pages()
6677+ * operation when holding some important resource, like directory
6678+ * ->i_mutex */
6679+ unsigned int nobalance:1;
6680+
6681+ /* this bit is used in reiser4_done_context() to decide whether the context
6682+ was kmalloc-ed and has to be kfree-ed */
6683+ unsigned int on_stack:1;
6684+
6685+ /* count non-trivial jnode_set_dirty() calls */
6686+ unsigned long nr_marked_dirty;
6687+
6688+ /* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
6689+ * reiser4_writepages for each dirty inode. reiser4_writepages
6690+ * captures pages. When the number of pages captured in one
6691+ * reiser4_sync_inodes call reaches some threshold, some atoms get
6692+ * flushed */
6693+ int nr_captured;
6694+ int nr_children; /* number of child contexts */
6695+#if REISER4_DEBUG
6696+ /* debugging information about reiser4 locks held by the current
6697+ * thread */
6698+ reiser4_lock_cnt_info locks;
6699+ struct task_struct *task; /* so we can easily find owner of the stack */
6700+
6701+ /*
6702+ * disk space grabbing debugging support
6703+ */
6704+ /* how many disk blocks were grabbed by the first call to
6705+ * reiser4_grab_space() in this context */
6706+ reiser4_block_nr grabbed_initially;
6707+
6708+ /* list of all threads doing flush currently */
6709+ struct list_head flushers_link;
6710+ /* information about last error encountered by reiser4 */
6711+ err_site err;
6712+#endif
6713+ void *vp;
6714+ gfp_t gfp_mask;
6715+};
6716+
6717+extern reiser4_context *get_context_by_lock_stack(lock_stack *);
6718+
6719+/* Debugging helpers. */
6720+#if REISER4_DEBUG
6721+extern void print_contexts(void);
6722+#endif
6723+
6724+#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
6725+#define current_blocksize reiser4_get_current_sb()->s_blocksize
6726+#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
6727+
6728+extern reiser4_context *reiser4_init_context(struct super_block *);
6729+extern void init_stack_context(reiser4_context *, struct super_block *);
6730+extern void reiser4_exit_context(reiser4_context *);
6731+
6732+/* magic constant we store in reiser4_context allocated at the stack. Used to
6733+ catch accesses to stale or uninitialized contexts. */
6734+#define context_magic ((__u32) 0x4b1b5d0b)
6735+
6736+extern int is_in_reiser4_context(void);
6737+
6738+/*
6739+ * return reiser4_context for the thread @tsk
6740+ */
6741+static inline reiser4_context *get_context(const struct task_struct *tsk)
6742+{
6743+ assert("vs-1682",
6744+ ((reiser4_context *) tsk->journal_info)->magic == context_magic);
6745+ return (reiser4_context *) tsk->journal_info;
6746+}
6747+
6748+/*
6749+ * return reiser4 context of the current thread, or NULL if there is none.
6750+ */
6751+static inline reiser4_context *get_current_context_check(void)
6752+{
6753+ if (is_in_reiser4_context())
6754+ return get_context(current);
6755+ else
6756+ return NULL;
6757+}
6758+
6759+static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */
6760+
6761+/* return context associated with current thread */
6762+static inline reiser4_context *get_current_context(void)
6763+{
6764+ return get_context(current);
6765+}
6766+
6767+static inline gfp_t reiser4_ctx_gfp_mask_get(void)
6768+{
6769+ reiser4_context *ctx;
6770+
6771+ ctx = get_current_context_check();
6772+ return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
6773+}
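+
+/*
+ * Editor's sketch (not part of the original patch): allocations made inside
+ * a reiser4 context should use the per-context mask, so that reclaim cannot
+ * recurse into the filesystem while an atom is open, e.g.:
+ *
+ *	buf = kmalloc(size, reiser4_ctx_gfp_mask_get());
+ */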
6774+
6775+void reiser4_ctx_gfp_mask_set(void);
6776+void reiser4_ctx_gfp_mask_force (gfp_t mask);
6777+
6778+/*
6779+ * true if current thread is in the write-out mode. Thread enters write-out
6780+ * mode during jnode_flush and reiser4_write_logs().
6781+ */
6782+static inline int is_writeout_mode(void)
6783+{
6784+ return get_current_context()->writeout_mode;
6785+}
6786+
6787+/*
6788+ * enter write-out mode
6789+ */
6790+static inline void writeout_mode_enable(void)
6791+{
6792+ assert("zam-941", !get_current_context()->writeout_mode);
6793+ get_current_context()->writeout_mode = 1;
6794+}
6795+
6796+/*
6797+ * leave write-out mode
6798+ */
6799+static inline void writeout_mode_disable(void)
6800+{
6801+ assert("zam-942", get_current_context()->writeout_mode);
6802+ get_current_context()->writeout_mode = 0;
6803+}
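+
+/*
+ * Editor's sketch (not part of the original patch): write-out mode brackets
+ * the flushing code, e.g. in jnode_flush()-like paths:
+ *
+ *	writeout_mode_enable();
+ *	... write dirty nodes to disk ...
+ *	writeout_mode_disable();
+ */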
6804+
6805+static inline void grab_space_enable(void)
6806+{
6807+ get_current_context()->grab_enabled = 1;
6808+}
6809+
6810+static inline void grab_space_disable(void)
6811+{
6812+ get_current_context()->grab_enabled = 0;
6813+}
6814+
6815+static inline void grab_space_set_enabled(int enabled)
6816+{
6817+ get_current_context()->grab_enabled = enabled;
6818+}
6819+
6820+static inline int is_grab_enabled(reiser4_context * ctx)
6821+{
6822+ return ctx->grab_enabled;
6823+}
6824+
6825+/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or
6826+ * flush would be performed when it is closed. This is necessary when handle
6827+ * has to be closed under some coarse semaphore, like i_mutex of
6828+ * directory. Commit will be performed by ktxnmgrd. */
6829+static inline void context_set_commit_async(reiser4_context * context)
6830+{
6831+ context->nobalance = 1;
6832+ context->trans->flags |= TXNH_DONT_COMMIT;
6833+}
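+
+/*
+ * Editor's sketch (not part of the original patch): a directory operation
+ * that must close its handle while the parent's ->i_mutex is held would do:
+ *
+ *	context_set_commit_async(ctx);
+ *	reiser4_exit_context(ctx);
+ *
+ * No balancing or commit happens on exit; ktxnmgrd commits the atom later.
+ */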
6834+
6835+/* __REISER4_CONTEXT_H__ */
6836+#endif
6837+
6838+/* Make Linus happy.
6839+ Local variables:
6840+ c-indentation-style: "K&R"
6841+ mode-name: "LC"
6842+ c-basic-offset: 8
6843+ tab-width: 8
6844+ fill-column: 120
6845+ scroll-step: 1
6846+ End:
6847+*/
6848diff -urN linux-2.6.24.orig/fs/reiser4/coord.c linux-2.6.24/fs/reiser4/coord.c
6849--- linux-2.6.24.orig/fs/reiser4/coord.c 1970-01-01 03:00:00.000000000 +0300
6850+++ linux-2.6.24/fs/reiser4/coord.c 2008-01-25 11:39:06.904199446 +0300
6851@@ -0,0 +1,935 @@
6852+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6853+
6854+#include "forward.h"
6855+#include "debug.h"
6856+#include "dformat.h"
6857+#include "tree.h"
6858+#include "plugin/item/item.h"
6859+#include "znode.h"
6860+#include "coord.h"
6861+
6862+/* Internal constructor. */
6863+static inline void
6864+coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos,
6865+ pos_in_node_t unit_pos, between_enum between)
6866+{
6867+ coord->node = (znode *) node;
6868+ coord_set_item_pos(coord, item_pos);
6869+ coord->unit_pos = unit_pos;
6870+ coord->between = between;
6871+ ON_DEBUG(coord->plug_v = 0);
6872+ ON_DEBUG(coord->body_v = 0);
6873+
6874+ /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
6875+}
6876+
6877+/* After shifting of node content, a coord that was previously set properly
6878+ may become invalid; try to "normalize" it. */
6879+void coord_normalize(coord_t * coord)
6880+{
6881+ znode *node;
6882+
6883+ node = coord->node;
6884+ assert("vs-683", node);
6885+
6886+ coord_clear_iplug(coord);
6887+
6888+ if (node_is_empty(node)) {
6889+ coord_init_first_unit(coord, node);
6890+ } else if ((coord->between == AFTER_ITEM)
6891+ || (coord->between == AFTER_UNIT)) {
6892+ return;
6893+ } else if (coord->item_pos == coord_num_items(coord)
6894+ && coord->between == BEFORE_ITEM) {
6895+ coord_dec_item_pos(coord);
6896+ coord->between = AFTER_ITEM;
6897+ } else if (coord->unit_pos == coord_num_units(coord)
6898+ && coord->between == BEFORE_UNIT) {
6899+ coord->unit_pos--;
6900+ coord->between = AFTER_UNIT;
6901+ } else if (coord->item_pos == coord_num_items(coord)
6902+ && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
6903+ coord_dec_item_pos(coord);
6904+ coord->unit_pos = 0;
6905+ coord->between = AFTER_ITEM;
6906+ }
6907+}
6908+
6909+/* Copy a coordinate. */
6910+void coord_dup(coord_t * coord, const coord_t * old_coord)
6911+{
6912+ assert("jmacd-9800", coord_check(old_coord));
6913+ coord_dup_nocheck(coord, old_coord);
6914+}
6915+
6916+/* Copy a coordinate without check. Useful when old_coord->node is not
6917+ loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
6918+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
6919+{
6920+ coord->node = old_coord->node;
6921+ coord_set_item_pos(coord, old_coord->item_pos);
6922+ coord->unit_pos = old_coord->unit_pos;
6923+ coord->between = old_coord->between;
6924+ coord->iplugid = old_coord->iplugid;
6925+ ON_DEBUG(coord->plug_v = old_coord->plug_v);
6926+ ON_DEBUG(coord->body_v = old_coord->body_v);
6927+}
6928+
6929+/* Initialize an invalid coordinate. */
6930+void coord_init_invalid(coord_t * coord, const znode * node)
6931+{
6932+ coord_init_values(coord, node, 0, 0, INVALID_COORD);
6933+}
6934+
6935+void coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
6936+{
6937+ coord_init_values(coord, node, 0, 0, AT_UNIT);
6938+}
6939+
6940+/* Initialize a coordinate to point at the first unit of the first item. If the node is
6941+ empty, it is positioned at the EMPTY_NODE. */
6942+void coord_init_first_unit(coord_t * coord, const znode * node)
6943+{
6944+ int is_empty = node_is_empty(node);
6945+
6946+ coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
6947+
6948+ assert("jmacd-9801", coord_check(coord));
6949+}
6950+
6951+/* Initialize a coordinate to point at the last unit of the last item. If the node is
6952+ empty, it is positioned at the EMPTY_NODE. */
6953+void coord_init_last_unit(coord_t * coord, const znode * node)
6954+{
6955+ int is_empty = node_is_empty(node);
6956+
6957+ coord_init_values(coord, node,
6958+ (is_empty ? 0 : node_num_items(node) - 1), 0,
6959+ (is_empty ? EMPTY_NODE : AT_UNIT));
6960+ if (!is_empty)
6961+ coord->unit_pos = coord_last_unit_pos(coord);
6962+ assert("jmacd-9802", coord_check(coord));
6963+}
6964+
6965+/* Initialize a coordinate to before the first item. If the node is empty, it is
6966+ positioned at the EMPTY_NODE. */
6967+void coord_init_before_first_item(coord_t * coord, const znode * node)
6968+{
6969+ int is_empty = node_is_empty(node);
6970+
6971+ coord_init_values(coord, node, 0, 0,
6972+ (is_empty ? EMPTY_NODE : BEFORE_UNIT));
6973+
6974+ assert("jmacd-9803", coord_check(coord));
6975+}
6976+
6977+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
6978+ at the EMPTY_NODE. */
6979+void coord_init_after_last_item(coord_t * coord, const znode * node)
6980+{
6981+ int is_empty = node_is_empty(node);
6982+
6983+ coord_init_values(coord, node,
6984+ (is_empty ? 0 : node_num_items(node) - 1), 0,
6985+ (is_empty ? EMPTY_NODE : AFTER_ITEM));
6986+
6987+ assert("jmacd-9804", coord_check(coord));
6988+}
6989+
6990+/* Initialize a coordinate to after last unit in the item. Coord must be set
6991+ already to existing item */
6992+void coord_init_after_item_end(coord_t * coord)
6993+{
6994+ coord->between = AFTER_UNIT;
6995+ coord->unit_pos = coord_last_unit_pos(coord);
6996+}
6997+
6998+/* Initialize a coordinate to before the item. Coord must be set already to existing item */
6999+void coord_init_before_item(coord_t * coord)
7000+{
7001+ coord->unit_pos = 0;
7002+ coord->between = BEFORE_ITEM;
7003+}
7004+
7005+/* Initialize a coordinate to after the item. Coord must be set already to existing item */
7006+void coord_init_after_item(coord_t * coord)
7007+{
7008+ coord->unit_pos = 0;
7009+ coord->between = AFTER_ITEM;
7010+}
7011+
7012+/* Initialize a coordinate with zeroes. Used in places where init_coord was
7013+ used and it was not clear how the coordinate should actually be set up */
7014+void coord_init_zero(coord_t * coord)
7015+{
7016+ memset(coord, 0, sizeof(*coord));
7017+}
7018+
7019+/* Return the number of units at the present item. Asserts coord_is_existing_item(). */
7020+unsigned coord_num_units(const coord_t * coord)
7021+{
7022+ assert("jmacd-9806", coord_is_existing_item(coord));
7023+
7024+ return item_plugin_by_coord(coord)->b.nr_units(coord);
7025+}
7026+
7027+/* Returns true if the coord was initialized by coord_init_invalid(). */
7028+/* Audited by: green(2002.06.15) */
7029+int coord_is_invalid(const coord_t * coord)
7030+{
7031+ return coord->between == INVALID_COORD;
7032+}
7033+
7034+/* Returns true if the coordinate is positioned at an existing item, not before or after
7035+ an item. It may be placed at, before, or after any unit within the item, whether
7036+ existing or not. */
7037+int coord_is_existing_item(const coord_t * coord)
7038+{
7039+ switch (coord->between) {
7040+ case EMPTY_NODE:
7041+ case BEFORE_ITEM:
7042+ case AFTER_ITEM:
7043+ case INVALID_COORD:
7044+ return 0;
7045+
7046+ case BEFORE_UNIT:
7047+ case AT_UNIT:
7048+ case AFTER_UNIT:
7049+ return coord->item_pos < coord_num_items(coord);
7050+ }
7051+
7052+ impossible("jmacd-9900", "unreachable coord: %p", coord);
7053+ return 0;
7054+}
7055+
7056+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7057+ unit. */
7058+/* Audited by: green(2002.06.15) */
7059+int coord_is_existing_unit(const coord_t * coord)
7060+{
7061+ switch (coord->between) {
7062+ case EMPTY_NODE:
7063+ case BEFORE_UNIT:
7064+ case AFTER_UNIT:
7065+ case BEFORE_ITEM:
7066+ case AFTER_ITEM:
7067+ case INVALID_COORD:
7068+ return 0;
7069+
7070+ case AT_UNIT:
7071+ return (coord->item_pos < coord_num_items(coord)
7072+ && coord->unit_pos < coord_num_units(coord));
7073+ }
7074+
7075+ impossible("jmacd-9902", "unreachable");
7076+ return 0;
7077+}
7078+
7079+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7080+ true for empty nodes nor coordinates positioned before the first item. */
7081+/* Audited by: green(2002.06.15) */
7082+int coord_is_leftmost_unit(const coord_t * coord)
7083+{
7084+ return (coord->between == AT_UNIT && coord->item_pos == 0
7085+ && coord->unit_pos == 0);
7086+}
7087+
7088+#if REISER4_DEBUG
7089+/* For assertions only, checks for a valid coordinate. */
7090+int coord_check(const coord_t * coord)
7091+{
7092+ if (coord->node == NULL) {
7093+ return 0;
7094+ }
7095+ if (znode_above_root(coord->node))
7096+ return 1;
7097+
7098+ switch (coord->between) {
7099+ default:
7100+ case INVALID_COORD:
7101+ return 0;
7102+ case EMPTY_NODE:
7103+ if (!node_is_empty(coord->node)) {
7104+ return 0;
7105+ }
7106+ return coord->item_pos == 0 && coord->unit_pos == 0;
7107+
7108+ case BEFORE_UNIT:
7109+ case AFTER_UNIT:
7110+ if (node_is_empty(coord->node) && (coord->item_pos == 0)
7111+ && (coord->unit_pos == 0))
7112+ return 1;
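+		/* FALLTHROUGH */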
7113+ case AT_UNIT:
7114+ break;
7115+ case AFTER_ITEM:
7116+ case BEFORE_ITEM:
7117+ /* before/after item should not set unit_pos. */
7118+ if (coord->unit_pos != 0) {
7119+ return 0;
7120+ }
7121+ break;
7122+ }
7123+
7124+ if (coord->item_pos >= node_num_items(coord->node)) {
7125+ return 0;
7126+ }
7127+
7128+ /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7129+ between is set to either AFTER_ITEM or BEFORE_ITEM */
7130+ if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7131+ return 1;
7132+
7133+ if (coord_is_iplug_set(coord) &&
7134+ coord->unit_pos >
7135+ item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
7136+ return 0;
7137+ }
7138+ return 1;
7139+}
7140+#endif
7141+
7142+/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
7143+ Returns 1 if the new position does not exist. */
7144+static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
7145+{
7146+ /* If the node is invalid, leave it. */
7147+ if (coord->between == INVALID_COORD) {
7148+ return 1;
7149+ }
7150+
7151+ /* If the node is empty, set it appropriately. */
7152+ if (items == 0) {
7153+ coord->between = EMPTY_NODE;
7154+ coord_set_item_pos(coord, 0);
7155+ coord->unit_pos = 0;
7156+ return 1;
7157+ }
7158+
7159+ /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7160+ if (coord->between == EMPTY_NODE) {
7161+ coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7162+ coord_set_item_pos(coord, 0);
7163+ coord->unit_pos = 0;
7164+ return 0;
7165+ }
7166+
7167+ /* If the item_pos is out-of-range, set it appropriately. */
7168+ if (coord->item_pos >= items) {
7169+ coord->between = AFTER_ITEM;
7170+ coord_set_item_pos(coord, items - 1);
7171+ coord->unit_pos = 0;
7172+ /* If is_next, return 1 (can't go any further). */
7173+ return is_next;
7174+ }
7175+
7176+ return 0;
7177+}
7178+
7179+/* Advances the coordinate by one unit to the right. If empty, no change. If
7180+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an
7181+ existing unit. */
7182+int coord_next_unit(coord_t * coord)
7183+{
7184+ unsigned items = coord_num_items(coord);
7185+
7186+ if (coord_adjust_items(coord, items, 1) == 1) {
7187+ return 1;
7188+ }
7189+
7190+ switch (coord->between) {
7191+ case BEFORE_UNIT:
7192+ /* Now it is positioned at the same unit. */
7193+ coord->between = AT_UNIT;
7194+ return 0;
7195+
7196+ case AFTER_UNIT:
7197+ case AT_UNIT:
7198+ /* If it was at or after a unit and there are more units in this item,
7199+ advance to the next one. */
7200+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
7201+ coord->unit_pos += 1;
7202+ coord->between = AT_UNIT;
7203+ return 0;
7204+ }
7205+
7206+ /* Otherwise, it is crossing an item boundary and is treated as if
7207+ it were after the current item. */
7208+ coord->between = AFTER_ITEM;
7209+ coord->unit_pos = 0;
7210+ /* FALLTHROUGH */
7211+
7212+ case AFTER_ITEM:
7213+ /* Check for end-of-node. */
7214+ if (coord->item_pos == items - 1) {
7215+ return 1;
7216+ }
7217+
7218+ coord_inc_item_pos(coord);
7219+ coord->unit_pos = 0;
7220+ coord->between = AT_UNIT;
7221+ return 0;
7222+
7223+ case BEFORE_ITEM:
7224+ /* The adjust_items checks ensure that we are valid here. */
7225+ coord->unit_pos = 0;
7226+ coord->between = AT_UNIT;
7227+ return 0;
7228+
7229+ case INVALID_COORD:
7230+ case EMPTY_NODE:
7231+ /* Handled in coord_adjust_items(). */
7232+ break;
7233+ }
7234+
7235+ impossible("jmacd-9902", "unreachable");
7236+ return 0;
7237+}
7238+
7239+/* Advances the coordinate by one item to the right. If empty, no change. If
7240+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
7241+ an existing item. */
7242+int coord_next_item(coord_t * coord)
7243+{
7244+ unsigned items = coord_num_items(coord);
7245+
7246+ if (coord_adjust_items(coord, items, 1) == 1) {
7247+ return 1;
7248+ }
7249+
7250+ switch (coord->between) {
7251+ case AFTER_UNIT:
7252+ case AT_UNIT:
7253+ case BEFORE_UNIT:
7254+ case AFTER_ITEM:
7255+ /* Check for end-of-node. */
7256+ if (coord->item_pos == items - 1) {
7257+ coord->between = AFTER_ITEM;
7258+ coord->unit_pos = 0;
7259+ coord_clear_iplug(coord);
7260+ return 1;
7261+ }
7262+
7263+ /* Anywhere in an item, go to the next one. */
7264+ coord->between = AT_UNIT;
7265+ coord_inc_item_pos(coord);
7266+ coord->unit_pos = 0;
7267+ return 0;
7268+
7269+ case BEFORE_ITEM:
7270+ /* The out-of-range check ensures that we are valid here. */
7271+ coord->unit_pos = 0;
7272+ coord->between = AT_UNIT;
7273+ return 0;
7274+ case INVALID_COORD:
7275+ case EMPTY_NODE:
7276+ /* Handled in coord_adjust_items(). */
7277+ break;
7278+ }
7279+
7280+ impossible("jmacd-9903", "unreachable");
7281+ return 0;
7282+}
7283+
7284+/* Advances the coordinate by one unit to the left. If empty, no change. If
7285+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7286+ is an existing unit. */
7287+int coord_prev_unit(coord_t * coord)
7288+{
7289+ unsigned items = coord_num_items(coord);
7290+
7291+ if (coord_adjust_items(coord, items, 0) == 1) {
7292+ return 1;
7293+ }
7294+
7295+ switch (coord->between) {
7296+ case AT_UNIT:
7297+ case BEFORE_UNIT:
7298+ if (coord->unit_pos > 0) {
7299+ coord->unit_pos -= 1;
7300+ coord->between = AT_UNIT;
7301+ return 0;
7302+ }
7303+
7304+ if (coord->item_pos == 0) {
7305+ coord->between = BEFORE_ITEM;
7306+ return 1;
7307+ }
7308+
7309+ coord_dec_item_pos(coord);
7310+ coord->unit_pos = coord_last_unit_pos(coord);
7311+ coord->between = AT_UNIT;
7312+ return 0;
7313+
7314+ case AFTER_UNIT:
7315+ /* What if unit_pos is out-of-range? */
7316+ assert("jmacd-5442",
7317+ coord->unit_pos <= coord_last_unit_pos(coord));
7318+ coord->between = AT_UNIT;
7319+ return 0;
7320+
7321+ case BEFORE_ITEM:
7322+ if (coord->item_pos == 0) {
7323+ return 1;
7324+ }
7325+
7326+ coord_dec_item_pos(coord);
7327+ /* FALLTHROUGH */
7328+
7329+ case AFTER_ITEM:
7330+ coord->between = AT_UNIT;
7331+ coord->unit_pos = coord_last_unit_pos(coord);
7332+ return 0;
7333+
7334+ case INVALID_COORD:
7335+ case EMPTY_NODE:
7336+ break;
7337+ }
7338+
7339+ impossible("jmacd-9904", "unreachable");
7340+ return 0;
7341+}
7342+
7343+/* Advances the coordinate by one item to the left. If empty, no change. If
7344+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7345+ is an existing item. */
7346+int coord_prev_item(coord_t * coord)
7347+{
7348+ unsigned items = coord_num_items(coord);
7349+
7350+ if (coord_adjust_items(coord, items, 0) == 1) {
7351+ return 1;
7352+ }
7353+
7354+ switch (coord->between) {
7355+ case AT_UNIT:
7356+ case AFTER_UNIT:
7357+ case BEFORE_UNIT:
7358+ case BEFORE_ITEM:
7359+
7360+ if (coord->item_pos == 0) {
7361+ coord->between = BEFORE_ITEM;
7362+ coord->unit_pos = 0;
7363+ return 1;
7364+ }
7365+
7366+ coord_dec_item_pos(coord);
7367+ coord->unit_pos = 0;
7368+ coord->between = AT_UNIT;
7369+ return 0;
7370+
7371+ case AFTER_ITEM:
7372+ coord->between = AT_UNIT;
7373+ coord->unit_pos = 0;
7374+ return 0;
7375+
7376+ case INVALID_COORD:
7377+ case EMPTY_NODE:
7378+ break;
7379+ }
7380+
7381+ impossible("jmacd-9905", "unreachable");
7382+ return 0;
7383+}
7384+
7385+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7386+void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
7387+{
7388+ assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7389+ if (dir == LEFT_SIDE) {
7390+ coord_init_first_unit(coord, node);
7391+ } else {
7392+ coord_init_last_unit(coord, node);
7393+ }
7394+}
7395+
7396+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
7397+ argument. */
7398+/* Audited by: green(2002.06.15) */
7399+int coord_is_after_sideof_unit(coord_t * coord, sideof dir)
7400+{
7401+ assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7402+ if (dir == LEFT_SIDE) {
7403+ return coord_is_before_leftmost(coord);
7404+ } else {
7405+ return coord_is_after_rightmost(coord);
7406+ }
7407+}
7408+
7409+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
7410+/* Audited by: green(2002.06.15) */
7411+int coord_sideof_unit(coord_t * coord, sideof dir)
7412+{
7413+ assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7414+ if (dir == LEFT_SIDE) {
7415+ return coord_prev_unit(coord);
7416+ } else {
7417+ return coord_next_unit(coord);
7418+ }
7419+}
7420+
7421+#if REISER4_DEBUG
7422+int coords_equal(const coord_t * c1, const coord_t * c2)
7423+{
7424+ assert("nikita-2840", c1 != NULL);
7425+ assert("nikita-2841", c2 != NULL);
7426+
7427+ return
7428+ c1->node == c2->node &&
7429+ c1->item_pos == c2->item_pos &&
7430+ c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7431+}
7432+#endif /* REISER4_DEBUG */
7433+
7434+/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
7435+ return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
7436+/* Audited by: green(2002.06.15) */
7437+coord_wrt_node coord_wrt(const coord_t * coord)
7438+{
7439+ if (coord_is_before_leftmost(coord)) {
7440+ return COORD_ON_THE_LEFT;
7441+ }
7442+
7443+ if (coord_is_after_rightmost(coord)) {
7444+ return COORD_ON_THE_RIGHT;
7445+ }
7446+
7447+ return COORD_INSIDE;
7448+}
7449+
7450+/* Returns true if the coordinate is positioned after the last item or after the last unit
7451+ of the last item or it is an empty node. */
7452+/* Audited by: green(2002.06.15) */
7453+int coord_is_after_rightmost(const coord_t * coord)
7454+{
7455+ assert("jmacd-7313", coord_check(coord));
7456+
7457+ switch (coord->between) {
7458+ case INVALID_COORD:
7459+ case AT_UNIT:
7460+ case BEFORE_UNIT:
7461+ case BEFORE_ITEM:
7462+ return 0;
7463+
7464+ case EMPTY_NODE:
7465+ return 1;
7466+
7467+ case AFTER_ITEM:
7468+ return (coord->item_pos == node_num_items(coord->node) - 1);
7469+
7470+ case AFTER_UNIT:
7471+ return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7472+ coord->unit_pos == coord_last_unit_pos(coord));
7473+ }
7474+
7475+ impossible("jmacd-9908", "unreachable");
7476+ return 0;
7477+}
7478+
7479+/* Returns true if the coordinate is positioned before the first item or it is an empty
7480+ node. */
7481+int coord_is_before_leftmost(const coord_t * coord)
7482+{
7483+ /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7484+ necessary to check if coord is set before leftmost
7485+ assert ("jmacd-7313", coord_check (coord)); */
7486+ switch (coord->between) {
7487+ case INVALID_COORD:
7488+ case AT_UNIT:
7489+ case AFTER_ITEM:
7490+ case AFTER_UNIT:
7491+ return 0;
7492+
7493+ case EMPTY_NODE:
7494+ return 1;
7495+
7496+ case BEFORE_ITEM:
7497+ case BEFORE_UNIT:
7498+ return (coord->item_pos == 0) && (coord->unit_pos == 0);
7499+ }
7500+
7501+ impossible("jmacd-9908", "unreachable");
7502+ return 0;
7503+}
7504+
7505+/* Returns true if the coordinate is positioned after an item, before an item, after the
7506+ last unit of an item, before the first unit of an item, or at an empty node. */
7507+/* Audited by: green(2002.06.15) */
7508+int coord_is_between_items(const coord_t * coord)
7509+{
7510+ assert("jmacd-7313", coord_check(coord));
7511+
7512+ switch (coord->between) {
7513+ case INVALID_COORD:
7514+ case AT_UNIT:
7515+ return 0;
7516+
7517+ case AFTER_ITEM:
7518+ case BEFORE_ITEM:
7519+ case EMPTY_NODE:
7520+ return 1;
7521+
7522+ case BEFORE_UNIT:
7523+ return coord->unit_pos == 0;
7524+
7525+ case AFTER_UNIT:
7526+ return coord->unit_pos == coord_last_unit_pos(coord);
7527+ }
7528+
7529+ impossible("jmacd-9908", "unreachable");
7530+ return 0;
7531+}
7532+
7533+#if REISER4_DEBUG
7534+/* Returns true if the coordinates are positioned at adjacent units, regardless of
7535+ before-after or item boundaries. */
7536+int coord_are_neighbors(coord_t * c1, coord_t * c2)
7537+{
7538+ coord_t *left;
7539+ coord_t *right;
7540+
7541+ assert("nikita-1241", c1 != NULL);
7542+ assert("nikita-1242", c2 != NULL);
7543+ assert("nikita-1243", c1->node == c2->node);
7544+ assert("nikita-1244", coord_is_existing_unit(c1));
7545+ assert("nikita-1245", coord_is_existing_unit(c2));
7546+
7547+ left = right = NULL;
7548+ switch (coord_compare(c1, c2)) {
7549+ case COORD_CMP_ON_LEFT:
7550+ left = c1;
7551+ right = c2;
7552+ break;
7553+ case COORD_CMP_ON_RIGHT:
7554+ left = c2;
7555+ right = c1;
7556+ break;
7557+ case COORD_CMP_SAME:
7558+ return 0;
7559+ default:
7560+ wrong_return_value("nikita-1246", "compare_coords()");
7561+ }
7562+ assert("vs-731", left && right);
7563+ if (left->item_pos == right->item_pos) {
7564+ return left->unit_pos + 1 == right->unit_pos;
7565+ } else if (left->item_pos + 1 == right->item_pos) {
7566+ return (left->unit_pos == coord_last_unit_pos(left))
7567+ && (right->unit_pos == 0);
7568+ } else {
7569+ return 0;
7570+ }
7571+}
7572+#endif /* REISER4_DEBUG */
7573+
7574+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
7575+ COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
7576+/* Audited by: green(2002.06.15) */
7577+coord_cmp coord_compare(coord_t * c1, coord_t * c2)
7578+{
7579+ assert("vs-209", c1->node == c2->node);
7580+ assert("vs-194", coord_is_existing_unit(c1)
7581+ && coord_is_existing_unit(c2));
7582+
7583+ if (c1->item_pos > c2->item_pos)
7584+ return COORD_CMP_ON_RIGHT;
7585+ if (c1->item_pos < c2->item_pos)
7586+ return COORD_CMP_ON_LEFT;
7587+ if (c1->unit_pos > c2->unit_pos)
7588+ return COORD_CMP_ON_RIGHT;
7589+ if (c1->unit_pos < c2->unit_pos)
7590+ return COORD_CMP_ON_LEFT;
7591+ return COORD_CMP_SAME;
7592+}
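+
+/*
+ * Editor's usage sketch (not part of the original patch):
+ *
+ *	switch (coord_compare(c1, c2)) {
+ *	case COORD_CMP_ON_LEFT:
+ *		... c1 lies to the left of c2 ...
+ *		break;
+ *	case COORD_CMP_SAME:
+ *		... same position ...
+ *		break;
+ *	case COORD_CMP_ON_RIGHT:
+ *		... c1 lies to the right of c2 ...
+ *		break;
+ *	}
+ */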
7593+
7594+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
7595+ non-zero if there is no position to the right. */
7596+int coord_set_to_right(coord_t * coord)
7597+{
7598+ unsigned items = coord_num_items(coord);
7599+
7600+ if (coord_adjust_items(coord, items, 1) == 1) {
7601+ return 1;
7602+ }
7603+
7604+ switch (coord->between) {
7605+ case AT_UNIT:
7606+ return 0;
7607+
7608+ case BEFORE_ITEM:
7609+ case BEFORE_UNIT:
7610+ coord->between = AT_UNIT;
7611+ return 0;
7612+
7613+ case AFTER_UNIT:
7614+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
7615+ coord->unit_pos += 1;
7616+ coord->between = AT_UNIT;
7617+ return 0;
7618+ } else {
7619+
7620+ coord->unit_pos = 0;
7621+
7622+ if (coord->item_pos == items - 1) {
7623+ coord->between = AFTER_ITEM;
7624+ return 1;
7625+ }
7626+
7627+ coord_inc_item_pos(coord);
7628+ coord->between = AT_UNIT;
7629+ return 0;
7630+ }
7631+
7632+ case AFTER_ITEM:
7633+ if (coord->item_pos == items - 1) {
7634+ return 1;
7635+ }
7636+
7637+ coord_inc_item_pos(coord);
7638+ coord->unit_pos = 0;
7639+ coord->between = AT_UNIT;
7640+ return 0;
7641+
7642+ case EMPTY_NODE:
7643+ return 1;
7644+
7645+ case INVALID_COORD:
7646+ break;
7647+ }
7648+
7649+ impossible("jmacd-9920", "unreachable");
7650+ return 0;
7651+}
7652+
7653+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
7654+ non-zero if there is no position to the left. */
7655+int coord_set_to_left(coord_t * coord)
7656+{
7657+ unsigned items = coord_num_items(coord);
7658+
7659+ if (coord_adjust_items(coord, items, 0) == 1) {
7660+ return 1;
7661+ }
7662+
7663+ switch (coord->between) {
7664+ case AT_UNIT:
7665+ return 0;
7666+
7667+ case AFTER_UNIT:
7668+ coord->between = AT_UNIT;
7669+ return 0;
7670+
7671+ case AFTER_ITEM:
7672+ coord->between = AT_UNIT;
7673+ coord->unit_pos = coord_last_unit_pos(coord);
7674+ return 0;
7675+
7676+ case BEFORE_UNIT:
7677+ if (coord->unit_pos > 0) {
7678+ coord->unit_pos -= 1;
7679+ coord->between = AT_UNIT;
7680+ return 0;
7681+ } else {
7682+
7683+ if (coord->item_pos == 0) {
7684+ coord->between = BEFORE_ITEM;
7685+ return 1;
7686+ }
7687+
7688+ coord->unit_pos = coord_last_unit_pos(coord);
7689+ coord_dec_item_pos(coord);
7690+ coord->between = AT_UNIT;
7691+ return 0;
7692+ }
7693+
7694+ case BEFORE_ITEM:
7695+ if (coord->item_pos == 0) {
7696+ return 1;
7697+ }
7698+
7699+ coord_dec_item_pos(coord);
7700+ coord->unit_pos = coord_last_unit_pos(coord);
7701+ coord->between = AT_UNIT;
7702+ return 0;
7703+
7704+ case EMPTY_NODE:
7705+ return 1;
7706+
7707+ case INVALID_COORD:
7708+ break;
7709+ }
7710+
7711+ impossible("jmacd-9920", "unreachable");
7712+ return 0;
7713+}
7714+
7715+static const char *coord_tween_tostring(between_enum n)
7716+{
7717+ switch (n) {
7718+ case BEFORE_UNIT:
7719+ return "before unit";
7720+ case BEFORE_ITEM:
7721+ return "before item";
7722+ case AT_UNIT:
7723+ return "at unit";
7724+ case AFTER_UNIT:
7725+ return "after unit";
7726+ case AFTER_ITEM:
7727+ return "after item";
7728+ case EMPTY_NODE:
7729+ return "empty node";
7730+ case INVALID_COORD:
7731+ return "invalid";
7732+ default:
7733+ {
7734+ static char buf[30];
7735+
7736+ sprintf(buf, "unknown: %i", n);
7737+ return buf;
7738+ }
7739+ }
7740+}
7741+
7742+void print_coord(const char *mes, const coord_t * coord, int node)
7743+{
7744+ if (coord == NULL) {
7745+ printk("%s: null\n", mes);
7746+ return;
7747+ }
7748+ printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
7749+ mes, coord->item_pos, coord->unit_pos,
7750+ coord_tween_tostring(coord->between), coord->iplugid);
7751+}
7752+
7753+int
7754+item_utmost_child_real_block(const coord_t * coord, sideof side,
7755+ reiser4_block_nr * blk)
7756+{
7757+ return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
7758+ side,
7759+ blk);
7760+}
7761+
7762+int item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
7763+{
7764+ return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
7765+}
7766+
7767+/* @count bytes of flow @f got written; update f->length,
7768+ f->data and f->key correspondingly */
7769+void move_flow_forward(flow_t * f, unsigned count)
7770+{
7771+ if (f->data)
7772+ f->data += count;
7773+ f->length -= count;
7774+ set_key_offset(&f->key, get_key_offset(&f->key) + count);
7775+}
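+
+/*
+ * Editor's sketch (not part of the original patch): after a write path has
+ * stored @written bytes from the head of a flow, it advances the flow so
+ * that f->key, f->data and f->length again describe the unwritten tail:
+ *
+ *	move_flow_forward(f, written);
+ */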
7776+
7777+/*
7778+ Local variables:
7779+ c-indentation-style: "K&R"
7780+ mode-name: "LC"
7781+ c-basic-offset: 8
7782+ tab-width: 8
7783+ fill-column: 120
7784+ scroll-step: 1
7785+ End:
7786+*/
7787diff -urN linux-2.6.24.orig/fs/reiser4/coord.h linux-2.6.24/fs/reiser4/coord.h
7788--- linux-2.6.24.orig/fs/reiser4/coord.h 1970-01-01 03:00:00.000000000 +0300
7789+++ linux-2.6.24/fs/reiser4/coord.h 2008-01-25 11:39:06.908200476 +0300
7790@@ -0,0 +1,389 @@
7791+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7792+
7793+/* Coords */
7794+
7795+#if !defined( __REISER4_COORD_H__ )
7796+#define __REISER4_COORD_H__
7797+
7798+#include "forward.h"
7799+#include "debug.h"
7800+#include "dformat.h"
7801+#include "key.h"
7802+
7803+/* insertions happen between coords in the tree, so we need some means
7804+ of specifying the sense of betweenness. */
7805+typedef enum {
7806+ BEFORE_UNIT, /* Note: we/init_coord depends on this value being zero. */
7807+ AT_UNIT,
7808+ AFTER_UNIT,
7809+ BEFORE_ITEM,
7810+ AFTER_ITEM,
7811+ INVALID_COORD,
7812+ EMPTY_NODE,
7813+} between_enum;
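+
+/*
+ * Editor's illustration (not part of the original patch): in a node holding
+ * one item with two units, a coord can occupy, left to right, the positions
+ *
+ *	BEFORE_ITEM -> BEFORE_UNIT(0) -> AT_UNIT(0) -> AFTER_UNIT(0)
+ *	            -> BEFORE_UNIT(1) -> AT_UNIT(1) -> AFTER_UNIT(1) -> AFTER_ITEM
+ *
+ * coord_next_unit()/coord_prev_unit() walk the AT_UNIT positions of this
+ * sequence; EMPTY_NODE and INVALID_COORD fall outside of it.
+ */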
7814+
7815+/* location of coord w.r.t. its node */
7816+typedef enum {
7817+ COORD_ON_THE_LEFT = -1,
7818+ COORD_ON_THE_RIGHT = +1,
7819+ COORD_INSIDE = 0
7820+} coord_wrt_node;
7821+
7822+typedef enum {
7823+ COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
7824+} coord_cmp;
7825+
7826+struct coord {
7827+ /* node in a tree */
7828+ /* 0 */ znode *node;
7829+
7830+ /* position of item within node */
7831+ /* 4 */ pos_in_node_t item_pos;
7832+ /* position of unit within item */
7833+ /* 6 */ pos_in_node_t unit_pos;
7834+ /* optimization: plugin of item is stored in coord_t. Until this was
7835+ implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
7836+ is invalidated (set to 0xff) on each modification of ->item_pos,
7837+ and all such modifications are funneled through coord_*_item_pos()
7838+ functions below.
7839+ */
7840+ /* 8 */ char iplugid;
7841+ /* position of coord w.r.t. to neighboring items and/or units.
7842+ Values are taken from &between_enum above.
7843+ */
7844+ /* 9 */ char between;
7845+ /* padding. It will be added by the compiler anyway to conform to the
7846+ * C language alignment requirements. We keep it here to be on the
7847+ * safe side and to have a clear picture of the memory layout of this
7848+ * structure. */
7849+ /* 10 */ __u16 pad;
7850+ /* 12 */ int offset;
7851+#if REISER4_DEBUG
7852+ unsigned long plug_v;
7853+ unsigned long body_v;
7854+#endif
7855+};
7856+
7857+#define INVALID_PLUGID ((char)((1 << 8) - 1))
7858+#define INVALID_OFFSET -1
7859+
7860+static inline void coord_clear_iplug(coord_t * coord)
7861+{
7862+ assert("nikita-2835", coord != NULL);
7863+ coord->iplugid = INVALID_PLUGID;
7864+ coord->offset = INVALID_OFFSET;
7865+}
7866+
7867+static inline int coord_is_iplug_set(const coord_t * coord)
7868+{
7869+ assert("nikita-2836", coord != NULL);
7870+ return coord->iplugid != INVALID_PLUGID;
7871+}
7872+
7873+static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
7874+{
7875+ assert("nikita-2478", coord != NULL);
7876+ coord->item_pos = pos;
7877+ coord_clear_iplug(coord);
7878+}
7879+
7880+static inline void coord_dec_item_pos(coord_t * coord)
7881+{
7882+ assert("nikita-2480", coord != NULL);
7883+ --coord->item_pos;
7884+ coord_clear_iplug(coord);
7885+}
7886+
7887+static inline void coord_inc_item_pos(coord_t * coord)
7888+{
7889+ assert("nikita-2481", coord != NULL);
7890+ ++coord->item_pos;
7891+ coord_clear_iplug(coord);
7892+}
7893+
7894+static inline void coord_add_item_pos(coord_t * coord, int delta)
7895+{
7896+ assert("nikita-2482", coord != NULL);
7897+ coord->item_pos += delta;
7898+ coord_clear_iplug(coord);
7899+}
7900+
7901+static inline void coord_invalid_item_pos(coord_t * coord)
7902+{
7903+ assert("nikita-2832", coord != NULL);
7904+ coord->item_pos = (unsigned short)~0;
7905+ coord_clear_iplug(coord);
7906+}
7907+
7908+/* Reverse a direction. */
7909+static inline sideof sideof_reverse(sideof side)
7910+{
7911+ return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
7912+}
7913+
7914+/* NOTE: There is a somewhat odd mixture of the following opposed terms:
7915+
7916+ "first" and "last"
7917+ "next" and "prev"
7918+ "before" and "after"
7919+ "leftmost" and "rightmost"
7920+
7921+ But I think the chosen names are decent the way they are.
7922+*/
7923+
7924+/* COORD INITIALIZERS */
7925+
7926+/* Initialize an invalid coordinate. */
7927+extern void coord_init_invalid(coord_t * coord, const znode * node);
7928+
7929+extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
7930+
7931+/* Initialize a coordinate to point at the first unit of the first item. If the node is
7932+ empty, it is positioned at the EMPTY_NODE. */
7933+extern void coord_init_first_unit(coord_t * coord, const znode * node);
7934+
7935+/* Initialize a coordinate to point at the last unit of the last item. If the node is
7936+ empty, it is positioned at the EMPTY_NODE. */
7937+extern void coord_init_last_unit(coord_t * coord, const znode * node);
7938+
7939+/* Initialize a coordinate to before the first item. If the node is empty, it is
7940+ positioned at the EMPTY_NODE. */
7941+extern void coord_init_before_first_item(coord_t * coord, const znode * node);
7942+
7943+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
7944+ at the EMPTY_NODE. */
7945+extern void coord_init_after_last_item(coord_t * coord, const znode * node);
7946+
7947+/* Initialize a coordinate to after last unit in the item. Coord must be set
7948+ already to existing item */
7949+void coord_init_after_item_end(coord_t * coord);
7950+
7951+/* Initialize a coordinate to before the item. Coord must be set already to existing item */
7952+void coord_init_before_item(coord_t *);
7953+/* Initialize a coordinate to after the item. Coord must be set already to existing item */
7954+void coord_init_after_item(coord_t *);
7955+
7956+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7957+extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
7958+ sideof dir);
7959+
7960+/* Initialize a coordinate with zeroes. Used in places where init_coord was
7961+ used and it was not clear how the coordinate should actually be set up.
7962+ FIXME-VS: added by vs (2002, june, 8) */
7963+extern void coord_init_zero(coord_t * coord);
7964+
7965+/* COORD METHODS */
7966+
7967+/* After shifting of node content, a coord that was previously set properly
7968+ may become invalid; try to "normalize" it. */
7969+void coord_normalize(coord_t * coord);
7970+
7971+/* Copy a coordinate. */
7972+extern void coord_dup(coord_t * coord, const coord_t * old_coord);
7973+
7974+/* Copy a coordinate without check. */
7975+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
7976+
7977+unsigned coord_num_units(const coord_t * coord);
7978+
7979+/* Return the last valid unit number at the present item (i.e.,
7980+ coord_num_units() - 1). */
7981+static inline unsigned coord_last_unit_pos(const coord_t * coord)
7982+{
7983+ return coord_num_units(coord) - 1;
7984+}
7985+
7986+#if REISER4_DEBUG
7987+/* For assertions only, checks for a valid coordinate. */
7988+extern int coord_check(const coord_t * coord);
7989+
7990+extern unsigned long znode_times_locked(const znode * z);
7991+
7992+static inline void coord_update_v(coord_t * coord)
7993+{
7994+ coord->plug_v = coord->body_v = znode_times_locked(coord->node);
7995+}
7996+#endif
7997+
7998+extern int coords_equal(const coord_t * c1, const coord_t * c2);
7999+
8000+extern void print_coord(const char *mes, const coord_t * coord, int print_node);
8001+
8002+/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
8003+ return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
8004+extern coord_wrt_node coord_wrt(const coord_t * coord);
8005+
8006+/* Returns true if the coordinates are positioned at adjacent units, regardless of
8007+ before-after or item boundaries. */
8008+extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
8009+
8010+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
8011+ COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
8012+extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
8013+
8014+/* COORD PREDICATES */
8015+
8016+/* Returns true if the coord was initialized by coord_init_invalid(). */
8017+extern int coord_is_invalid(const coord_t * coord);
8018+
8019+/* Returns true if the coordinate is positioned at an existing item, not before or after
8020+ an item. It may be placed at, before, or after any unit within the item, whether
8021+ existing or not. If this is true you can call methods of the item plugin. */
8022+extern int coord_is_existing_item(const coord_t * coord);
8023+
8024+/* Returns true if the coordinate is positioned after an item, before an item, after the
8025+ last unit of an item, before the first unit of an item, or at an empty node. */
8026+extern int coord_is_between_items(const coord_t * coord);
8027+
8028+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
8029+ unit. */
8030+extern int coord_is_existing_unit(const coord_t * coord);
8031+
8032+/* Returns true if the coordinate is positioned at an empty node. */
8033+extern int coord_is_empty(const coord_t * coord);
8034+
8035+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
8036+ true for empty nodes nor coordinates positioned before the first item. */
8037+extern int coord_is_leftmost_unit(const coord_t * coord);
8038+
8039+/* Returns true if the coordinate is positioned after the last item or after the last unit
8040+ of the last item or it is an empty node. */
8041+extern int coord_is_after_rightmost(const coord_t * coord);
8042+
8043+/* Returns true if the coordinate is positioned before the first item or it is an empty
8044+ node. */
8045+extern int coord_is_before_leftmost(const coord_t * coord);
8046+
8047+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
8048+ argument. */
8049+extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
8050+
8051+/* COORD MODIFIERS */
8052+
8053+/* Advances the coordinate by one unit to the right. If empty, no change. If
8054+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8055+ an existing unit. */
8056+extern int coord_next_unit(coord_t * coord);
8057+
8058+/* Advances the coordinate by one item to the right. If empty, no change. If
8059+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8060+ an existing item. */
8061+extern int coord_next_item(coord_t * coord);
8062+
8063+/* Advances the coordinate by one unit to the left. If empty, no change. If
8064+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8065+ is an existing unit. */
8066+extern int coord_prev_unit(coord_t * coord);
8067+
8068+/* Advances the coordinate by one item to the left. If empty, no change. If
8069+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8070+ is an existing item. */
8071+extern int coord_prev_item(coord_t * coord);
8072+
8073+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
8074+ non-zero if there is no position to the right. */
8075+extern int coord_set_to_right(coord_t * coord);
8076+
8077+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
8078+ non-zero if there is no position to the left. */
8079+extern int coord_set_to_left(coord_t * coord);
8080+
8081+/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success
8082+ and non-zero if the unit did not exist. */
8083+extern int coord_set_after_unit(coord_t * coord);
8084+
8085+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
8086+extern int coord_sideof_unit(coord_t * coord, sideof dir);
8087+
8088+/* iterate over all units in @node */
8089+#define for_all_units( coord, node ) \
8090+ for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8091+ coord_next_unit( coord ) == 0 ; )
8092+
8093+/* iterate over all items in @node */
8094+#define for_all_items( coord, node ) \
8095+ for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8096+ coord_next_item( coord ) == 0 ; )
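+
+/*
+ * Editor's usage sketch (not part of the original patch; process_item() is
+ * hypothetical):
+ *
+ *	coord_t coord;
+ *
+ *	for_all_items(&coord, node)
+ *		process_item(&coord);
+ */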
8097+
8098+/* COORD/ITEM METHODS */
8099+
8100+extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
8101+ reiser4_block_nr * blk);
8102+extern int item_utmost_child(const coord_t * coord, sideof side,
8103+ jnode ** child);
8104+
8105+/* a flow is a sequence of bytes being written to or read from the tree. The
8106+ tree will slice the flow into items while storing it into nodes, but all of
8107+ that is hidden from anything outside the tree. */
8108+
8109+struct flow {
8110+ reiser4_key key; /* key of start of flow's sequence of bytes */
8111+ loff_t length; /* length of flow's sequence of bytes */
8112+ char *data; /* start of flow's sequence of bytes */
8113+ int user; /* if 1 data is user space, 0 - kernel space */
8114+ rw_op op; /* NIKITA-FIXME-HANS: comment is where? */
8115+};
8116+
8117+void move_flow_forward(flow_t * f, unsigned count);
8118+
8119+/* &reiser4_item_data - description of data to be inserted or pasted
8120+
8121+ Q: articulate the reasons for the difference between this and flow.
8122+
8123+ A: Besides flows, we insert other things into the tree: stat data, directory
8124+ entries, etc. To insert them into the tree one has to provide this
8125+ structure. If one is going to insert a flow, one can use insert_flow,
8126+ for which this structure does not have to be created
8127+*/
8128+struct reiser4_item_data {
8129+ /* actual data to be inserted. If NULL, ->create_item() will not
8130+ do xmemcpy itself, leaving this up to the caller. This can
8131+ save some amount of unnecessary memory copying, for example,
8132+ during insertion of stat data.
8133+
8134+ */
8135+ char *data;
8136+ /* 1 if 'char * data' contains pointer to user space and 0 if it is
8137+ kernel space */
8138+ int user;
8139+ /* amount of data we are going to insert or paste */
8140+ int length;
8141+ /* "Arg" is opaque data that is passed down to the
8142+ ->create_item() method of node layout, which in turn
8143+ hands it to the ->create_hook() of item being created. This
8144+ arg is currently used by:
8145+
8146+ . ->create_hook() of internal item
8147+ (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8148+ . ->paste() method of directory item.
8149+ . ->create_hook() of extent item
8150+
8151+ For an internal item, this is the left "brother" of the new node
8152+ being inserted; it is used to add the new node into the sibling
8153+ list after the pointer to it was just inserted into the parent.
8154+
8155+ While ->arg does look like a somewhat unnecessary complication,
8156+ it actually saves a lot of headache in many places, because
8157+ all the data necessary to insert or paste new data into the tree
8158+ is collected in one place, and this eliminates a lot of extra
8159+ argument passing and storing everywhere.
8160+
8161+ */
8162+ void *arg;
8163+ /* plugin of item we are inserting */
8164+ item_plugin *iplug;
8165+};
8166+
8167+/* __REISER4_COORD_H__ */
8168+#endif
8169+
8170+/* Make Linus happy.
8171+ Local variables:
8172+ c-indentation-style: "K&R"
8173+ mode-name: "LC"
8174+ c-basic-offset: 8
8175+ tab-width: 8
8176+ fill-column: 120
8177+ scroll-step: 1
8178+ End:
8179+*/
8180diff -urN linux-2.6.24.orig/fs/reiser4/debug.c linux-2.6.24/fs/reiser4/debug.c
8181--- linux-2.6.24.orig/fs/reiser4/debug.c 1970-01-01 03:00:00.000000000 +0300
8182+++ linux-2.6.24/fs/reiser4/debug.c 2008-01-25 11:39:06.908200476 +0300
8183@@ -0,0 +1,308 @@
8184+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8185+ * reiser4/README */
8186+
8187+/* Debugging facilities. */
8188+
8189+/*
8190+ * This file contains generic debugging functions used by reiser4. Roughly
8191+ * following:
8192+ *
8193+ * panicking: reiser4_do_panic(), reiser4_print_prefix().
8194+ *
8195+ * locking:
8196+ * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
8197+ * reiser4_no_counters_are_held(), reiser4_commit_check_locks()
8198+ *
8199+ * error code monitoring (see comment before RETERR macro):
8200+ * reiser4_return_err(), reiser4_report_err().
8201+ *
8202+ * stack back-tracing: fill_backtrace()
8203+ *
8204+ * miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
8205+ * reiser4_debugtrap().
8206+ *
8207+ */
8208+
8209+#include "reiser4.h"
8210+#include "context.h"
8211+#include "super.h"
8212+#include "txnmgr.h"
8213+#include "znode.h"
8214+
8215+#include <linux/sysfs.h>
8216+#include <linux/slab.h>
8217+#include <linux/types.h>
8218+#include <linux/fs.h>
8219+#include <linux/spinlock.h>
8220+#include <linux/kallsyms.h>
8221+#include <linux/vmalloc.h>
8222+#include <linux/ctype.h>
8223+#include <linux/sysctl.h>
8224+#include <linux/hardirq.h>
8225+
8226+#if 0
8227+#if REISER4_DEBUG
8228+static void reiser4_report_err(void);
8229+#else
8230+#define reiser4_report_err() noop
8231+#endif
8232+#endif /* 0 */
8233+
8234+/*
8235+ * global buffer where message given to reiser4_panic is formatted.
8236+ */
8237+static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8238+
8239+/*
8240+ * lock protecting consistency of panic_buf under concurrent panics
8241+ */
8242+static DEFINE_SPINLOCK(panic_guard);
8243+
8244+/* Your best friend. Call it on each occasion. This is called by
8245+ fs/reiser4/debug.h:reiser4_panic(). */
8246+void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
8247+{
8248+ static int in_panic = 0;
8249+ va_list args;
8250+
8251+ /*
8252+ * check for recursive panic.
8253+ */
8254+ if (in_panic == 0) {
8255+ in_panic = 1;
8256+
8257+ spin_lock(&panic_guard);
8258+ va_start(args, format);
8259+ vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8260+ va_end(args);
8261+ printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8262+ spin_unlock(&panic_guard);
8263+
8264+ /*
8265+ * if kernel debugger is configured---drop in. Early dropping
8266+ * into kgdb is not always convenient, because panic message
8267+ * is not yet printed most of the time. But:
8268+ *
8269+ * (1) message can be extracted from printk_buf[]
8270+ * (declared static inside of printk()), and
8271+ *
8272+ * (2) sometimes the serial/kgdb combo dies while printing a
8273+ * long panic message, so it's more prudent to break into the
8274+ * debugger earlier.
8275+ *
8276+ */
8277+ DEBUGON(1);
8278+ }
8279+ /* to make gcc happy about noreturn attribute */
8280+ panic("%s", panic_buf);
8281+}
8282+
8283+#if 0
8284+void
8285+reiser4_print_prefix(const char *level, int reperr, const char *mid,
8286+ const char *function, const char *file, int lineno)
8287+{
8288+ const char *comm;
8289+ int pid;
8290+
8291+ if (unlikely(in_interrupt() || in_irq())) {
8292+ comm = "interrupt";
8293+ pid = 0;
8294+ } else {
8295+ comm = current->comm;
8296+ pid = current->pid;
8297+ }
8298+ printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8299+ level, comm, pid, function, file, lineno, mid);
8300+ if (reperr)
8301+ reiser4_report_err();
8302+}
8303+#endif /* 0 */
8304+
8305+/* Preemption point: this should be called periodically during long running
8306+ operations (carry, allocate, and squeeze are best examples) */
8307+int reiser4_preempt_point(void)
8308+{
8309+ assert("nikita-3008", reiser4_schedulable());
8310+ cond_resched();
8311+ return signal_pending(current);
8312+}
8313+
8314+#if REISER4_DEBUG
8315+/* Debugging aid: return struct where information about locks taken by current
8316+ thread is accumulated. This can be used to formulate lock ordering
8317+ constraints and various assertions.
8318+
8319+*/
8320+reiser4_lock_cnt_info *reiser4_lock_counters(void)
8321+{
8322+ reiser4_context *ctx = get_current_context();
8323+ assert("jmacd-1123", ctx != NULL);
8324+ return &ctx->locks;
8325+}
8326+
8327+/*
8328+ * print human readable information about locks held by the reiser4 context.
8329+ */
8330+static void print_lock_counters(const char *prefix,
8331+ const reiser4_lock_cnt_info * info)
8332+{
8333+ printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8334+ "jload: %i, "
8335+ "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8336+ "ktxnmgrd: %i, fq: %i\n"
8337+ "inode: %i, "
8338+	       "cbk_cache: %i (r:%i,w:%i), "
8339+ "eflush: %i, "
8340+ "zlock: %i,\n"
8341+ "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8342+ "d: %i, x: %i, t: %i\n", prefix,
8343+ info->spin_locked_jnode,
8344+ info->rw_locked_tree, info->read_locked_tree,
8345+ info->write_locked_tree,
8346+ info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8347+ info->spin_locked_jload,
8348+ info->spin_locked_txnh,
8349+ info->spin_locked_atom, info->spin_locked_stack,
8350+ info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8351+ info->spin_locked_fq,
8352+ info->spin_locked_inode,
8353+ info->rw_locked_cbk_cache,
8354+ info->read_locked_cbk_cache,
8355+ info->write_locked_cbk_cache,
8356+ info->spin_locked_super_eflush,
8357+ info->spin_locked_zlock,
8358+ info->spin_locked,
8359+ info->long_term_locked_znode,
8360+ info->inode_sem_r, info->inode_sem_w,
8361+ info->d_refs, info->x_refs, info->t_refs);
8362+}
8363+
8364+/* check that no spinlocks are held */
8365+int reiser4_schedulable(void)
8366+{
8367+ if (get_current_context_check() != NULL) {
8368+ if (!LOCK_CNT_NIL(spin_locked)) {
8369+ print_lock_counters("in atomic", reiser4_lock_counters());
8370+ return 0;
8371+ }
8372+ }
8373+ might_sleep();
8374+ return 1;
8375+}
8376+/*
8377+ * return true, iff no locks are held.
8378+ */
8379+int reiser4_no_counters_are_held(void)
8380+{
8381+ reiser4_lock_cnt_info *counters;
8382+
8383+ counters = reiser4_lock_counters();
8384+ return
8385+ (counters->spin_locked_zlock == 0) &&
8386+ (counters->spin_locked_jnode == 0) &&
8387+ (counters->rw_locked_tree == 0) &&
8388+ (counters->read_locked_tree == 0) &&
8389+ (counters->write_locked_tree == 0) &&
8390+ (counters->rw_locked_dk == 0) &&
8391+ (counters->read_locked_dk == 0) &&
8392+ (counters->write_locked_dk == 0) &&
8393+ (counters->spin_locked_txnh == 0) &&
8394+ (counters->spin_locked_atom == 0) &&
8395+ (counters->spin_locked_stack == 0) &&
8396+ (counters->spin_locked_txnmgr == 0) &&
8397+ (counters->spin_locked_inode == 0) &&
8398+ (counters->spin_locked == 0) &&
8399+ (counters->long_term_locked_znode == 0) &&
8400+ (counters->inode_sem_r == 0) &&
8401+ (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8402+}
8403+
8404+/*
8405+ * return true, iff transaction commit can be done under locks held by the
8406+ * current thread.
8407+ */
8408+int reiser4_commit_check_locks(void)
8409+{
8410+ reiser4_lock_cnt_info *counters;
8411+ int inode_sem_r;
8412+ int inode_sem_w;
8413+ int result;
8414+
8415+ /*
8416+ * inode's read/write semaphore is the only reiser4 lock that can be
8417+ * held during commit.
8418+ */
8419+
8420+ counters = reiser4_lock_counters();
8421+ inode_sem_r = counters->inode_sem_r;
8422+ inode_sem_w = counters->inode_sem_w;
8423+
8424+ counters->inode_sem_r = counters->inode_sem_w = 0;
8425+ result = reiser4_no_counters_are_held();
8426+ counters->inode_sem_r = inode_sem_r;
8427+ counters->inode_sem_w = inode_sem_w;
8428+ return result;
8429+}
8430+
8431+/*
8432+ * fill "error site" in the current reiser4 context. See comment before RETERR
8433+ * macro for more details.
8434+ */
8435+void reiser4_return_err(int code, const char *file, int line)
8436+{
8437+ if (code < 0 && is_in_reiser4_context()) {
8438+ reiser4_context *ctx = get_current_context();
8439+
8440+ if (ctx != NULL) {
8441+ ctx->err.code = code;
8442+ ctx->err.file = file;
8443+ ctx->err.line = line;
8444+ }
8445+ }
8446+}
8447+
8448+#if 0
8449+/*
8450+ * report error information recorded by reiser4_return_err().
8451+ */
8452+static void reiser4_report_err(void)
8453+{
8454+ reiser4_context *ctx = get_current_context_check();
8455+
8456+ if (ctx != NULL) {
8457+ if (ctx->err.code != 0) {
8458+ printk("code: %i at %s:%i\n",
8459+ ctx->err.code, ctx->err.file, ctx->err.line);
8460+ }
8461+ }
8462+}
8463+#endif /* 0 */
8464+
8465+#endif /* REISER4_DEBUG */
8466+
8467+#if KERNEL_DEBUGGER
8468+
8469+/*
8470+ * this function just drops into the kernel debugger. It is a convenient
8471+ * place to put a breakpoint in.
8472+ */
8473+void reiser4_debugtrap(void)
8474+{
8475+ /* do nothing. Put break point here. */
8476+#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8477+ extern void breakpoint(void);
8478+ breakpoint();
8479+#endif
8480+}
8481+#endif
8482+
8483+/* Make Linus happy.
8484+ Local variables:
8485+ c-indentation-style: "K&R"
8486+ mode-name: "LC"
8487+ c-basic-offset: 8
8488+ tab-width: 8
8489+ fill-column: 120
8490+ End:
8491+*/
8492diff -urN linux-2.6.24.orig/fs/reiser4/debug.h linux-2.6.24/fs/reiser4/debug.h
8493--- linux-2.6.24.orig/fs/reiser4/debug.h 1970-01-01 03:00:00.000000000 +0300
8494+++ linux-2.6.24/fs/reiser4/debug.h 2008-01-25 11:39:06.908200476 +0300
8495@@ -0,0 +1,350 @@
8496+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8497+
8498+/* Declarations of debug macros. */
8499+
8500+#if !defined( __FS_REISER4_DEBUG_H__ )
8501+#define __FS_REISER4_DEBUG_H__
8502+
8503+#include "forward.h"
8504+#include "reiser4.h"
8505+
8506+/* generic function to produce formatted output, decorating it with
8507+ whatever standard prefixes/postfixes we want. "Fun" is a function
8508+ that will actually be called; it can be printk, panic, etc.
8509+ This is for use by other debugging macros, not by users. */
8510+#define DCALL(lev, fun, reperr, label, format, ...) \
8511+({ \
8512+ fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \
8513+ current->comm, current->pid, __FUNCTION__, \
8514+ __FILE__, __LINE__, label, ## __VA_ARGS__); \
8515+})
8516+
8517+/*
8518+ * cause kernel to crash
8519+ */
8520+#define reiser4_panic(mid, format, ...) \
8521+ DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8522+
8523+/* print message with indication of current process, file, line and
8524+ function */
8525+#define reiser4_log(label, format, ...) \
8526+ DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8527+
8528+/* Assertion checked during compilation.
8529+ If "cond" is false (0) we get duplicate case label in switch.
8530+ Use this to check something like famous
8531+ cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8532+ in 3.x journal.c. If the cassertion fails you get a compiler error,
8533+ so no "maintainer-id".
8534+*/
8535+#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
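/* Editorial example (not part of the original patch): a minimal sketch of
 * cassert() use, e.g. pinning down an on-disk type size:
 *
 *	cassert(sizeof(d64) == 8);
 *
 * If the condition ever becomes false, the switch gains a duplicate case
 * label and compilation fails at that line. */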
8536+
8537+#define noop do {;} while(0)
8538+
8539+#if REISER4_DEBUG
8540+/* version of info that only actually prints anything when _d_ebugging
8541+ is on */
8542+#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
8543+/* macro to catch logical errors. Put it into `default' clause of
8544+ switch() statement. */
8545+#define impossible(label, format, ...) \
8546+ reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
8547+/* assert assures that @cond is true. If it is not, reiser4_panic() is
8548+ called. Use this for checking logical consistency and _never_ call
8549+ this to check correctness of external data: disk blocks and user input. */
8550+#define assert(label, cond) \
8551+({ \
8552+ /* call_on_each_assert(); */ \
8553+ if (cond) { \
8554+ /* put negated check to avoid using !(cond) that would lose \
8555+ * warnings for things like assert(a = b); */ \
8556+ ; \
8557+ } else { \
8558+ DEBUGON(1); \
8559+ reiser4_panic(label, "assertion failed: %s", #cond); \
8560+ } \
8561+})
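/* Editorial sketch (not part of the original patch): typical assert() use
 * with the "maintainer-id" label convention seen throughout reiser4; the
 * label below is hypothetical:
 *
 *	assert("xxxx-0001", coord != NULL);
 *
 * On failure this drops into the debugger via DEBUGON(1) and then calls
 * reiser4_panic() with the label and the stringified condition. */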
8562+
8563+/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
8564+#define check_me( label, expr ) assert( label, ( expr ) )
8565+
8566+#define ON_DEBUG( exp ) exp
8567+
8568+extern int reiser4_schedulable(void);
8569+extern void call_on_each_assert(void);
8570+
8571+#else
8572+
8573+#define dinfo( format, args... ) noop
8574+#define impossible( label, format, args... ) noop
8575+#define assert( label, cond ) noop
8576+#define check_me( label, expr ) ( ( void ) ( expr ) )
8577+#define ON_DEBUG( exp )
8578+#define reiser4_schedulable() might_sleep()
8579+
8580+/* REISER4_DEBUG */
8581+#endif
8582+
8583+#if REISER4_DEBUG
8584+/* per-thread information about lock acquired by this thread. Used by lock
8585+ * ordering checking in spin_macros.h */
8586+typedef struct reiser4_lock_cnt_info {
8587+ int rw_locked_tree;
8588+ int read_locked_tree;
8589+ int write_locked_tree;
8590+
8591+ int rw_locked_dk;
8592+ int read_locked_dk;
8593+ int write_locked_dk;
8594+
8595+ int rw_locked_cbk_cache;
8596+ int read_locked_cbk_cache;
8597+ int write_locked_cbk_cache;
8598+
8599+ int spin_locked_zlock;
8600+ int spin_locked_jnode;
8601+ int spin_locked_jload;
8602+ int spin_locked_txnh;
8603+ int spin_locked_atom;
8604+ int spin_locked_stack;
8605+ int spin_locked_txnmgr;
8606+ int spin_locked_ktxnmgrd;
8607+ int spin_locked_fq;
8608+ int spin_locked_inode;
8609+ int spin_locked_super_eflush;
8610+ int spin_locked;
8611+ int long_term_locked_znode;
8612+
8613+ int inode_sem_r;
8614+ int inode_sem_w;
8615+
8616+ int d_refs;
8617+ int x_refs;
8618+ int t_refs;
8619+} reiser4_lock_cnt_info;
8620+
8621+extern struct reiser4_lock_cnt_info *reiser4_lock_counters(void);
8622+#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
8623+
8624+/* increment lock-counter @counter, if present */
8625+#define LOCK_CNT_INC(counter) \
8626+ IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
8627+
8628+/* decrement lock-counter @counter, if present */
8629+#define LOCK_CNT_DEC(counter) \
8630+ IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
8631+
8632+/* check that lock-counter is zero. This is for use in assertions */
8633+#define LOCK_CNT_NIL(counter) \
8634+ IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
8635+
8636+/* check that lock-counter is greater than zero. This is for use in
8637+ * assertions */
8638+#define LOCK_CNT_GTZ(counter) \
8639+ IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
8640+#define LOCK_CNT_LT(counter,n) \
8641+ IN_CONTEXT(reiser4_lock_counters()->counter < n, 1)
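/* Editorial sketch (not part of the original patch): the counters are meant
 * to bracket lock acquisition and to back assertions; the lock name and the
 * label here are hypothetical:
 *
 *	spin_lock(&node->guard);
 *	LOCK_CNT_INC(spin_locked_jnode);
 *	LOCK_CNT_INC(spin_locked);
 *	...
 *	assert("xxxx-0002", LOCK_CNT_NIL(spin_locked));	(e.g. before sleeping)
 *
 * With REISER4_DEBUG off, all of this compiles down to the no-ops below. */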
8642+
8643+#else /* REISER4_DEBUG */
8644+
8645+/* no-op versions of the above */
8646+
8647+typedef struct reiser4_lock_cnt_info {
8648+} reiser4_lock_cnt_info;
8649+
8650+#define reiser4_lock_counters() ((reiser4_lock_cnt_info *)NULL)
8651+#define LOCK_CNT_INC(counter) noop
8652+#define LOCK_CNT_DEC(counter) noop
8653+#define LOCK_CNT_NIL(counter) (1)
8654+#define LOCK_CNT_GTZ(counter) (1)
8655+#define LOCK_CNT_LT(counter,n) (1)
8656+
8657+#endif /* REISER4_DEBUG */
8658+
8659+#define assert_spin_not_locked(lock) BUG_ON(0)
8660+#define assert_rw_write_locked(lock) BUG_ON(0)
8661+#define assert_rw_read_locked(lock) BUG_ON(0)
8662+#define assert_rw_locked(lock) BUG_ON(0)
8663+#define assert_rw_not_write_locked(lock) BUG_ON(0)
8664+#define assert_rw_not_read_locked(lock) BUG_ON(0)
8665+#define assert_rw_not_locked(lock) BUG_ON(0)
8666+
8667+/* flags controlling debugging behavior. Are set through debug_flags=N mount
8668+ option. */
8669+typedef enum {
8670+ /* print a lot of information during panic. When this is on all jnodes
8671+ * are listed. This can be *very* large output. Usually you don't want
8672+ * this. Especially over serial line. */
8673+ REISER4_VERBOSE_PANIC = 0x00000001,
8674+ /* print a lot of information during umount */
8675+ REISER4_VERBOSE_UMOUNT = 0x00000002,
8676+ /* print gathered statistics on umount */
8677+ REISER4_STATS_ON_UMOUNT = 0x00000004,
8678+ /* check node consistency */
8679+ REISER4_CHECK_NODE = 0x00000008
8680+} reiser4_debug_flags;
8681+
8682+extern int is_in_reiser4_context(void);
8683+
8684+/*
8685+ * evaluate expression @e only when within reiser4 context
8686+ */
8687+#define ON_CONTEXT(e) do { \
8688+ if(is_in_reiser4_context()) { \
8689+ e; \
8690+ } } while(0)
8691+
8692+/*
8693+ * evaluate expression @e only when within reiser4_context and debugging is
8694+ * on.
8695+ */
8696+#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
8697+
8698+/*
8699+ * complain about unexpected function result and crash. Used in "default"
8700+ * branches of switch statements and alike to assert that invalid results are
8701+ * not silently ignored.
8702+ */
8703+#define wrong_return_value( label, function ) \
8704+ impossible( label, "wrong return value from " function )
8705+
8706+/* Issue different types of reiser4 messages to the console */
8707+#define warning( label, format, ... ) \
8708+ DCALL( KERN_WARNING, \
8709+ printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
8710+#define notice( label, format, ... ) \
8711+ DCALL( KERN_NOTICE, \
8712+ printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ )
8713+
8714+/* mark not yet implemented functionality */
8715+#define not_yet( label, format, ... ) \
8716+ reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
8717+
8718+extern void reiser4_do_panic(const char *format, ...)
8719+ __attribute__ ((noreturn, format(printf, 1, 2)));
8720+
8721+extern int reiser4_preempt_point(void);
8722+extern void reiser4_print_stats(void);
8723+
8724+#if REISER4_DEBUG
8725+extern int reiser4_no_counters_are_held(void);
8726+extern int reiser4_commit_check_locks(void);
8727+#else
8728+#define reiser4_no_counters_are_held() (1)
8729+#define reiser4_commit_check_locks() (1)
8730+#endif
8731+
8732+/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
8733+#define IS_POW(i) \
8734+({ \
8735+ typeof(i) __i; \
8736+ \
8737+ __i = (i); \
8738+ !(__i & (__i - 1)); \
8739+})
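/* Editorial note (not part of the original patch): IS_POW(i) is
 * !(i & (i - 1)), true for 1, 2, 4, 8, ... (and, as a quirk, for 0). A
 * hedged sketch of the rate-limited-warning use mentioned above, with a
 * hypothetical counter and label:
 *
 *	if (IS_POW(++fail_count))
 *		warning("xxxx-0003", "failed %i times", fail_count);
 */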
8740+
8741+#define KERNEL_DEBUGGER (1)
8742+
8743+#if KERNEL_DEBUGGER
8744+
8745+extern void reiser4_debugtrap(void);
8746+
8747+/*
8748+ * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
8749+ * kgdb is not compiled in, do nothing.
8750+ */
8751+#define DEBUGON(cond) \
8752+({ \
8753+ if (unlikely(cond)) \
8754+ reiser4_debugtrap(); \
8755+})
8756+#else
8757+#define DEBUGON(cond) noop
8758+#endif
8759+
8760+/*
8761+ * Error code tracing facility. (Idea is borrowed from XFS code.)
8762+ *
8763+ * Suppose some strange and/or unexpected code is returned from some function
8764+ * (for example, write(2) returns -EEXIST). It is possible to place a
8765+ * breakpoint in the reiser4_write(), but it is too late here. How to find out
8766+ * in what particular place -EEXIST was generated first?
8767+ *
8768+ * In reiser4 all places where actual error codes are produced (that is,
8769+ * statements of the form
8770+ *
8771+ * return -EFOO; // (1), or
8772+ *
8773+ * result = -EFOO; // (2)
8774+ *
8775+ * are replaced with
8776+ *
8777+ * return RETERR(-EFOO); // (1a), and
8778+ *
8779+ * result = RETERR(-EFOO); // (2a) respectively
8780+ *
8781+ * The RETERR() macro fills an "error site" in reiser4_context. This site is
8782+ * printed in error and warning messages. Moreover, it's possible to put a
8783+ * conditional breakpoint in reiser4_return_err (low-level function called
8784+ * by RETERR() to do the actual work) to break into debugger immediately
8785+ * when particular error happens.
8786+ *
8787+ */
8788+
8789+#if REISER4_DEBUG
8790+
8791+/*
8792+ * data-type to store information about where error happened ("error site").
8793+ */
8794+typedef struct err_site {
8795+ int code; /* error code */
8796+ const char *file; /* source file, filled by __FILE__ */
8797+ int line; /* source file line, filled by __LINE__ */
8798+} err_site;
8799+
8800+extern void reiser4_return_err(int code, const char *file, int line);
8801+
8802+/*
8803+ * fill &get_current_context()->err_site with error information.
8804+ */
8805+#define RETERR(code) \
8806+({ \
8807+ typeof(code) __code; \
8808+ \
8809+ __code = (code); \
8810+ reiser4_return_err(__code, __FILE__, __LINE__); \
8811+ __code; \
8812+})
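/* Editorial sketch (not part of the original patch): RETERR() at an
 * error-producing site, exactly as dscale.c later in this patch does:
 *
 *	default:
 *		return RETERR(-EIO);
 *
 * A conditional breakpoint in reiser4_return_err() on code == -EIO then
 * stops where the error is first generated, not where it surfaces. */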
8813+
8814+#else
8815+
8816+/*
8817+ * no-op versions of the above
8818+ */
8819+
8820+typedef struct err_site {
8821+} err_site;
8822+#define RETERR(code) code
8823+#endif
8824+
8825+#if REISER4_LARGE_KEY
8826+/*
8827+ * conditionally compile arguments only if REISER4_LARGE_KEY is on.
8828+ */
8829+#define ON_LARGE_KEY(...) __VA_ARGS__
8830+#else
8831+#define ON_LARGE_KEY(...)
8832+#endif
8833+
8834+/* __FS_REISER4_DEBUG_H__ */
8835+#endif
8836+
8837+/* Make Linus happy.
8838+ Local variables:
8839+ c-indentation-style: "K&R"
8840+ mode-name: "LC"
8841+ c-basic-offset: 8
8842+ tab-width: 8
8843+ fill-column: 120
8844+ End:
8845+*/
8846diff -urN linux-2.6.24.orig/fs/reiser4/dformat.h linux-2.6.24/fs/reiser4/dformat.h
8847--- linux-2.6.24.orig/fs/reiser4/dformat.h 1970-01-01 03:00:00.000000000 +0300
8848+++ linux-2.6.24/fs/reiser4/dformat.h 2008-01-25 11:39:06.908200476 +0300
8849@@ -0,0 +1,70 @@
8850+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8851+
8852+/* Formats of on-disk data and conversion functions. */
8853+
8854+/* put all item formats in the files describing the particular items.
8855+ Our model is: everything you need to do to add an item to reiser4
8856+ (excepting the changes to the plugin that uses the item, which go
8857+ into the file defining that plugin) goes into one file. */
8858+/* Data on disk are stored in little-endian format.
8859+ To declare fields of on-disk structures, use d8, d16, d32 and d64.
8860+ Use the le??_to_cpu()/cpu_to_le??() helpers to convert. */
8861+
8862+#if !defined( __FS_REISER4_DFORMAT_H__ )
8863+#define __FS_REISER4_DFORMAT_H__
8864+
8865+#include <asm/byteorder.h>
8866+#include <asm/unaligned.h>
8867+#include <linux/types.h>
8868+
8869+typedef __u8 d8;
8870+typedef __le16 d16;
8871+typedef __le32 d32;
8872+typedef __le64 d64;
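/* Editorial sketch (not part of the original patch): since d16/d32/d64 are
 * plain __le16/__le32/__le64, the standard kernel byte-order helpers apply;
 * "blocknr" is a hypothetical variable:
 *
 *	d64 on_disk = cpu_to_le64(blocknr);		(CPU -> on-disk format)
 *	reiser4_block_nr nr = le64_to_cpu(on_disk);	(and back)
 */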
8873+
8874+#define PACKED __attribute__((packed))
8875+
8876+/* data-type for block number */
8877+typedef __u64 reiser4_block_nr;
8878+
8879+/* data-type for block number on disk, disk format */
8880+typedef __le64 reiser4_dblock_nr;
8881+
8882+/**
8883+ * disk_addr_eq - compare disk addresses
8884+ * @b1: pointer to block number to compare
8885+ * @b2: pointer to block number to compare
8886+ *
8887+ * Returns true if disk addresses are the same
8888+ */
8889+static inline int disk_addr_eq(const reiser4_block_nr *b1,
8890+ const reiser4_block_nr * b2)
8891+{
8892+ assert("nikita-1033", b1 != NULL);
8893+ assert("nikita-1266", b2 != NULL);
8894+
8895+ return !memcmp(b1, b2, sizeof *b1);
8896+}
8897+
8898+/* structure of master reiser4 super block */
8899+typedef struct reiser4_master_sb {
8900+ char magic[16]; /* "ReIsEr4" */
8901+ __le16 disk_plugin_id; /* id of disk layout plugin */
8902+ __le16 blocksize;
8903+ char uuid[16]; /* unique id */
8904+ char label[16]; /* filesystem label */
8905+ __le64 diskmap; /* location of the diskmap. 0 if not present */
8906+} reiser4_master_sb;
8907+
8908+/* __FS_REISER4_DFORMAT_H__ */
8909+#endif
8910+
8911+/*
8912+ * Local variables:
8913+ * c-indentation-style: "K&R"
8914+ * mode-name: "LC"
8915+ * c-basic-offset: 8
8916+ * tab-width: 8
8917+ * fill-column: 79
8918+ * End:
8919+ */
8920diff -urN linux-2.6.24.orig/fs/reiser4/dscale.c linux-2.6.24/fs/reiser4/dscale.c
8921--- linux-2.6.24.orig/fs/reiser4/dscale.c 1970-01-01 03:00:00.000000000 +0300
8922+++ linux-2.6.24/fs/reiser4/dscale.c 2008-01-25 11:55:43.884539336 +0300
8923@@ -0,0 +1,192 @@
8924+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8925+ * reiser4/README */
8926+
8927+/* Scalable on-disk integers */
8928+
8929+/*
8930+ * Various on-disk structures contain integer-like structures. Stat-data
8931+ * contain [yes, "data" is plural, check the dictionary] file size, link
8932+ * count; an extent unit contains the extent width, etc. To accommodate the
8933+ * general case, enough space is reserved to keep the largest possible value:
8934+ * 64 bits in all cases above. But in the overwhelming majority of cases the
8935+ * numbers actually stored in these fields will be comparatively small, and
8936+ * reserving 8 bytes is a waste of precious disk bandwidth.
8937+ *
8938+ * Scalable integers are one way to solve this problem. dscale_write()
8939+ * function stores __u64 value in the given area consuming from 1 to 9 bytes,
8940+ * depending on the magnitude of the value supplied. dscale_read() reads value
8941+ * previously stored by dscale_write().
8942+ *
8943+ * dscale_write() produces a format not completely unlike UTF: the two highest
8944+ * bits of the first byte are used to store "tag". One of 4 possible tag
8945+ * values is chosen depending on the number being encoded:
8946+ *
8947+ * 0 ... 0x3f => 0 [table 1]
8948+ * 0x40 ... 0x3fff => 1
8949+ * 0x4000 ... 0x3fffffff => 2
8950+ * 0x40000000 ... 0xffffffffffffffff => 3
8951+ *
8952+ * (see dscale_range() function)
8953+ *
8954+ * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
8955+ * to be stored, so in this case there is no place in the first byte to store
8956+ * tag. For such values tag is stored in an extra 9th byte.
8957+ *
8958+ * As _highest_ bits are used for the test (which is natural) scaled integers
8959+ * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
8960+ * uses LITTLE-ENDIAN.
8961+ *
8962+ */
8963+
8964+#include "debug.h"
8965+#include "dscale.h"
8966+
8967+/* return tag of scaled integer stored at @address */
8968+static int gettag(const unsigned char *address)
8969+{
8970+ /* tag is stored in two highest bits */
8971+ return (*address) >> 6;
8972+}
8973+
8974+/* clear the tag embedded into @value */
8975+static void cleartag(__u64 * value, int tag)
8976+{
8977+ /*
8978+ * W-w-what ?!
8979+ *
8980+ * Actually, this is rather simple: @value passed here was read by
8981+ * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
8982+ * zeroes. Tag is still stored in the highest (arithmetically)
8983+ * non-zero bits of @value, but relative position of tag within __u64
8984+ * depends on @tag.
8985+ *
8986+ * For example, if @tag is 0, it's stored in the 2 highest bits of the lowest
8987+ * byte, and its offset (counting from the lowest bit) is 8 - 2 == 6 bits.
8988+ *
8989+ * If tag is 1, it's stored in the two highest bits of the 2nd lowest byte,
8990+ * and its offset is (2 * 8) - 2 == 14 bits.
8991+ *
8992+ * See table 1 above for details.
8993+ *
8994+ * All these cases are captured by the formula:
8995+ */
8996+ *value &= ~(3 << (((1 << tag) << 3) - 2));
8997+ /*
8998+ * That is, clear two (3 == 0t11) bits at the offset
8999+ *
9000+ * 8 * (2 ^ tag) - 2,
9001+ *
9002+ * that is, two highest bits of (2 ^ tag)-th byte of @value.
9003+ */
9004+}
9005+
9006+/* return tag for @value. See table 1 above for details. */
9007+static int dscale_range(__u64 value)
9008+{
9009+ if (value > 0x3fffffff)
9010+ return 3;
9011+ if (value > 0x3fff)
9012+ return 2;
9013+ if (value > 0x3f)
9014+ return 1;
9015+ return 0;
9016+}
9017+
9018+/* restore value stored at @address by dscale_write() and return number of
9019+ * bytes consumed */
9020+int dscale_read(unsigned char *address, __u64 * value)
9021+{
9022+ int tag;
9023+
9024+ /* read tag */
9025+ tag = gettag(address);
9026+ switch (tag) {
9027+ case 3:
9028+ /* In this case tag is stored in an extra byte, skip this byte
9029+ * and decode value stored in the next 8 bytes.*/
9030+ *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
9031+ /* worst case: 8 bytes for value itself plus one byte for
9032+ * tag. */
9033+ return 9;
9034+ case 0:
9035+ *value = get_unaligned(address);
9036+ break;
9037+ case 1:
9038+ *value = __be16_to_cpu(get_unaligned((__be16 *)address));
9039+ break;
9040+ case 2:
9041+ *value = __be32_to_cpu(get_unaligned((__be32 *)address));
9042+ break;
9043+ default:
9044+ return RETERR(-EIO);
9045+ }
9046+ /* clear tag embedded into @value */
9047+ cleartag(value, tag);
9048+ /* number of bytes consumed is (2 ^ tag)---see table 1. */
9049+ return 1 << tag;
9050+}
9051+
9052+/* return number of bytes occupied by the value stored at @address, judging only by its tag */
9053+int dscale_bytes_to_read(unsigned char *address)
9054+{
9055+ int tag;
9056+
9057+ tag = gettag(address);
9058+ switch (tag) {
9059+ case 0:
9060+ case 1:
9061+ case 2:
9062+ return 1 << tag;
9063+ case 3:
9064+ return 9;
9065+ default:
9066+ return RETERR(-EIO);
9067+ }
9068+}
9069+
9070+/* store @value at @address and return number of bytes consumed */
9071+int dscale_write(unsigned char *address, __u64 value)
9072+{
9073+ int tag;
9074+ int shift;
9075+ __be64 v;
9076+ unsigned char *valarr;
9077+
9078+ tag = dscale_range(value);
9079+ v = __cpu_to_be64(value);
9080+ valarr = (unsigned char *)&v;
9081+ shift = (tag == 3) ? 1 : 0;
9082+ memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
9083+ *address |= (tag << 6);
9084+ return shift + (1 << tag);
9085+}
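/* Editorial sketch (not part of the original patch): a round-trip through
 * the two functions above. 1000 == 0x3e8 falls into the tag-1 row of
 * table 1, so it takes two big-endian bytes, the first carrying the tag:
 *
 *	unsigned char buf[9];
 *	__u64 out;
 *	int len = dscale_write(buf, 1000);	(len == 2, buf = {0x43, 0xe8})
 *	dscale_read(buf, &out);			(out == 1000 again)
 */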
9086+
9087+/* number of bytes required to store @value */
9088+int dscale_bytes_to_write(__u64 value)
9089+{
9090+ int bytes;
9091+
9092+ bytes = 1 << dscale_range(value);
9093+ if (bytes == 8)
9094+ ++bytes;
9095+ return bytes;
9096+}
9097+
9098+/* returns true if @value and @other require the same number of bytes to be
9099+ * stored. Used to detect when a data structure (like stat-data) has to be
9100+ * expanded or contracted. */
9101+int dscale_fit(__u64 value, __u64 other)
9102+{
9103+ return dscale_range(value) == dscale_range(other);
9104+}
9105+
9106+/* Make Linus happy.
9107+ Local variables:
9108+ c-indentation-style: "K&R"
9109+ mode-name: "LC"
9110+ c-basic-offset: 8
9111+ tab-width: 8
9112+ fill-column: 120
9113+ scroll-step: 1
9114+ End:
9115+*/
9116diff -urN linux-2.6.24.orig/fs/reiser4/dscale.h linux-2.6.24/fs/reiser4/dscale.h
9117--- linux-2.6.24.orig/fs/reiser4/dscale.h 1970-01-01 03:00:00.000000000 +0300
9118+++ linux-2.6.24/fs/reiser4/dscale.h 2008-01-25 11:55:43.884539336 +0300
9119@@ -0,0 +1,28 @@
9120+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9121+ * reiser4/README */
9122+
9123+/* Scalable on-disk integers. See dscale.c for details. */
9124+
9125+#if !defined( __FS_REISER4_DSCALE_H__ )
9126+#define __FS_REISER4_DSCALE_H__
9127+
9128+#include "dformat.h"
9129+
9130+extern int dscale_read(unsigned char *address, __u64 * value);
9131+extern int dscale_write(unsigned char *address, __u64 value);
9132+extern int dscale_bytes_to_read(unsigned char *address);
9133+extern int dscale_bytes_to_write(__u64 value);
9134+extern int dscale_fit(__u64 value, __u64 other);
9135+
9136+/* __FS_REISER4_DSCALE_H__ */
9137+#endif
9138+
9139+/* Make Linus happy.
9140+ Local variables:
9141+ c-indentation-style: "K&R"
9142+ mode-name: "LC"
9143+ c-basic-offset: 8
9144+ tab-width: 8
9145+ fill-column: 120
9146+ End:
9147+*/
9148diff -urN linux-2.6.24.orig/fs/reiser4/entd.c linux-2.6.24/fs/reiser4/entd.c
9149--- linux-2.6.24.orig/fs/reiser4/entd.c 1970-01-01 03:00:00.000000000 +0300
9150+++ linux-2.6.24/fs/reiser4/entd.c 2008-01-25 11:39:06.912201506 +0300
9151@@ -0,0 +1,335 @@
9152+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9153+ * reiser4/README */
9154+
9155+/* Ent daemon. */
9156+
9157+#include "debug.h"
9158+#include "txnmgr.h"
9159+#include "tree.h"
9160+#include "entd.h"
9161+#include "super.h"
9162+#include "context.h"
9163+#include "reiser4.h"
9164+#include "vfs_ops.h"
9165+#include "page_cache.h"
9166+#include "inode.h"
9167+
9168+#include <linux/sched.h> /* struct task_struct */
9169+#include <linux/suspend.h>
9170+#include <linux/kernel.h>
9171+#include <linux/writeback.h>
9172+#include <linux/time.h> /* INITIAL_JIFFIES */
9173+#include <linux/backing-dev.h> /* bdi_write_congested */
9174+#include <linux/wait.h>
9175+#include <linux/kthread.h>
9176+#include <linux/freezer.h>
9177+
9178+#define DEF_PRIORITY 12
9179+#define MAX_ENTD_ITERS 10
9180+
9181+static void entd_flush(struct super_block *, struct wbq *);
9182+static int entd(void *arg);
9183+
9184+/*
9185+ * set ->comm field of the ent thread to make its state visible to the user level
9186+ */
9187+#define entd_set_comm(state) \
9188+ snprintf(current->comm, sizeof(current->comm), \
9189+ "ent:%s%s", super->s_id, (state))
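/* Editorial note (not part of the original patch): with a hypothetical
 * super->s_id of "sda1", the thread appears in ps(1) as "ent:sda1!" while
 * flushing and as "ent:sda1." while idle; see the entd_set_comm("!") and
 * entd_set_comm(".") calls in entd() below. */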
9190+
9191+/**
9192+ * reiser4_init_entd - initialize entd context and start kernel daemon
9193+ * @super: super block to start ent thread for
9194+ *
9195+ * Creates entd contexts, starts kernel thread and waits until it
9196+ * initializes.
9197+ */
9198+int reiser4_init_entd(struct super_block *super)
9199+{
9200+ entd_context *ctx;
9201+
9202+ assert("nikita-3104", super != NULL);
9203+
9204+ ctx = get_entd_context(super);
9205+
9206+ memset(ctx, 0, sizeof *ctx);
9207+ spin_lock_init(&ctx->guard);
9208+ init_waitqueue_head(&ctx->wait);
9209+#if REISER4_DEBUG
9210+ INIT_LIST_HEAD(&ctx->flushers_list);
9211+#endif
9212+ /* lists of writepage requests */
9213+ INIT_LIST_HEAD(&ctx->todo_list);
9214+ INIT_LIST_HEAD(&ctx->done_list);
9215+ /* start entd */
9216+ ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9217+ if (IS_ERR(ctx->tsk))
9218+ return PTR_ERR(ctx->tsk);
9219+ return 0;
9220+}
9221+
9222+static void put_wbq(struct wbq *rq)
9223+{
9224+ iput(rq->mapping->host);
9225+ complete(&rq->completion);
9226+}
9227+
9228+/* ent should be locked */
9229+static struct wbq *__get_wbq(entd_context * ent)
9230+{
9231+ struct wbq *wbq;
9232+
9233+ if (list_empty(&ent->todo_list))
9234+ return NULL;
9235+
9236+ ent->nr_todo_reqs --;
9237+ wbq = list_entry(ent->todo_list.next, struct wbq, link);
9238+ list_del_init(&wbq->link);
9239+ return wbq;
9240+}
9241+
9242+/* ent thread function */
9243+static int entd(void *arg)
9244+{
9245+ struct super_block *super;
9246+ entd_context *ent;
9247+ int done = 0;
9248+
9249+ super = arg;
9250+ /* do_fork() just copies task_struct into the new
9251+ thread. ->fs_context shouldn't be copied of course. This shouldn't
9252+ be a problem for the rest of the code though.
9253+ */
9254+ current->journal_info = NULL;
9255+
9256+ ent = get_entd_context(super);
9257+
9258+ while (!done) {
9259+ try_to_freeze();
9260+
9261+ spin_lock(&ent->guard);
9262+ while (ent->nr_todo_reqs != 0) {
9263+ struct wbq *rq;
9264+
9265+ assert("", list_empty(&ent->done_list));
9266+
9267+ /* take request from the queue head */
9268+ rq = __get_wbq(ent);
9269+ assert("", rq != NULL);
9270+ ent->cur_request = rq;
9271+ spin_unlock(&ent->guard);
9272+
9273+ entd_set_comm("!");
9274+ entd_flush(super, rq);
9275+
9276+ put_wbq(rq);
9277+
9278+ /*
9279+ * wakeup all requestors and iput their inodes
9280+ */
9281+ spin_lock(&ent->guard);
9282+ while (!list_empty(&ent->done_list)) {
9283+ rq = list_entry(ent->done_list.next, struct wbq, link);
9284+ list_del_init(&rq->link);
9285+ ent->nr_done_reqs --;
9286+ spin_unlock(&ent->guard);
9287+ assert("", rq->written == 1);
9288+ put_wbq(rq);
9289+ spin_lock(&ent->guard);
9290+ }
9291+ }
9292+ spin_unlock(&ent->guard);
9293+
9294+ entd_set_comm(".");
9295+
9296+ {
9297+ DEFINE_WAIT(__wait);
9298+
9299+ do {
9300+ prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9301+ if (kthread_should_stop()) {
9302+ done = 1;
9303+ break;
9304+ }
9305+ if (ent->nr_todo_reqs != 0)
9306+ break;
9307+ schedule();
9308+ } while (0);
9309+ finish_wait(&ent->wait, &__wait);
9310+ }
9311+ }
9312+ BUG_ON(ent->nr_todo_reqs != 0);
9313+ return 0;
9314+}
9315+
9316+/**
9317+ * reiser4_done_entd - stop entd kernel thread
9318+ * @super: super block to stop ent thread for
9319+ *
9320+ * It is called on umount. Sends a stop signal to entd and waits until it handles
9321+ * it.
9322+ */
9323+void reiser4_done_entd(struct super_block *super)
9324+{
9325+ entd_context *ent;
9326+
9327+ assert("nikita-3103", super != NULL);
9328+
9329+ ent = get_entd_context(super);
9330+ assert("zam-1055", ent->tsk != NULL);
9331+ kthread_stop(ent->tsk);
9332+}
9333+
9334+/* called at the beginning of jnode_flush to register flusher thread with ent
9335+ * daemon */
9336+void reiser4_enter_flush(struct super_block *super)
9337+{
9338+ entd_context *ent;
9339+
9340+ assert("zam-1029", super != NULL);
9341+ ent = get_entd_context(super);
9342+
9343+ assert("zam-1030", ent != NULL);
9344+
9345+ spin_lock(&ent->guard);
9346+ ent->flushers++;
9347+#if REISER4_DEBUG
9348+ list_add(&get_current_context()->flushers_link, &ent->flushers_list);
9349+#endif
9350+ spin_unlock(&ent->guard);
9351+}
9352+
9353+/* called at the end of jnode_flush */
9354+void reiser4_leave_flush(struct super_block *super)
9355+{
9356+ entd_context *ent;
9357+ int wake_up_ent;
9358+
9359+ assert("zam-1027", super != NULL);
9360+ ent = get_entd_context(super);
9361+
9362+ assert("zam-1028", ent != NULL);
9363+
9364+ spin_lock(&ent->guard);
9365+ ent->flushers--;
9366+ wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
9367+#if REISER4_DEBUG
9368+ list_del_init(&get_current_context()->flushers_link);
9369+#endif
9370+ spin_unlock(&ent->guard);
9371+ if (wake_up_ent)
9372+ wake_up(&ent->wait);
9373+}
9374+
9375+#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9376+
9377+static void entd_flush(struct super_block *super, struct wbq *rq)
9378+{
9379+ reiser4_context ctx;
9380+ int tmp;
9381+
9382+ init_stack_context(&ctx, super);
9383+ ctx.entd = 1;
9384+ ctx.gfp_mask = GFP_NOFS;
9385+
9386+ rq->wbc->range_start = page_offset(rq->page);
9387+ rq->wbc->range_end = rq->wbc->range_start +
9388+ (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
9389+ tmp = rq->wbc->nr_to_write;
9390+ rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9391+
9392+ if (rq->wbc->nr_to_write > 0) {
9393+ rq->wbc->range_start = 0;
9394+ rq->wbc->range_end = LLONG_MAX;
9395+ generic_sync_sb_inodes(super, rq->wbc);
9396+ }
9397+ rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9398+ reiser4_writeout(super, rq->wbc);
9399+
9400+ context_set_commit_async(&ctx);
9401+ reiser4_exit_context(&ctx);
9402+}
9403+
9404+/**
9405+ * write_page_by_ent - ask entd thread to flush this page as part of slum
9406+ * @page: page to be written
9407+ * @wbc: writeback control passed to reiser4_writepage
9408+ *
9409+ * Creates a request, puts it on entd's list of requests, wakes entd up if
9410+ * necessary, waits until entd completes with the request.
9411+ */
9412+int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9413+{
9414+ struct super_block *sb;
9415+ struct inode *inode;
9416+ entd_context *ent;
9417+ struct wbq rq;
9418+
9419+ assert("", PageLocked(page));
9420+ assert("", page->mapping != NULL);
9421+
9422+ sb = page->mapping->host->i_sb;
9423+ ent = get_entd_context(sb);
9424+ assert("", ent && ent->done == 0);
9425+
9426+ /*
9427+ * we are going to unlock page and ask ent thread to write the
9428+ * page. Re-dirty page before unlocking so that if ent thread fails to
9429+ * write it - it will remain dirty
9430+ */
9431+ reiser4_set_page_dirty_internal(page);
9432+
9433+ /*
9434+ * pin inode in memory, unlock page, entd_flush will iput. We cannot
9435+ * iput here because we cannot allow delete_inode to be called here
9436+ */
9437+ inode = igrab(page->mapping->host);
9438+ unlock_page(page);
9439+ if (inode == NULL)
9440+ /* inode is getting freed */
9441+ return 0;
9442+
9443+ /* init wbq */
9444+ INIT_LIST_HEAD(&rq.link);
9445+ rq.magic = WBQ_MAGIC;
9446+ rq.wbc = wbc;
9447+ rq.page = page;
9448+ rq.mapping = inode->i_mapping;
9449+ rq.node = NULL;
9450+ rq.written = 0;
9451+ init_completion(&rq.completion);
9452+
9453+ /* add request to entd's list of writepage requests */
9454+ spin_lock(&ent->guard);
9455+ ent->nr_todo_reqs++;
9456+ list_add_tail(&rq.link, &ent->todo_list);
9457+ if (ent->nr_todo_reqs == 1)
9458+ wake_up(&ent->wait);
9459+
9460+ spin_unlock(&ent->guard);
9461+
9462+ /* wait until entd finishes */
9463+ wait_for_completion(&rq.completion);
9464+
9465+ if (rq.written)
9466+ /* Eventually ENTD has written the page to disk. */
9467+ return 0;
9468+ return 0;
9469+}
9470+
9471+int wbq_available(void)
9472+{
9473+ struct super_block *sb = reiser4_get_current_sb();
9474+ entd_context *ent = get_entd_context(sb);
9475+ return ent->nr_todo_reqs;
9476+}
9477+
9478+/*
9479+ * Local variables:
9480+ * c-indentation-style: "K&R"
9481+ * mode-name: "LC"
9482+ * c-basic-offset: 8
9483+ * tab-width: 8
9484+ * fill-column: 79
9485+ * End:
9486+ */
9487diff -urN linux-2.6.24.orig/fs/reiser4/entd.h linux-2.6.24/fs/reiser4/entd.h
9488--- linux-2.6.24.orig/fs/reiser4/entd.h 1970-01-01 03:00:00.000000000 +0300
9489+++ linux-2.6.24/fs/reiser4/entd.h 2008-01-25 11:39:06.912201506 +0300
9490@@ -0,0 +1,90 @@
9491+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9492+
9493+/* Ent daemon. */
9494+
9495+#ifndef __ENTD_H__
9496+#define __ENTD_H__
9497+
9498+#include "context.h"
9499+
9500+#include <linux/fs.h>
9501+#include <linux/completion.h>
9502+#include <linux/wait.h>
9503+#include <linux/spinlock.h>
9504+#include <linux/sched.h> /* for struct task_struct */
9505+
9506+#define WBQ_MAGIC 0x7876dc76
9507+
9508+/* write-back request. */
9509+struct wbq {
9510+ int magic;
9511+ struct list_head link; /* list head of this list is in entd context */
9512+ struct writeback_control *wbc;
9513+ struct page *page;
9514+ struct address_space *mapping;
9515+ struct completion completion;
9516+ jnode *node; /* set if ent thread captured requested page */
9517+ int written; /* set if ent thread wrote requested page */
9518+};
9519+
9520+/* ent-thread context. This is used to synchronize starting/stopping ent
9521+ * threads. */
9522+typedef struct entd_context {
9523+ /* wait queue that ent thread waits on for more work. It's
9524+ * signaled by write_page_by_ent(). */
9525+ wait_queue_head_t wait;
9526+ /* spinlock protecting other fields */
9527+ spinlock_t guard;
9528+ /* ent thread */
9529+ struct task_struct *tsk;
9530+ /* set to indicate that ent thread should leave. */
9531+ int done;
9532+ /* counter of active flushers */
9533+ int flushers;
9534+ /*
9535+ * when reiser4_writepage asks entd to write a page - it adds struct
9536+ * wbq to this list
9537+ */
9538+ struct list_head todo_list;
9539+ /* number of elements on the above list */
9540+ int nr_todo_reqs;
9541+
9542+ struct wbq *cur_request;
9543+ /*
9544+ * when entd writes a page it moves write-back request from todo_list
9545+ * to done_list. This list is used at the end of entd iteration to
9546+ * wakeup requestors and iput inodes.
9547+ */
9548+ struct list_head done_list;
9549+ /* number of elements on the above list */
9550+ int nr_done_reqs;
9551+
9552+#if REISER4_DEBUG
9553+ /* list of all active flushers */
9554+ struct list_head flushers_list;
9555+#endif
9556+} entd_context;
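/* Editorial sketch (not part of the original patch): request lifecycle
 * implied by the fields above. write_page_by_ent() queues a struct wbq on
 * todo_list, wakes ->wait and blocks on wbq.completion; entd() dequeues it
 * into cur_request, flushes, then completes it (together with any requests
 * parked on done_list along the way), iput()-ing the inode the requestor
 * pinned. */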
9557+
9558+extern int reiser4_init_entd(struct super_block *);
9559+extern void reiser4_done_entd(struct super_block *);
9560+
9561+extern void reiser4_enter_flush(struct super_block *);
9562+extern void reiser4_leave_flush(struct super_block *);
9563+
9564+extern int write_page_by_ent(struct page *, struct writeback_control *);
9565+extern int wbq_available(void);
9566+extern void ent_writes_page(struct super_block *, struct page *);
9567+
9568+extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
9569+/* __ENTD_H__ */
9570+#endif
9571+
9572+/* Make Linus happy.
9573+ Local variables:
9574+ c-indentation-style: "K&R"
9575+ mode-name: "LC"
9576+ c-basic-offset: 8
9577+ tab-width: 8
9578+ fill-column: 120
9579+ End:
9580+*/
9581diff -urN linux-2.6.24.orig/fs/reiser4/eottl.c linux-2.6.24/fs/reiser4/eottl.c
9582--- linux-2.6.24.orig/fs/reiser4/eottl.c 1970-01-01 03:00:00.000000000 +0300
9583+++ linux-2.6.24/fs/reiser4/eottl.c 2008-01-25 11:39:06.912201506 +0300
9584@@ -0,0 +1,509 @@
9585+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9586+
9587+#include "forward.h"
9588+#include "debug.h"
9589+#include "key.h"
9590+#include "coord.h"
9591+#include "plugin/item/item.h"
9592+#include "plugin/node/node.h"
9593+#include "znode.h"
9594+#include "block_alloc.h"
9595+#include "tree_walk.h"
9596+#include "tree_mod.h"
9597+#include "carry.h"
9598+#include "tree.h"
9599+#include "super.h"
9600+
9601+#include <linux/types.h> /* for __u?? */
9602+
9603+/*
9604+ * Extents on the twig level (EOTTL) handling.
9605+ *
9606+ * EOTTL poses some problems for tree traversal, which are best explained
9607+ * by example.
9608+ *
9609+ * Suppose we have block B1 on the twig level with the following items:
9610+ *
9611+ * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
9612+ * offset)
9613+ * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
9614+ * 2. internal item I2 with key (10:0:0:0)
9615+ *
9616+ * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
9617+ * then intra-node lookup is done. This lookup finished on the E1, because the
9618+ * key we are looking for is larger than the key of E1 and is smaller than key
9619+ * the of I2.
9620+ *
9621+ * Here search is stuck.
9622+ *
9623+ * After some thought it is clear what is wrong here: extents on the twig level
9624+ * break a basic property of the *search* tree (on the pretext that they
9625+ * restore the property of a balanced tree).
9626+ *
9627+ * Said property is the following: if in the internal node of the search tree
9628+ * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
9629+ * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
9630+ * through the Pointer.
9631+ *
9632+ * This is not true, when Pointer is Extent-Pointer, simply because extent
9633+ * cannot expand indefinitely to the right to include any item with
9634+ *
9635+ * Key1 <= Key <= Key2.
9636+ *
9637+ * For example, our E1 extent is only responsible for the data with keys
9638+ *
9639+ * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
9640+ *
9641+ * so, key range
9642+ *
9643+ * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
9644+ *
9645+ * is orphaned: there is no way to get there from the tree root.
9646+ *
9647+ * In other words, extent pointers are different than normal child pointers as
9648+ * far as search tree is concerned, and this creates such problems.
9649+ *
9650+ * Possible solution for this problem is to insert our item into node pointed
9651+ * to by I2. There are some problems, though:
9652+ *
9653+ * (1) I2 can be in a different node.
9654+ * (2) E1 can be immediately followed by another extent E2.
9655+ *
9656+ * (1) is solved by calling reiser4_get_right_neighbor() and accounting
9657+ * for locks/coords as necessary.
9658+ *
9659+ * (2) is more complex. Solution here is to insert new empty leaf node and
9660+ * insert internal item between E1 and E2 pointing to said leaf node. This is
9661+ * further complicated by the possibility that E2 is in a different node, etc.
9662+ *
9663+ * Problems:
9664+ *
9665+ * (1) if there was internal item I2 immediately on the right of an extent E1
9666+ * and we decided to insert new item S1 into node N2 pointed to by I2, then
9667+ * the key of S1 will be less than the smallest key in N2. Normally, the
9668+ * search checks that the key we are looking for is in the range of keys
9669+ * covered by the node it is being looked up in. To work around this
9670+ * situation, while preserving the useful consistency check, a new flag
9671+ * CBK_TRUST_DK was added to the cbk flags bitmask. This flag is automatically
9672+ * set on entrance to coord_by_key() and is only cleared when we are about to
9673+ * enter the situation described above.
9674+ *
9675+ * (2) If extent E1 is immediately followed by another extent E2 and we are
9676+ * searching for the key that is between E1 and E2 we only have to insert new
9677+ * empty leaf node when coord_by_key was called for insertion, rather than just
9678+ * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to
9679+ * the cbk flags bitmask. This flag is automatically set by coord_by_key calls
9680+ * performed by insert_by_key() and friends.
9681+ *
9682+ * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
9683+ * case it requires modification of node content which is only possible under
9684+ * write lock. It may well happen that we only have read lock on the node where
9685+ * new internal pointer is to be inserted (common case: lookup of non-existent
9686+ * stat-data that falls between two extents). If only a read lock is held, tree
9687+ * traversal is restarted with lock_level modified so that next time we hit
9688+ * this problem, write lock will be held. Once we have write lock, balancing
9689+ * will be performed.
9690+ */
9691+
9692+/**
9693+ * is_next_item_internal - check whether next item is internal
9694+ * @coord: coordinate of extent item in twig node
9695+ * @key: search key
9696+ * @lh: twig node lock handle
9697+ *
9698+ * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
9699+ * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
9700+ * to that node, @coord is set to its first unit. If next item is not internal
9701+ * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
9702+ * is returned if search restart has to be done.
9703+ */
9704+static int
9705+is_next_item_internal(coord_t *coord, const reiser4_key *key,
9706+ lock_handle *lh)
9707+{
9708+ coord_t next;
9709+ lock_handle rn;
9710+ int result;
9711+
9712+ coord_dup(&next, coord);
9713+ if (coord_next_unit(&next) == 0) {
9714+ /* next unit is in this node */
9715+ if (item_is_internal(&next)) {
9716+ coord_dup(coord, &next);
9717+ return 1;
9718+ }
9719+ assert("vs-3", item_is_extent(&next));
9720+ return 0;
9721+ }
9722+
9723+ /*
9724+ * next unit either does not exist or is in right neighbor. If it is in
9725+ * right neighbor we have to check right delimiting key because
9726+ * a concurrent thread could get there first and insert an item with a key
9727+ * smaller than @key
9728+ */
9729+ read_lock_dk(current_tree);
9730+ result = keycmp(key, znode_get_rd_key(coord->node));
9731+ read_unlock_dk(current_tree);
9732+ assert("vs-6", result != EQUAL_TO);
9733+ if (result == GREATER_THAN)
9734+ return 2;
9735+
9736+ /* lock right neighbor */
9737+ init_lh(&rn);
9738+ result = reiser4_get_right_neighbor(&rn, coord->node,
9739+ znode_is_wlocked(coord->node) ?
9740+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
9741+ GN_CAN_USE_UPPER_LEVELS);
9742+ if (result == -E_NO_NEIGHBOR) {
9743+ /* we are on the rightmost edge of the tree */
9744+ done_lh(&rn);
9745+ return 0;
9746+ }
9747+
9748+ if (result) {
9749+ assert("vs-4", result < 0);
9750+ done_lh(&rn);
9751+ return result;
9752+ }
9753+
9754+ /*
9755+ * check whether concurrent thread managed to insert item with a key
9756+ * smaller than @key
9757+ */
9758+ read_lock_dk(current_tree);
9759+ result = keycmp(key, znode_get_ld_key(rn.node));
9760+ read_unlock_dk(current_tree);
9761+ assert("vs-6", result != EQUAL_TO);
9762+ if (result == GREATER_THAN) {
9763+ done_lh(&rn);
9764+ return 2;
9765+ }
9766+
9767+ result = zload(rn.node);
9768+ if (result) {
9769+ assert("vs-5", result < 0);
9770+ done_lh(&rn);
9771+ return result;
9772+ }
9773+
9774+ coord_init_first_unit(&next, rn.node);
9775+ if (item_is_internal(&next)) {
9776+ /*
9777+ * next unit is in the right neighbor and it is a unit of an internal
9778+ * item. Unlock coord->node. Move @lh to right neighbor. @coord
9779+ * is set to the first unit of right neighbor.
9780+ */
9781+ coord_dup(coord, &next);
9782+ zrelse(rn.node);
9783+ done_lh(lh);
9784+ move_lh(lh, &rn);
9785+ return 1;
9786+ }
9787+
9788+ /*
9789+ * next unit is unit of extent item. Return without chaning @lh and
9790+ * @coord.
9791+ */
9792+ assert("vs-6", item_is_extent(&next));
9793+ zrelse(rn.node);
9794+ done_lh(&rn);
9795+ return 0;
9796+}
9797+
9798+/**
9799+ * rd_key - calculate key of an item next to the given one
9800+ * @coord: position in a node
9801+ * @key: storage for result key
9802+ *
9803+ * @coord is set between items or after the last item in a node. Calculate key
9804+ * of item to the right of @coord.
9805+ */
9806+static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
9807+{
9808+ coord_t dup;
9809+
9810+ assert("nikita-2281", coord_is_between_items(coord));
9811+ coord_dup(&dup, coord);
9812+
9813+ if (coord_set_to_right(&dup) == 0)
9814+ /* next item is in this node. Return its key. */
9815+ unit_key_by_coord(&dup, key);
9816+ else {
9817+ /*
9818+ * next item either does not exist or is in right
9819+ * neighbor. Return znode's right delimiting key.
9820+ */
9821+ read_lock_dk(current_tree);
9822+ *key = *znode_get_rd_key(coord->node);
9823+ read_unlock_dk(current_tree);
9824+ }
9825+ return key;
9826+}
9827+
9828+/**
9829+ * add_empty_leaf - insert empty leaf between two extents
9830+ * @insert_coord: position in twig node between two extents
9831+ * @lh: twig node lock handle
9832+ * @key: left delimiting key of new node
9833+ * @rdkey: right delimiting key of new node
9834+ *
9835+ * Inserts empty leaf node between two extent items. It is necessary when we
9836+ * have to insert an item on leaf level between two extents (items on the twig
9837+ * level).
9838+ */
9839+static int
9840+add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
9841+ const reiser4_key *key, const reiser4_key *rdkey)
9842+{
9843+ int result;
9844+ carry_pool *pool;
9845+ carry_level *todo;
9846+ reiser4_item_data *item;
9847+ carry_insert_data *cdata;
9848+ carry_op *op;
9849+ znode *node;
9850+ reiser4_tree *tree;
9851+
9852+ assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
9853+ tree = znode_get_tree(insert_coord->node);
9854+ node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
9855+ if (IS_ERR(node))
9856+ return PTR_ERR(node);
9857+
9858+ /* setup delimiting keys for node being inserted */
9859+ write_lock_dk(tree);
9860+ znode_set_ld_key(node, key);
9861+ znode_set_rd_key(node, rdkey);
9862+ ON_DEBUG(node->creator = current);
9863+ ON_DEBUG(node->first_key = *key);
9864+ write_unlock_dk(tree);
9865+
9866+ ZF_SET(node, JNODE_ORPHAN);
9867+
9868+ /*
9869+ * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
9870+ * carry_insert_data
9871+ */
9872+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
9873+ sizeof(*item) + sizeof(*cdata));
9874+ if (IS_ERR(pool))
9875+ return PTR_ERR(pool);
9876+ todo = (carry_level *) (pool + 1);
9877+ init_carry_level(todo, pool);
9878+
9879+ item = (reiser4_item_data *) (todo + 3);
9880+ cdata = (carry_insert_data *) (item + 1);
9881+
9882+ op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
9883+ if (!IS_ERR(op)) {
9884+ cdata->coord = insert_coord;
9885+ cdata->key = key;
9886+ cdata->data = item;
9887+ op->u.insert.d = cdata;
9888+ op->u.insert.type = COPT_ITEM_DATA;
9889+ build_child_ptr_data(node, item);
9890+ item->arg = NULL;
9891+		/* arrange for @insert_coord to be set at the inserted item after
9892+		   insertion is done */
9893+ todo->track_type = CARRY_TRACK_CHANGE;
9894+ todo->tracked = lh;
9895+
9896+ result = reiser4_carry(todo, NULL);
9897+ if (result == 0) {
9898+ /*
9899+ * pin node in memory. This is necessary for
9900+ * znode_make_dirty() below.
9901+ */
9902+ result = zload(node);
9903+ if (result == 0) {
9904+ lock_handle local_lh;
9905+
9906+ /*
9907+ * if we inserted new child into tree we have
9908+ * to mark it dirty so that flush will be able
9909+ * to process it.
9910+ */
9911+ init_lh(&local_lh);
9912+ result = longterm_lock_znode(&local_lh, node,
9913+ ZNODE_WRITE_LOCK,
9914+ ZNODE_LOCK_LOPRI);
9915+ if (result == 0) {
9916+ znode_make_dirty(node);
9917+
9918+ /*
9919+ * when internal item pointing to @node
9920+ * was inserted into twig node
9921+ * create_hook_internal did not connect
9922+ * it properly because its right
9923+ * neighbor was not known. Do it
9924+ * here
9925+ */
9926+ write_lock_tree(tree);
9927+ assert("nikita-3312",
9928+ znode_is_right_connected(node));
9929+ assert("nikita-2984",
9930+ node->right == NULL);
9931+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
9932+ write_unlock_tree(tree);
9933+ result =
9934+ connect_znode(insert_coord, node);
9935+ ON_DEBUG(if (result == 0) check_dkeys(node););
9936+
9937+ done_lh(lh);
9938+ move_lh(lh, &local_lh);
9939+ assert("vs-1676", node_is_empty(node));
9940+ coord_init_first_unit(insert_coord,
9941+ node);
9942+ } else {
9943+ warning("nikita-3136",
9944+ "Cannot lock child");
9945+ }
9946+ done_lh(&local_lh);
9947+ zrelse(node);
9948+ }
9949+ }
9950+ } else
9951+ result = PTR_ERR(op);
9952+ zput(node);
9953+ done_carry_pool(pool);
9954+ return result;
9955+}
9956+
9957+/**
9958+ * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
9959+ * @h: search handle
9960+ * @outcome: flag saying whether search has to restart or is done
9961+ *
9962+ * Handles search on twig level. If this function completes search itself then
9963+ * it returns 1. If search has to go one level down then 0 is returned. If an
9964+ * error happens then LOOKUP_DONE is returned via @outcome and the error code
9965+ * is saved in @h->result.
9966+ */
9967+int handle_eottl(cbk_handle *h, int *outcome)
9968+{
9969+ int result;
9970+ reiser4_key key;
9971+ coord_t *coord;
9972+
9973+ coord = h->coord;
9974+
9975+ if (h->level != TWIG_LEVEL ||
9976+ (coord_is_existing_item(coord) && item_is_internal(coord))) {
9977+ /* Continue to traverse tree downward. */
9978+ return 0;
9979+ }
9980+
9981+ /*
9982+ * make sure that @h->coord is set to twig node and that it is either
9983+ * set to extent item or after extent item
9984+ */
9985+ assert("vs-356", h->level == TWIG_LEVEL);
9986+ assert("vs-357", ( {
9987+ coord_t lcoord;
9988+ coord_dup(&lcoord, coord);
9989+ check_me("vs-733", coord_set_to_left(&lcoord) == 0);
9990+ item_is_extent(&lcoord);
9991+ }
9992+ ));
9993+
9994+ if (*outcome == NS_FOUND) {
9995+ /* we have found desired key on twig level in extent item */
9996+ h->result = CBK_COORD_FOUND;
9997+ *outcome = LOOKUP_DONE;
9998+ return 1;
9999+ }
10000+
10001+ if (!(h->flags & CBK_FOR_INSERT)) {
10002+ /* tree traversal is not for insertion. Just return
10003+ CBK_COORD_NOTFOUND. */
10004+ h->result = CBK_COORD_NOTFOUND;
10005+ *outcome = LOOKUP_DONE;
10006+ return 1;
10007+ }
10008+
10009+	/* take a look at the item to the right of h->coord */
10010+ result = is_next_item_internal(coord, h->key, h->active_lh);
10011+ if (unlikely(result < 0)) {
10012+ h->error = "get_right_neighbor failed";
10013+ h->result = result;
10014+ *outcome = LOOKUP_DONE;
10015+ return 1;
10016+ }
10017+ if (result == 0) {
10018+ /*
10019+ * item to the right is also an extent one. Allocate a new node
10020+		 * and insert a pointer to it after item h->coord.
10021+ *
10022+ * This is a result of extents being located at the twig
10023+ * level. For explanation, see comment just above
10024+ * is_next_item_internal().
10025+ */
10026+ znode *loaded;
10027+
10028+ if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
10029+ /*
10030+ * we got node read locked, restart coord_by_key to
10031+ * have write lock on twig level
10032+ */
10033+ h->lock_level = TWIG_LEVEL;
10034+ h->lock_mode = ZNODE_WRITE_LOCK;
10035+ *outcome = LOOKUP_REST;
10036+ return 1;
10037+ }
10038+
10039+ loaded = coord->node;
10040+ result =
10041+ add_empty_leaf(coord, h->active_lh, h->key,
10042+ rd_key(coord, &key));
10043+ if (result) {
10044+ h->error = "could not add empty leaf";
10045+ h->result = result;
10046+ *outcome = LOOKUP_DONE;
10047+ return 1;
10048+ }
10049+ /* added empty leaf is locked (h->active_lh), its parent node
10050+ is unlocked, h->coord is set as EMPTY */
10051+ assert("vs-13", coord->between == EMPTY_NODE);
10052+ assert("vs-14", znode_is_write_locked(coord->node));
10053+ assert("vs-15",
10054+ WITH_DATA(coord->node, node_is_empty(coord->node)));
10055+ assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
10056+ assert("vs-17", coord->node == h->active_lh->node);
10057+ *outcome = LOOKUP_DONE;
10058+ h->result = CBK_COORD_NOTFOUND;
10059+ return 1;
10060+ } else if (result == 1) {
10061+		 * this is the special case mentioned in the comment on
10062+		 * tree.h:cbk_flags. We have found an internal item immediately to
10063+		 * the right of the extent, and we are going to insert a new item
10064+		 * there. The key of the item we are going to insert is smaller than
10065+		 * the leftmost key in the node pointed to by said internal item
10066+		 * (otherwise the search wouldn't have come to the extent in the first
10067+ * (otherwise search wouldn't come to the extent in the first
10068+ * place).
10069+ *
10070+ * This is a result of extents being located at the twig
10071+ * level. For explanation, see comment just above
10072+ * is_next_item_internal().
10073+ */
10074+ h->flags &= ~CBK_TRUST_DK;
10075+ } else {
10076+ assert("vs-8", result == 2);
10077+ *outcome = LOOKUP_REST;
10078+ return 1;
10079+ }
10080+ assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
10081+ return 0;
10082+}
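+
+/*
+ * Editorial sketch, not part of the original patch: how a caller is expected
+ * to use the return/outcome protocol documented above. The surrounding loop
+ * is hypothetical; handle_eottl(), NS_FOUND, LOOKUP_DONE and LOOKUP_REST are
+ * names from this file.
+ *
+ *	int outcome = NS_FOUND;		(result of the twig-level lookup)
+ *
+ *	if (handle_eottl(h, &outcome))
+ *		return outcome;		(LOOKUP_DONE or LOOKUP_REST)
+ *	(otherwise descend one level and continue the search)
+ */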
10083+
10084+/*
10085+ * Local variables:
10086+ * c-indentation-style: "K&R"
10087+ * mode-name: "LC"
10088+ * c-basic-offset: 8
10089+ * tab-width: 8
10090+ * fill-column: 120
10091+ * scroll-step: 1
10092+ * End:
10093+ */
10094diff -urN linux-2.6.24.orig/fs/reiser4/estimate.c linux-2.6.24/fs/reiser4/estimate.c
10095--- linux-2.6.24.orig/fs/reiser4/estimate.c 1970-01-01 03:00:00.000000000 +0300
10096+++ linux-2.6.24/fs/reiser4/estimate.c 2008-01-25 11:39:06.912201506 +0300
10097@@ -0,0 +1,120 @@
10098+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10099+
10100+#include "debug.h"
10101+#include "dformat.h"
10102+#include "tree.h"
10103+#include "carry.h"
10104+#include "inode.h"
10105+#include "plugin/cluster.h"
10106+#include "plugin/item/ctail.h"
10107+
10108+/* this returns how many nodes might get dirty or be added if @children nodes are dirtied
10109+
10110+   We estimate the number of internal nodes which will get dirty or get allocated as ~10% (103/1024) of the children
10111+   + 1 balancing. 1 balancing is 2 neighbours, 2 new blocks and the current block on the leaf level, 2 neighbour
10112+   nodes + the current (or 1 neighbour and 1 new and the current) on the twig level, 2 neighbour nodes on upper
10113+   levels and 1 for a new root. So 5 for the leaf level, 3 for the twig level, 2 on upper levels + 1 for the root.
10114+
10115+   The current node of the lowest level is not counted here - this is overhead only.
10116+
10117+   @children is almost always 1 here; the exception is flow insertion.
10118+*/
10119+static reiser4_block_nr
10120+max_balance_overhead(reiser4_block_nr children, tree_level tree_height)
10121+{
10122+ reiser4_block_nr ten_percent;
10123+
10124+	ten_percent = ((103 * children) >> 10);
10125+
10126+	/* If too many balancings happen at the same time, the tree height can rise
10127+	   by more than 1. Assume that if tree_height is at least 5, it can rise by 1 only. */
10128+ return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
10129+}
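+
+/*
+ * Editorial worked example, not part of the original patch: for a single
+ * dirtied child on a tree of height <= 5, ten_percent = (103 * 1) >> 10 = 0,
+ * so max_balance_overhead() returns 5 * 2 + (4 + 0) = 14 blocks, and
+ * calc_estimate_one_insert() below reserves 1 + 14 = 15 blocks.
+ */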
10130+
10131+/* this returns the maximal possible number of nodes which can be modified, plus the number of new nodes which can be
10132+   required, to perform insertion of one item into the tree */
10133+/* it is only called when tree height changes, or gets initialized */
10134+reiser4_block_nr calc_estimate_one_insert(tree_level height)
10135+{
10136+ return 1 + max_balance_overhead(1, height);
10137+}
10138+
10139+reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10140+{
10141+ return tree->estimate_one_insert;
10142+}
10143+
10144+/* this returns the maximal possible number of nodes which can be modified, plus the number of new nodes which can be
10145+   required, to perform insertion of one unit into an item in the tree */
10146+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10147+{
10148+ /* estimate insert into item just like item insertion */
10149+ return tree->estimate_one_insert;
10150+}
10151+
10152+reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10153+{
10154+	/* on item removal reiser4 does not try to pack nodes more compactly, so only one node may be dirtied on the
10155+	   leaf level */
10156+ return tree->estimate_one_insert;
10157+}
10158+
10159+/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and
10160+ both its neighbors). Max_balance_overhead should estimate number of blocks which may change/get added on internal
10161+ levels */
10162+reiser4_block_nr estimate_insert_flow(tree_level height)
10163+{
10164+ return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10165+ CARRY_FLOW_NEW_NODES_LIMIT,
10166+ height);
10167+}
10168+
10169+/* returns the max number of nodes that can be occupied by a disk cluster */
10170+static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
10171+{
10172+ int per_cluster;
10173+ per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10174+ return 3 + per_cluster +
10175+ max_balance_overhead(3 + per_cluster,
10176+ REISER4_MAX_ZTREE_HEIGHT);
10177+}
10178+
10179+/* how many nodes might get dirty and added
10180+ during insertion of a disk cluster */
10181+reiser4_block_nr estimate_insert_cluster(struct inode * inode)
10182+{
10183+ return estimate_cluster(inode, 1); /* 24 */
10184+}
10185+
10186+/* how many nodes might get dirty and added
10187+ during update of a (prepped or unprepped) disk cluster */
10188+reiser4_block_nr estimate_update_cluster(struct inode * inode)
10189+{
10190+ return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10191+}
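+
+/*
+ * Editorial worked example, not part of the original patch, assuming 4K pages
+ * and REISER4_MAX_ZTREE_HEIGHT == 10: a 64K logical cluster spans 16 pages,
+ * so estimate_cluster(inode, 0) = 3 + 16 + max_balance_overhead(19, 10)
+ * = 19 + (10 * 2 + 4 + ((103 * 19) >> 10)) = 19 + 25 = 44, which matches the
+ * "44, for 64K-cluster" note above.
+ */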
10192+
10193+/* How many nodes occupied by a disk cluster might get dirty.
10194+   Note that this estimation is not precise (i.e. a disk cluster
10195+   can occupy more nodes).
10196+   Q: Why don't we use a precise estimation?
10197+   A: 1. Because a precise estimation is fairly bad: 65536 nodes
10198+	 for a 64K logical cluster means 256M of dead space on
10199+	 a partition.
10200+      2. It is a very rare case when a disk cluster occupies more
10201+	 nodes than this estimation returns.
10202+*/
10203+reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
10204+{
10205+ return cluster_nrpages(inode) + 4;
10206+}
10207+
10208+/* Make Linus happy.
10209+ Local variables:
10210+ c-indentation-style: "K&R"
10211+ mode-name: "LC"
10212+ c-basic-offset: 8
10213+ tab-width: 8
10214+ fill-column: 120
10215+ scroll-step: 1
10216+ End:
10217+*/
10218diff -urN linux-2.6.24.orig/fs/reiser4/export_ops.c linux-2.6.24/fs/reiser4/export_ops.c
10219--- linux-2.6.24.orig/fs/reiser4/export_ops.c 1970-01-01 03:00:00.000000000 +0300
10220+++ linux-2.6.24/fs/reiser4/export_ops.c 2008-01-25 12:03:29.960445090 +0300
10221@@ -0,0 +1,319 @@
10222+/* Copyright 2005 by Hans Reiser, licensing governed by
10223+ * reiser4/README */
10224+
10225+#include "inode.h"
10226+#include "plugin/plugin.h"
10227+
10228+/*
10229+ * Supported file-handle types
10230+ */
10231+typedef enum {
10232+ FH_WITH_PARENT = 0x10, /* file handle with parent */
10233+ FH_WITHOUT_PARENT = 0x11 /* file handle without parent */
10234+} reiser4_fhtype;
10235+
10236+#define NFSERROR (255)
10237+
10238+/* initialize place-holder for object */
10239+static void object_on_wire_init(reiser4_object_on_wire *o)
10240+{
10241+ o->plugin = NULL;
10242+}
10243+
10244+/* finish with @o */
10245+static void object_on_wire_done(reiser4_object_on_wire *o)
10246+{
10247+ if (o->plugin != NULL)
10248+ o->plugin->wire.done(o);
10249+}
10250+
10251+/*
10252+ * read serialized object identity from @addr and store information about
10253+ * object in @obj. This is dual to encode_inode().
10254+ */
10255+static char *decode_inode(struct super_block *s, char *addr,
10256+ reiser4_object_on_wire * obj)
10257+{
10258+ file_plugin *fplug;
10259+
10260+ /* identifier of object plugin is stored in the first two bytes,
10261+ * followed by... */
10262+ fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr);
10263+ if (fplug != NULL) {
10264+ addr += sizeof(d16);
10265+ obj->plugin = fplug;
10266+ assert("nikita-3520", fplug->wire.read != NULL);
10267+ /* plugin specific encoding of object identity. */
10268+ addr = fplug->wire.read(addr, obj);
10269+ } else
10270+ addr = ERR_PTR(RETERR(-EINVAL));
10271+ return addr;
10272+}
10273+
10274+static struct dentry *reiser4_get_dentry(struct super_block *super,
10275+ void *data);
10276+/**
10277+ * reiser4_decode_fh: decode on-wire object - helper function
10278+ * for fh_to_dentry, fh_to_parent export operations;
10279+ * @super: super block;
10280+ * @addr: on-wire object to be decoded;
10281+ *
10282+ * Returns dentry referring to the object being decoded.
10283+ */
10284+static struct dentry *reiser4_decode_fh(struct super_block * super,
10285+ char * addr)
10286+{
10287+ reiser4_object_on_wire object;
10288+
10289+ object_on_wire_init(&object);
10290+
10291+ addr = decode_inode(super, addr, &object);
10292+ if (!IS_ERR(addr)) {
10293+ struct dentry *d;
10294+ d = reiser4_get_dentry(super, &object);
10295+ if (d != NULL && !IS_ERR(d))
10296+ /* FIXME check for -ENOMEM */
10297+ reiser4_get_dentry_fsdata(d)->stateless = 1;
10298+ addr = (char *)d;
10299+ }
10300+ object_on_wire_done(&object);
10301+ return (void *)addr;
10302+}
10303+
10304+static struct dentry *reiser4_fh_to_dentry(struct super_block *sb,
10305+ struct fid *fid,
10306+ int fh_len, int fh_type)
10307+{
10308+ reiser4_context *ctx;
10309+ struct dentry *d;
10310+
10311+ assert("edward-1536",
10312+ fh_type == FH_WITH_PARENT || fh_type == FH_WITHOUT_PARENT);
10313+
10314+ ctx = reiser4_init_context(sb);
10315+ if (IS_ERR(ctx))
10316+ return (struct dentry *)ctx;
10317+
10318+ d = reiser4_decode_fh(sb, (char *)fid->raw);
10319+
10320+ reiser4_exit_context(ctx);
10321+ return d;
10322+}
10323+
10324+static struct dentry *reiser4_fh_to_parent(struct super_block *sb,
10325+ struct fid *fid,
10326+ int fh_len, int fh_type)
10327+{
10328+ char * addr;
10329+ struct dentry * d;
10330+ reiser4_context *ctx;
10331+ file_plugin *fplug;
10332+
10333+ if (fh_type == FH_WITHOUT_PARENT)
10334+ return NULL;
10335+ assert("edward-1537", fh_type == FH_WITH_PARENT);
10336+
10337+ ctx = reiser4_init_context(sb);
10338+ if (IS_ERR(ctx))
10339+ return (struct dentry *)ctx;
10340+ addr = (char *)fid->raw;
10341+ /* extract 2-bytes file plugin id */
10342+ fplug = file_plugin_by_disk_id(reiser4_get_tree(sb), (d16 *)addr);
10343+ if (fplug == NULL) {
10344+ d = ERR_PTR(RETERR(-EINVAL));
10345+ goto exit;
10346+ }
10347+ addr += sizeof(d16);
10348+ /* skip previously encoded object */
10349+ addr = fplug->wire.read(addr, NULL /* skip */);
10350+ if (IS_ERR(addr)) {
10351+ d = (struct dentry *)addr;
10352+ goto exit;
10353+ }
10354+	/* extract and decode the parent object */
10355+ d = reiser4_decode_fh(sb, addr);
10356+ exit:
10357+ reiser4_exit_context(ctx);
10358+ return d;
10359+}
10360+
10361+/*
10362+ * Object serialization support.
10363+ *
10364+ * To support knfsd, the file system provides export_operations that are used
10365+ * to construct and interpret NFS file handles. As a generalization of this,
10366+ * reiser4 object plugins have serialization support: it provides methods to
10367+ * create on-wire representation of identity of reiser4 object, and
10368+ * re-create/locate object given its on-wire identity.
10369+ *
10370+ */
10371+
10372+/*
10373+ * return number of bytes that on-wire representation of @inode's identity
10374+ * consumes.
10375+ */
10376+static int encode_inode_size(struct inode *inode)
10377+{
10378+ assert("nikita-3514", inode != NULL);
10379+ assert("nikita-3515", inode_file_plugin(inode) != NULL);
10380+ assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10381+
10382+ return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10383+}
10384+
10385+/*
10386+ * store on-wire representation of @inode's identity at the area beginning at
10387+ * @start.
10388+ */
10389+static char *encode_inode(struct inode *inode, char *start)
10390+{
10391+ assert("nikita-3517", inode != NULL);
10392+ assert("nikita-3518", inode_file_plugin(inode) != NULL);
10393+ assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10394+
10395+ /*
10396+ * first, store two-byte identifier of object plugin, then
10397+ */
10398+ save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10399+ (d16 *) start);
10400+ start += sizeof(d16);
10401+ /*
10402+ * call plugin to serialize object's identity
10403+ */
10404+ return inode_file_plugin(inode)->wire.write(inode, start);
10405+}
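+
+/*
+ * Editorial sketch, not part of the original patch: the on-wire layout
+ * produced by encode_inode() and consumed by decode_inode(), per the
+ * comments above:
+ *
+ *	+----------------+--------------------------------------------+
+ *	| d16 plugin id  | plugin-specific identity (wire.write/read) |
+ *	+----------------+--------------------------------------------+
+ */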
10406+
10407+/* this stores in *lenp the number of 32-bit words used by the file handle and
10408+ * returns the handle type; 255 (NFSERROR) is returned if the file handle cannot be stored */
10409+/**
10410+ * reiser4_encode_fh - encode_fh of export operations
10411+ * @dentry: dentry of the object to serialize
10412+ * @fh: buffer to store the file handle in
10413+ * @lenp: in: capacity of @fh in 32-bit words, out: words actually used
10414+ * @need_parent: if non-zero, encode the parent object as well
10415+ *
10416+ */
10417+static int
10418+reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10419+ int need_parent)
10420+{
10421+ struct inode *inode;
10422+ struct inode *parent;
10423+ char *addr;
10424+ int need;
10425+ int delta;
10426+ int result;
10427+ reiser4_context *ctx;
10428+
10429+ /*
10430+	 * knfsd asks us to serialize the object in @dentry and, optionally, its
10431+	 * parent (if need_parent != 0).
10432+	 *
10433+	 * encode_inode() and encode_inode_size() are used to build the
10434+	 * representation of the object and its parent. All hard work is done by
10435+ * object plugins.
10436+ */
10437+ inode = dentry->d_inode;
10438+ parent = dentry->d_parent->d_inode;
10439+
10440+ addr = (char *)fh;
10441+
10442+ need = encode_inode_size(inode);
10443+ if (need < 0)
10444+ return NFSERROR;
10445+ if (need_parent) {
10446+ delta = encode_inode_size(parent);
10447+ if (delta < 0)
10448+ return NFSERROR;
10449+ need += delta;
10450+ }
10451+
10452+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
10453+ if (IS_ERR(ctx))
10454+ return PTR_ERR(ctx);
10455+
10456+ if (need <= sizeof(__u32) * (*lenp)) {
10457+ addr = encode_inode(inode, addr);
10458+ if (need_parent)
10459+ addr = encode_inode(parent, addr);
10460+
10461+		/* store in *lenp the number of 32-bit words required for the
10462+		 * file handle. */
10463+ *lenp = (need + sizeof(__u32) - 1) >> 2;
10464+ result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10465+ } else
10466+		/* not enough space in the file handle */
10467+ result = NFSERROR;
10468+ reiser4_exit_context(ctx);
10469+ return result;
10470+}
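+
+/*
+ * Editorial worked example, not part of the original patch: if the object
+ * identity takes need == 11 bytes and the caller's buffer holds *lenp == 4
+ * 32-bit words (16 bytes), the handle fits and *lenp is set to
+ * (11 + 4 - 1) >> 2 = 3 words; knfsd later hands that length back as fh_len.
+ */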
10471+
10472+/**
10473+ * reiser4_get_dentry_parent - get_parent of export operations
10474+ * @child:
10475+ *
10476+ */
10477+static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10478+{
10479+ struct inode *dir;
10480+ dir_plugin *dplug;
10481+
10482+ assert("nikita-3527", child != NULL);
10483+ /* see comment in reiser4_get_dentry() about following assertion */
10484+ assert("nikita-3528", is_in_reiser4_context());
10485+
10486+ dir = child->d_inode;
10487+ assert("nikita-3529", dir != NULL);
10488+ dplug = inode_dir_plugin(dir);
10489+ assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10490+ if (dplug != NULL)
10491+ return dplug->get_parent(dir);
10492+ else
10493+ return ERR_PTR(RETERR(-ENOTDIR));
10494+}
10495+
10496+/**
10497+ * reiser4_get_dentry - get_dentry of export operations
10498+ * @super:
10499+ * @data:
10500+ *
10501+ *
10502+ */
10503+static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10504+{
10505+ reiser4_object_on_wire *o;
10506+
10507+ assert("nikita-3522", super != NULL);
10508+ assert("nikita-3523", data != NULL);
10509+ /*
10510+ * this is only supposed to be called by
10511+ *
10512+ * reiser4_decode_fh->find_exported_dentry
10513+ *
10514+ * so, reiser4_context should be here already.
10515+ */
10516+ assert("nikita-3526", is_in_reiser4_context());
10517+
10518+ o = (reiser4_object_on_wire *)data;
10519+ assert("nikita-3524", o->plugin != NULL);
10520+ assert("nikita-3525", o->plugin->wire.get != NULL);
10521+
10522+ return o->plugin->wire.get(super, o);
10523+}
10524+
10525+struct export_operations reiser4_export_operations = {
10526+ .encode_fh = reiser4_encode_fh,
10527+ .fh_to_dentry = reiser4_fh_to_dentry,
10528+ .fh_to_parent = reiser4_fh_to_parent,
10529+ .get_parent = reiser4_get_dentry_parent,
10530+};
10531+
10532+/*
10533+ * Local variables:
10534+ * c-indentation-style: "K&R"
10535+ * mode-name: "LC"
10536+ * c-basic-offset: 8
10537+ * tab-width: 8
10538+ * fill-column: 79
10539+ * End:
10540+ */
10541diff -urN linux-2.6.24.orig/fs/reiser4/flush.c linux-2.6.24/fs/reiser4/flush.c
10542--- linux-2.6.24.orig/fs/reiser4/flush.c 1970-01-01 03:00:00.000000000 +0300
10543+++ linux-2.6.24/fs/reiser4/flush.c 2008-01-25 11:39:06.000000000 +0300
10544@@ -0,0 +1,3625 @@
10545+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10546+
10547+/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
10548+
10549+#include "forward.h"
10550+#include "debug.h"
10551+#include "dformat.h"
10552+#include "key.h"
10553+#include "coord.h"
10554+#include "plugin/item/item.h"
10555+#include "plugin/plugin.h"
10556+#include "plugin/object.h"
10557+#include "txnmgr.h"
10558+#include "jnode.h"
10559+#include "znode.h"
10560+#include "block_alloc.h"
10561+#include "tree_walk.h"
10562+#include "carry.h"
10563+#include "tree.h"
10564+#include "vfs_ops.h"
10565+#include "inode.h"
10566+#include "page_cache.h"
10567+#include "wander.h"
10568+#include "super.h"
10569+#include "entd.h"
10570+#include "reiser4.h"
10571+#include "flush.h"
10572+#include "writeout.h"
10573+
10574+#include <asm/atomic.h>
10575+#include <linux/fs.h> /* for struct super_block */
10576+#include <linux/mm.h> /* for struct page */
10577+#include <linux/bio.h> /* for struct bio */
10578+#include <linux/pagemap.h>
10579+#include <linux/blkdev.h>
10580+
10581+/* IMPLEMENTATION NOTES */
10582+
10583+/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
10584+ order to the nodes of the tree in which the parent is placed before its children, which
10585+ are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it
10586+ describes the node that "came before in forward parent-first order". When we speak of a
10587+ "parent-first follower", it describes the node that "comes next in parent-first
10588+ order" (alternatively the node that "came before in reverse parent-first order").
10589+
10590+ The following pseudo-code prints the nodes of a tree in forward parent-first order:
10591+
10592+ void parent_first (node)
10593+ {
10594+ print_node (node);
10595+ if (node->level > leaf) {
10596+ for (i = 0; i < num_children; i += 1) {
10597+ parent_first (node->child[i]);
10598+ }
10599+ }
10600+ }
10601+*/
10602+
10603+/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so
10604+ that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
10605+ can be accomplished with sequential reads, which results in reading nodes in their
10606+ parent-first order. This is a read-optimization aspect of the flush algorithm, and
10607+ there is also a write-optimization aspect, which is that we wish to make large
10608+ sequential writes to the disk by allocating or reallocating blocks so that they can be
10609+ written in sequence. Sometimes the read-optimization and write-optimization goals
10610+ conflict with each other, as we discuss in more detail below.
10611+*/
10612+
10613+/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are
10614+ the relevant jnode->state bits and their relevence to flush:
10615+
10616+ JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it
10617+ must be allocated first. In order to be considered allocated, the jnode must have
10618+ exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and
10619+ all dirtied jnodes eventually have one of these bits set during each transaction.
10620+
10621+ JNODE_CREATED: The node was freshly created in its transaction and has no previous
10622+ block address, so it is unconditionally assigned to be relocated, although this is
10623+ mainly for code-convenience. It is not being 'relocated' from anything, but in
10624+ almost every regard it is treated as part of the relocate set. The JNODE_CREATED bit
10625+ remains set even after JNODE_RELOC is set, so the actual relocate can be
10626+ distinguished from the created-and-allocated set easily: relocate-set members
10627+ (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
10628+ have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
10629+
10630+ JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
10631+ decision to maintain the pre-existing location for this node and it will be written
10632+ to the wandered-log.
10633+
10634+ JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
10635+ not created, see note above). A block with JNODE_RELOC set is eligible for
10636+ early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC
10637+ bit is set on a znode, the parent node's internal item is modified and the znode is
10638+ rehashed.
10639+
10640+   JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
10641+   and calls the plugin->f.squeeze() method for its items. This is how disk clusters of
10642+   cryptcompress objects are updated. Also, if the leftmost point found by the flush scan
10643+   has this flag set (races with write(), a rare case), the flush algorithm decides
10644+   to pass it to squalloc() in spite of its flushprepped status - for squeezing, not for
10645+   repeated allocation.
10646+
10647+ JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
10648+ flush queue. This means the jnode is not on any clean or dirty list, instead it is
10649+ moved to one of the flush queue (see flush_queue.h) object private list. This
10650+ prevents multiple concurrent flushes from attempting to start flushing from the
10651+ same node.
10652+
10653+ (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
10654+ squeeze-and-allocate on a node while its children are actively being squeezed and
10655+ allocated. This flag was created to avoid submitting a write request for a node
10656+ while its children are still being allocated and squeezed. Then flush queue was
10657+   re-implemented to allow an unlimited number of nodes to be queued. This flag support was
10658+ commented out in source code because we decided that there was no reason to submit
10659+ queued nodes before jnode_flush() finishes. However, current code calls fq_write()
10660+ during a slum traversal and may submit "busy nodes" to disk. Probably we can
10661+ re-enable the JNODE_FLUSH_BUSY bit support in future.
10662+
10663+ With these state bits, we describe a test used frequently in the code below,
10664+ jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). The
10665+ test for "flushprepped" returns true if any of the following are true:
10666+
10667+ - The node is not dirty
10668+ - The node has JNODE_RELOC set
10669+ - The node has JNODE_OVRWR set
10670+
10671+ If either the node is not dirty or it has already been processed by flush (and assigned
10672+ JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns
10673+   false then flush has work to do on that node.
10674+*/
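+
+/* Editorial sketch, not part of the original patch: the "flushprepped" test
+   described above, spelled out. JF_ISSET() and the JNODE_* bits are the names
+   this patch uses (see jnode.h); the helper itself is illustrative and merely
+   duplicates jnode_is_flushprepped():
+
+	static inline int flushprepped_sketch(jnode * node)
+	{
+		return !JF_ISSET(node, JNODE_DIRTY) ||
+		    JF_ISSET(node, JNODE_RELOC) || JF_ISSET(node, JNODE_OVRWR);
+	}
+*/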
10675+
10676+/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
10677+ flushprepped twice (unless an explicit call to flush_unprep is made as described in
10678+ detail below). For example a node is dirtied, allocated, and then early-flushed to
10679+ disk and set clean. Before the transaction commits, the page is dirtied again and, due
10680+ to memory pressure, the node is flushed again. The flush algorithm will not relocate
10681+ the node to a new disk location, it will simply write it to the same, previously
10682+ relocated position again.
10683+*/
10684+
10685+/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
10686+ start at a leaf node and allocate in parent-first order by iterating to the right. At
10687+ each step of the iteration, we check for the right neighbor. Before advancing to the
10688+ right neighbor, we check if the current position and the right neighbor share the same
10689+ parent. If they do not share the same parent, the parent is allocated before the right
10690+ neighbor.
10691+
10692+   This process goes recursively up the tree and squeezes nodes level by level as long as
10693+ the right neighbor and the current position have different parents, then it allocates
10694+ the right-neighbors-with-different-parents on the way back down. This process is
10695+ described in more detail in flush_squalloc_changed_ancestor and the recursive function
10696+   squalloc_one_changed_ancestor. But the purpose here is not so much to discuss the
10697+   specifics of the bottom-up approach as to contrast the bottom-up and top-down
10698+ approaches.
10699+
10700+ The top-down algorithm was implemented earlier (April-May 2002). In the top-down
10701+ approach, we find a starting point by scanning left along each level past dirty nodes,
10702+ then going up and repeating the process until the left node and the parent node are
10703+ clean. We then perform a parent-first traversal from the starting point, which makes
10704+ allocating in parent-first order trivial. After one subtree has been allocated in this
10705+ manner, we move to the right, try moving upward, then repeat the parent-first
10706+ traversal.
10707+
10708+ Both approaches have problems that need to be addressed. Both are approximately the
10709+ same amount of code, but the bottom-up approach has advantages in the order it acquires
10710+ locks which, at the very least, make it the better approach. At first glance each one
10711+ makes the other one look simpler, so it is important to remember a few of the problems
10712+ with each one.
10713+
10714+ Main problem with the top-down approach: When you encounter a clean child during the
10715+ parent-first traversal, what do you do? You would like to avoid searching through a
10716+ large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
10717+ obvious solution. One of the advantages of the top-down approach is that during the
10718+ parent-first traversal you check every child of a parent to see if it is dirty. In
10719+ this way, the top-down approach easily handles the main problem of the bottom-up
10720+ approach: unallocated children.
10721+
10722+ The unallocated children problem is that before writing a node to disk we must make
10723+   sure that all of its children are allocated. Otherwise, writing the node means
10724+ extra I/O because the node will have to be written again when the child is finally
10725+ allocated.
10726+
10727+ WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs, this
10728+   should not cause any file system corruption; it only degrades I/O performance because a
10729+ node may be written when it is sure to be written at least one more time in the same
10730+ transaction when the remaining children are allocated. What follows is a description
10731+ of how we will solve the problem.
10732+*/
10733+
10734+/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
10735+ proceeding in parent first order, allocate some of its left-children, then encounter a
10736+ clean child in the middle of the parent. We do not allocate the clean child, but there
10737+ may remain unallocated (dirty) children to the right of the clean child. If we were to
10738+ stop flushing at this moment and write everything to disk, the parent might still
10739+ contain unallocated children.
10740+
10741+   We could try to allocate all the descendants of every node that we allocate, but this
10742+ is not necessary. Doing so could result in allocating the entire tree: if the root
10743+ node is allocated then every unallocated node would have to be allocated before
10744+ flushing. Actually, we do not have to write a node just because we allocate it. It is
10745+ possible to allocate but not write a node during flush, when it still has unallocated
10746+ children. However, this approach is probably not optimal for the following reason.
10747+
10748+ The flush algorithm is designed to allocate nodes in parent-first order in an attempt
10749+ to optimize reads that occur in the same order. Thus we are read-optimizing for a
10750+ left-to-right scan through all the leaves in the system, and we are hoping to
10751+ write-optimize at the same time because those nodes will be written together in batch.
10752+ What happens, however, if we assign a block number to a node in its read-optimized
10753+ order but then avoid writing it because it has unallocated children? In that
10754+ situation, we lose out on the write-optimization aspect because a node will have to be
10755+ written again to the its location on the device, later, which likely means seeking back
10756+   written again to its location on the device, later, which likely means seeking back
10757+
10758+ So there are tradeoffs. We can choose either:
10759+
10760+ A. Allocate all unallocated children to preserve both write-optimization and
10761+ read-optimization, but this is not always desirable because it may mean having to
10762+ allocate and flush very many nodes at once.
10763+
10764+ B. Defer writing nodes with unallocated children, keep their read-optimized locations,
10765+ but sacrifice write-optimization because those nodes will be written again.
10766+
10767+ C. Defer writing nodes with unallocated children, but do not keep their read-optimized
10768+ locations. Instead, choose to write-optimize them later, when they are written. To
10769+ facilitate this, we "undo" the read-optimized allocation that was given to the node so
10770+ that later it can be write-optimized, thus "unpreparing" the flush decision. This is a
10771+ case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a
10772+ call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
10773+ if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
10774+ location, and set the JNODE_CREATED bit, effectively setting the node back to an
10775+ unallocated state.
10776+
10777+ We will take the following approach in v4.0: for twig nodes we will always finish
10778+ allocating unallocated children (A). For nodes with (level > TWIG) we will defer
10779+ writing and choose write-optimization (C).
10780+
10781+ To summarize, there are several parts to a solution that avoids the problem with
10782+ unallocated children:
10783+
10784+   FIXME-ZAM: Still, no approach has been implemented to eliminate the "UNALLOCATED CHILDREN"
10785+   problem, because an experiment showed that we have only 1-2 nodes
10786+   with unallocated children per thousands of written nodes. The experiment was simple,
10787+   like copying / deleting Linux kernel sources. However, the problem can arise in more
10788+ complex tests. I think we have jnode_io_hook to insert a check for unallocated
10789+ children and see what kind of problem we have.
10790+
10791+ 1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
10792+ squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to
10793+ implement: should be simple -- amounts to adding a while loop to jnode_flush, see
10794+ comments in that function.
10795+
10796+ 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
10797+ have unallocated children. If the twig level has unallocated children it is an
10798+ assertion failure. If a higher-level node has unallocated children, then it should be
10799+ explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement:
10800+ should be simple.
10801+
10802+ 3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
10803+ CPU cycles than we would like, and it is possible (but medium complexity) to optimize
10804+ this somewhat in the case where large sub-trees are flushed. The following observation
10805+ helps: if both the left- and right-neighbor of a node are processed by the flush
10806+ algorithm then the node itself is guaranteed to have all of its children allocated.
10807+ However, the cost of this check may not be so expensive after all: it is not needed for
10808+ leaves and flush can guarantee this property for twigs. That leaves only (level >
10809+ TWIG) nodes that have to be checked, so this optimization only helps if at least three
10810+ (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
10811+ there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes
10812+ then the number of blocks being written will be very large, so the savings may be
10813+ insignificant. That said, the idea is to maintain both the left and right edges of
10814+ nodes that are processed in flush. When flush_empty_queue() is called, a relatively
10815+ simple test will tell whether the (level > TWIG) node is on the edge. If it is on the
10816+ edge, the slow check is necessary, but if it is in the interior then it can be assumed
10817+ to have all of its children allocated. FIXME: medium complexity to implement, but
10818+ simple to verify given that we must have a slow check anyway.
10819+
10820+ 4. (Optional) This part is optional, not for v4.0--flush should work independently of
10821+ whether this option is used or not. Called RAPID_SCAN, the idea is to amend the
10822+ left-scan operation to take unallocated children into account. Normally, the left-scan
10823+ operation goes left as long as adjacent nodes are dirty up until some large maximum
10824+ value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left
10825+ may stop at a position where there are unallocated children to the left with the same
10826+ parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
10827+   FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
10828+ with a rapid scan. The rapid scan skips all the interior children of a node--if the
10829+ leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
10830+ twig to the left). If the left neighbor of the leftmost child is also dirty, then
10831+ continue the scan at the left twig and repeat. This option will cause flush to
10832+ allocate more twigs in a single pass, but it also has the potential to write many more
10833+ nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN
10834+ was partially implemented, code removed August 12, 2002 by JMACD.
10835+*/
10836+
10837+/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the
10838+ starting point for flush is a leaf node, but actually the flush code cares very little
10839+ about whether or not this is true. It is possible that all the leaf nodes are flushed
10840+ and dirty parent nodes still remain, in which case jnode_flush() is called on a
10841+ non-leaf argument. Flush doesn't care--it treats the argument node as if it were a
10842+ leaf, even when it is not. This is a simple approach, and there may be a more optimal
10843+ policy but until a problem with this approach is discovered, simplest is probably best.
10844+
10845+ NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
10846+ the leaves. This is done as a matter of simplicity and there is only one (shaky)
10847+ justification. When an atom commits, it flushes all leaf level nodes first, followed
10848+ by twigs, and so on. With flushing done in this order, if flush is eventually called
10849+ on a non-leaf node it means that (somehow) we reached a point where all leaves are
10850+   clean and only internal nodes need to be flushed. If that is the case, then it means
10851+ there were no leaves that were the parent-first preceder/follower of the parent. This
10852+ is expected to be a rare case, which is why we do nothing special about it. However,
10853+ memory pressure may pass an internal node to flush when there are still dirty leaf
10854+ nodes that need to be flushed, which could prove our original assumptions
10855+ "inoperative". If this needs to be fixed, then scan_left/right should have
10856+ special checks for the non-leaf levels. For example, instead of passing from a node to
10857+ the left neighbor, it should pass from the node to the left neighbor's rightmost
10858+ descendent (if dirty).
10859+
10860+*/
10861+
10862+/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting
10863+ it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the
10864+ logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
10865+ device if we are walking from right to left. We then make passes in alternating directions, and as we do this the
10866+ device becomes sorted such that tree order and block number order fully correlate.
10867+
10868+ Resizing is done by shifting everything either all the way to the left or all the way
10869+ to the right, and then reporting the last block.
10870+*/
10871+
10872+/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This
10873+   describes the policy from the highest level:
10874+
10875+ The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
10876+ leaf level during flush-scan (right, left), then we unconditionally decide to relocate
10877+ leaf nodes.
10878+
10879+ Otherwise, there are two contexts in which we make a decision to relocate:
10880+
10881+ 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
10882+ During the initial stages of flush, after scan-right completes, we want to ask the
10883+ question: should we relocate this leaf node and thus dirty the parent node. Then if
10884+ the node is a leftmost child its parent is its own parent-first preceder, thus we repeat
10885+ the question at the next level up, and so on. In these cases we are moving in the
10886+ reverse-parent first direction.
10887+
10888+ There is another case which is considered the reverse direction, which comes at the end
10889+ of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may
10890+ reach a point where there is a clean twig to the right with a dirty leftmost child. In
10891+ this case, we may wish to relocate the child by testing if it should be relocated
10892+ relative to its parent.
10893+
10894+ 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
10895+ allocate_znode. What distinguishes the forward parent-first case from the
10896+ reverse-parent first case is that the preceder has already been allocated in the
10897+ forward case, whereas in the reverse case we don't know what the preceder is until we
10898+ finish "going in reverse". That simplifies the forward case considerably, and there we
10899+ actually use the block allocator to determine whether, e.g., a block closer to the
10900+ preceder is available.
10901+*/
10902+
10903+/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we
10904+ finish scan-left and find a starting point, if the parent's left neighbor is dirty then
10905+ squeeze the parent's left neighbor and the parent. This may change the
10906+ flush-starting-node's parent. Repeat until the child's parent is stable. If the child
10907+ is a leftmost child, repeat this left-edge squeezing operation at the next level up.
10908+ Note that we cannot allocate extents during this or they will be out of parent-first
10909+   order. There are also some difficult coordinate maintenance issues. We can't do a tree
10910+ search to find coordinates again (because we hold locks), we have to determine them
10911+ from the two nodes being squeezed. Looks difficult, but has potential to increase
10912+ space utilization. */
10913+
10914+/* Flush-scan helper functions. */
10915+static void scan_init(flush_scan * scan);
10916+static void scan_done(flush_scan * scan);
10917+
10918+/* Flush-scan algorithm. */
10919+static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
10920+ unsigned limit);
10921+static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
10922+static int scan_common(flush_scan * scan, flush_scan * other);
10923+static int scan_formatted(flush_scan * scan);
10924+static int scan_unformatted(flush_scan * scan, flush_scan * other);
10925+static int scan_by_coord(flush_scan * scan);
10926+
10927+/* Initial flush-point ancestor allocation. */
10928+static int alloc_pos_and_ancestors(flush_pos_t * pos);
10929+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
10930+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
10931+
10932+/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */
10933+static int squalloc(flush_pos_t * pos);
10934+
10935+/* Flush squeeze implementation. */
10936+static int squeeze_right_non_twig(znode * left, znode * right);
10937+static int shift_one_internal_unit(znode * left, znode * right);
10938+
10939+/* Flush reverse parent-first relocation routines. */
10940+static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
10941+ const reiser4_block_nr * nblk);
10942+static int reverse_relocate_test(jnode * node, const coord_t * parent_coord,
10943+ flush_pos_t * pos);
10944+static int reverse_relocate_check_dirty_parent(jnode * node,
10945+ const coord_t * parent_coord,
10946+ flush_pos_t * pos);
10947+
10948+/* Flush allocate write-queueing functions: */
10949+static int allocate_znode(znode * node, const coord_t * parent_coord,
10950+ flush_pos_t * pos);
10951+static int allocate_znode_update(znode * node, const coord_t * parent_coord,
10952+ flush_pos_t * pos);
10953+static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
10954+
10955+/* Flush helper functions: */
10956+static int jnode_lock_parent_coord(jnode * node,
10957+ coord_t * coord,
10958+ lock_handle * parent_lh,
10959+ load_count * parent_zh,
10960+ znode_lock_mode mode, int try);
10961+static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
10962+ znode_lock_mode mode, int check_dirty, int expected);
10963+static int znode_same_parents(znode * a, znode * b);
10964+
10965+static int znode_check_flushprepped(znode * node)
10966+{
10967+ return jnode_check_flushprepped(ZJNODE(node));
10968+}
10969+
10970+/* Flush position functions */
10971+static void pos_init(flush_pos_t * pos);
10972+static int pos_valid(flush_pos_t * pos);
10973+static void pos_done(flush_pos_t * pos);
10974+static int pos_stop(flush_pos_t * pos);
10975+
10976+/* check that @org is the first jnode of the extent unit, if the extent is unallocated,
10977+ * because all jnodes of an unallocated extent are dirty and of the same atom. */
10978+#define checkchild(scan) \
10979+assert("nikita-3435", \
10980+ ergo(scan->direction == LEFT_SIDE && \
10981+ (scan->parent_coord.node->level == TWIG_LEVEL) && \
10982+ jnode_is_unformatted(scan->node) && \
10983+ extent_is_unallocated(&scan->parent_coord), \
10984+ extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
10985+
10986+/* This flush_cnt variable is used to track the number of concurrent flush operations,
10987+ useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has
10988+ no static initializer function...) */
10989+ON_DEBUG(atomic_t flush_cnt;
10990+ )
10991+
10992+/* check fs backing device for write congestion */
10993+static int check_write_congestion(void)
10994+{
10995+ struct super_block *sb;
10996+ struct backing_dev_info *bdi;
10997+
10998+ sb = reiser4_get_current_sb();
10999+ bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info;
11000+ return bdi_write_congested(bdi);
11001+}
11002+
11003+/* conditionally write flush queue */
11004+static int write_prepped_nodes(flush_pos_t * pos)
11005+{
11006+ int ret;
11007+
11008+ assert("zam-831", pos);
11009+ assert("zam-832", pos->fq);
11010+
11011+ if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
11012+ return 0;
11013+
11014+ if (check_write_congestion())
11015+ return 0;
11016+
11017+ ret = reiser4_write_fq(pos->fq, pos->nr_written,
11018+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11019+ return ret;
11020+}
11021+
11022+/* Properly release all flush pos. resources, then move the flush position to
11023+   the new locked node */
11024+static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock,
11025+ load_count * new_load, const coord_t * new_coord)
11026+{
11027+ assert("zam-857", new_lock->node == new_load->node);
11028+
11029+ if (new_coord) {
11030+ assert("zam-858", new_coord->node == new_lock->node);
11031+ coord_dup(&pos->coord, new_coord);
11032+ } else {
11033+ coord_init_first_unit(&pos->coord, new_lock->node);
11034+ }
11035+
11036+ if (pos->child) {
11037+ jput(pos->child);
11038+ pos->child = NULL;
11039+ }
11040+
11041+ move_load_count(&pos->load, new_load);
11042+ done_lh(&pos->lock);
11043+ move_lh(&pos->lock, new_lock);
11044+}
11045+
11046+/* delete an empty node whose link from the parent still exists. */
11047+static int delete_empty_node(znode * node)
11048+{
11049+ reiser4_key smallest_removed;
11050+
11051+ assert("zam-1019", node != NULL);
11052+ assert("zam-1020", node_is_empty(node));
11053+ assert("zam-1023", znode_is_wlocked(node));
11054+
11055+ return reiser4_delete_node(node, &smallest_removed, NULL, 1);
11056+}
11057+
11058+/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
11059+static int prepare_flush_pos(flush_pos_t * pos, jnode * org)
11060+{
11061+ int ret;
11062+ load_count load;
11063+ lock_handle lock;
11064+
11065+ init_lh(&lock);
11066+ init_load_count(&load);
11067+
11068+ if (jnode_is_znode(org)) {
11069+ ret = longterm_lock_znode(&lock, JZNODE(org),
11070+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
11071+ if (ret)
11072+ return ret;
11073+
11074+ ret = incr_load_count_znode(&load, JZNODE(org));
11075+ if (ret)
11076+ return ret;
11077+
11078+ pos->state =
11079+ (jnode_get_level(org) ==
11080+ LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
11081+ move_flush_pos(pos, &lock, &load, NULL);
11082+ } else {
11083+ coord_t parent_coord;
11084+ ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
11085+ &load, ZNODE_WRITE_LOCK, 0);
11086+ if (ret)
11087+ goto done;
11088+ if (!item_is_extent(&parent_coord)) {
11089+			/* the file was converted to tail items, @org became
11090+			   HEARD_BANSHEE, and we found an internal item */
11091+ ret = -EAGAIN;
11092+ goto done;
11093+ }
11094+
11095+ pos->state = POS_ON_EPOINT;
11096+ move_flush_pos(pos, &lock, &load, &parent_coord);
11097+ pos->child = jref(org);
11098+ if (extent_is_unallocated(&parent_coord)
11099+ && extent_unit_index(&parent_coord) != index_jnode(org)) {
11100+			/* @org is not the first child of its parent unit. This may happen
11101+			   because the long term lock on its parent node was released between
11102+			   scan_left and scan_right. For now, work around this by having flush repeat. */
11103+ ret = -EAGAIN;
11104+ }
11105+ }
11106+
11107+ done:
11108+ done_load_count(&load);
11109+ done_lh(&lock);
11110+ return ret;
11111+}
11112+
11113+/* TODO LIST (no particular order): */
11114+/* I have labelled most of the legitimate FIXME comments in this file with letters to
11115+ indicate which issue they relate to. There are a few miscellaneous FIXMEs with
11116+ specific names mentioned instead that need to be inspected/resolved. */
11117+/* B. There is an issue described in reverse_relocate_test having to do with an
11118+   imprecise is_preceder? check involving partially-dirty extents. The code that
11119+ sets preceder hints and computes the preceder is basically untested. Careful testing
11120+ needs to be done that preceder calculations are done correctly, since if it doesn't
11121+ affect correctness we will not catch this stuff during regular testing. */
11122+/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are
11123+ considered expected but unlikely conditions. Flush currently returns 0 (i.e., success
11124+ but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
11125+ Many of the calls that may produce one of these return values (i.e.,
11126+ longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
11127+ values themselves and, for instance, stop flushing instead of resulting in a restart.
11128+ If any of these results are true error conditions then flush will go into a busy-loop,
11129+ as we noticed during testing when a corrupt tree caused find_child_ptr to return
11130+ ENOENT. It needs careful thought and testing of corner conditions.
11131+*/
11132+/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created
11133+ block is assigned a block number then early-flushed to disk. It is dirtied again and
11134+ flush is called again. Concurrently, that block is deleted, and the de-allocation of
11135+ its block number does not need to be deferred, since it is not part of the preserve set
11136+ (i.e., it didn't exist before the transaction). I think there may be a race condition
11137+ where flush writes the dirty, created block after the non-deferred deallocated block
11138+ number is re-allocated, making it possible to write deleted data on top of non-deleted
11139+   data. It's just a theory, but it needs to be thought out. */
11140+/* F. bio_alloc() failure is not handled gracefully. */
11141+/* G. Unallocated children. */
11142+/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
11143+/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11144+
11145+/* JNODE_FLUSH: MAIN ENTRY POINT */
11146+/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
11147+   neighborhood is named "slum"). Jnode_flush() is called when reiser4 has to write dirty
11148+   blocks to disk; this happens when the Linux VM decides to reduce the number of dirty pages or as
11149+ a part of transaction commit.
11150+
11151+ Our objective here is to prep and flush the slum the jnode belongs to. We want to
11152+ squish the slum together, and allocate the nodes in it as we squish because allocation
11153+ of children affects squishing of parents.
11154+
11155+ The "argument" @node tells flush where to start. From there, flush finds the left edge
11156+ of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a
11157+   "better place" to start squalloc, we first perform a flush_scan.
11158+
11159+ Flush-scanning may be performed in both left and right directions, but for different
11160+ purposes. When scanning to the left, we are searching for a node that precedes a
11161+ sequence of parent-first-ordered nodes which we will then flush in parent-first order.
11162+ During flush-scanning, we also take the opportunity to count the number of consecutive
11163+ leaf nodes. If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
11164+ make a decision to reallocate leaf nodes (thus favoring write-optimization).
11165+
11166+ Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
11167+ also be dirty nodes to the right of the argument. If the scan-left operation does not
11168+ count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
11169+   operation to see whether there are, in fact, enough nodes to meet the relocate
11170+ threshold. Each right- and left-scan operation uses a single flush_scan object.
11171+
11172+ After left-scan and possibly right-scan, we prepare a flush_position object with the
11173+ starting flush point or parent coordinate, which was determined using scan-left.
11174+
11175+ Next we call the main flush routine, squalloc, which iterates along the
11176+ leaf level, squeezing and allocating nodes (and placing them into the flush queue).
11177+
11178+ After squalloc returns we take extra steps to ensure that all the children
11179+ of the final twig node are allocated--this involves repeating squalloc
11180+ until we finish at a twig with no unallocated children.
11181+
11182+ Finally, we call flush_empty_queue to submit write-requests to disk. If we encounter
11183+ any above-twig nodes during flush_empty_queue that still have unallocated children, we
11184+ flush_unprep them.
11185+
11186+ Flush treats several "failure" cases as non-failures, essentially causing them to start
11187+ over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
11188+ probably be handled properly rather than restarting, but there are a bunch of cases to
11189+ audit.
11190+*/
11191+
11192+static int
11193+jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11194+ flush_queue_t * fq, int flags)
11195+{
11196+ long ret = 0;
11197+ flush_scan *right_scan;
11198+ flush_scan *left_scan;
11199+ flush_pos_t *flush_pos;
11200+ int todo;
11201+ struct super_block *sb;
11202+ reiser4_super_info_data *sbinfo;
11203+ jnode *leftmost_in_slum = NULL;
11204+
11205+ assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11206+ assert("nikita-3022", reiser4_schedulable());
11207+
11208+ assert("nikita-3185",
11209+ get_current_super_private()->delete_mutex_owner != current);
11210+
11211+ /* allocate right_scan, left_scan and flush_pos */
11212+ right_scan =
11213+ kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
11214+ reiser4_ctx_gfp_mask_get());
11215+ if (right_scan == NULL)
11216+ return RETERR(-ENOMEM);
11217+ left_scan = right_scan + 1;
11218+ flush_pos = (flush_pos_t *) (left_scan + 1);
11219+
11220+ sb = reiser4_get_current_sb();
11221+ sbinfo = get_super_private(sb);
11222+
11223+ /* Flush-concurrency debug code */
11224+#if REISER4_DEBUG
11225+ atomic_inc(&flush_cnt);
11226+#endif
11227+
11228+ reiser4_enter_flush(sb);
11229+
11230+ /* Initialize a flush position. */
11231+ pos_init(flush_pos);
11232+
11233+ flush_pos->nr_written = nr_written;
11234+ flush_pos->fq = fq;
11235+ flush_pos->flags = flags;
11236+ flush_pos->nr_to_write = nr_to_write;
11237+
11238+ scan_init(right_scan);
11239+ scan_init(left_scan);
11240+
11241+ /* First scan left and remember the leftmost scan position. If the leftmost
11242+ position is unformatted we remember its parent_coord. We scan until counting
11243+ FLUSH_SCAN_MAXNODES.
11244+
11245+ If starting @node is unformatted, at the beginning of left scan its
11246+ parent (twig level node, containing extent item) will be long term
11247+ locked and lock handle will be stored in the
11248+ @right_scan->parent_lock. This lock is used to start the rightward
11249+ scan without redoing the tree traversal (necessary to find parent)
11250+ and, hence, is kept during leftward scan. As a result, we have to
11251+ use try-lock when taking long term locks during the leftward scan.
11252+ */
11253+ ret = scan_left(left_scan, right_scan,
11254+ node, sbinfo->flush.scan_maxnodes);
11255+ if (ret != 0)
11256+ goto failed;
11257+
11258+ leftmost_in_slum = jref(left_scan->node);
11259+ scan_done(left_scan);
11260+
11261+ /* Then possibly go right to decide if we will use a policy of relocating leaves.
11262+ This is only done if we did not scan past (and count) enough nodes during the
11263+ leftward scan. If we do scan right, we only care to go far enough to establish
11264+ that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The
11265+ scan limit is the difference between left_scan.count and the threshold. */
11266+
11267+ todo = sbinfo->flush.relocate_threshold - left_scan->count;
11268+ /* scan right is inherently deadlock prone, because we are
11269+ * (potentially) holding a lock on the twig node at this moment.
11270+	 * FIXME: this comment is incorrect: the lock is not held */
11271+ if (todo > 0) {
11272+ ret = scan_right(right_scan, node, (unsigned)todo);
11273+ if (ret != 0)
11274+ goto failed;
11275+ }
11276+
11277+ /* Only the right-scan count is needed, release any rightward locks right away. */
11278+ scan_done(right_scan);
11279+
11280+ /* ... and the answer is: we should relocate leaf nodes if at least
11281+ FLUSH_RELOCATE_THRESHOLD nodes were found. */
11282+ flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11283+ (left_scan->count + right_scan->count >=
11284+ sbinfo->flush.relocate_threshold);
11285+
11286+	/* Funny business here. We set the 'point' in the flush_position prior to
11287+ starting squalloc regardless of whether the first point is
11288+ formatted or unformatted. Without this there would be an invariant, in the
11289+ rest of the code, that if the flush_position is unformatted then
11290+ flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
11291+ and if the flush_position is formatted then flush_position->point is non-NULL
11292+ and no parent info is set.
11293+
11294+ This seems lazy, but it makes the initial calls to reverse_relocate_test
11295+	   (which ask "is pos->point the leftmost child of its parent?") much easier
11296+ because we know the first child already. Nothing is broken by this, but the
11297+ reasoning is subtle. Holding an extra reference on a jnode during flush can
11298+ cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11299+ removed from sibling lists until they have zero reference count. Flush would
11300+	   never observe a HEARD_BANSHEE node on the left edge of flush; nodes are only
11301+ deleted to the right. So if nothing is broken, why fix it?
11302+
11303+	   NOTE-NIKITA actually, flush can meet a HEARD_BANSHEE node at any
11304+	   point and at any moment, because of concurrent file system
11305+	   activity (for example, truncate). */
11306+
11307+ /* Check jnode state after flush_scan completed. Having a lock on this
11308+ node or its parent (in case of unformatted) helps us in case of
11309+ concurrent flushing. */
11310+ if (jnode_check_flushprepped(leftmost_in_slum)
11311+ && !jnode_convertible(leftmost_in_slum)) {
11312+ ret = 0;
11313+ goto failed;
11314+ }
11315+
11316+ /* Now setup flush_pos using scan_left's endpoint. */
11317+ ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11318+ if (ret)
11319+ goto failed;
11320+
11321+ if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11322+ && node_is_empty(flush_pos->coord.node)) {
11323+ znode *empty = flush_pos->coord.node;
11324+
11325+ assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11326+ ret = delete_empty_node(empty);
11327+ goto failed;
11328+ }
11329+
11330+ if (jnode_check_flushprepped(leftmost_in_slum)
11331+ && !jnode_convertible(leftmost_in_slum)) {
11332+ ret = 0;
11333+ goto failed;
11334+ }
11335+
11336+ /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed */
11337+ ret = alloc_pos_and_ancestors(flush_pos);
11338+ if (ret)
11339+ goto failed;
11340+
11341+ /* Do the main rightward-bottom-up squeeze and allocate loop. */
11342+ ret = squalloc(flush_pos);
11343+ pos_stop(flush_pos);
11344+ if (ret)
11345+ goto failed;
11346+
11347+ /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
11348+ First, the pos_stop() and pos_valid() routines should be modified
11349+ so that pos_stop() sets a flush_position->stop flag to 1 without
11350+ releasing the current position immediately--instead release it in
11351+ pos_done(). This is a better implementation than the current one anyway.
11352+
11353+	   It is not clear which fields of the flush_position should be released,
11354+	   but at the very least the parent_lock, parent_coord, and parent_load should
11355+	   remain held because they hold the last twig when pos_stop() is
11356+	   called.
11357+
11358+ When we reach this point in the code, if the parent_coord is set to after the
11359+ last item then we know that flush reached the end of a twig (and according to
11360+ the new flush queueing design, we will return now). If parent_coord is not
11361+ past the last item, we should check if the current twig has any unallocated
11362+ children to the right (we are not concerned with unallocated children to the
11363+ left--in that case the twig itself should not have been allocated). If the
11364+ twig has unallocated children to the right, set the parent_coord to that
11365+ position and then repeat the call to squalloc.
11366+
11367+ Testing for unallocated children may be defined in two ways: if any internal
11368+ item has a fake block number, it is unallocated; if any extent item is
11369+ unallocated then all of its children are unallocated. But there is a more
11370+ aggressive approach: if there are any dirty children of the twig to the right
11371+ of the current position, we may wish to relocate those nodes now. Checking for
11372+ potential relocation is more expensive as it requires knowing whether there are
11373+ any dirty children that are not unallocated. The extent_needs_allocation
11374+ should be used after setting the correct preceder.
11375+
11376+ When we reach the end of a twig at this point in the code, if the flush can
11377+ continue (when the queue is ready) it will need some information on the future
11378+ starting point. That should be stored away in the flush_handle using a seal, I
11379+ believe. Holding a jref() on the future starting point may break other code
11380+ that deletes that node.
11381+ */
11382+
11383+ /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
11384+ above the twig level. If the VM calls flush above the twig level, do nothing
11385+ and return (but figure out why this happens). The txnmgr should be modified to
11386+ only flush its leaf-level dirty list. This will do all the necessary squeeze
11387+ and allocate steps but leave unallocated branches and possibly unallocated
11388+ twigs (when the twig's leftmost child is not dirty). After flushing the leaf
11389+ level, the remaining unallocated nodes should be given write-optimized
11390+ locations. (Possibly, the remaining unallocated twigs should be allocated just
11391+ before their leftmost child.)
11392+ */
11393+
11394+ /* Any failure reaches this point. */
11395+ failed:
11396+
11397+ switch (ret) {
11398+ case -E_REPEAT:
11399+ case -EINVAL:
11400+ case -E_DEADLOCK:
11401+ case -E_NO_NEIGHBOR:
11402+ case -ENOENT:
11403+ /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
11404+ in each case. They already are handled in many cases. */
11405+ /* Something bad happened, but difficult to avoid... Try again! */
11406+ ret = 0;
11407+ }
11408+
11409+ if (leftmost_in_slum)
11410+ jput(leftmost_in_slum);
11411+
11412+ pos_done(flush_pos);
11413+ scan_done(left_scan);
11414+ scan_done(right_scan);
11415+ kfree(right_scan);
11416+
11417+ ON_DEBUG(atomic_dec(&flush_cnt));
11418+
11419+ reiser4_leave_flush(sb);
11420+
11421+ return ret;
11422+}
11423+
11424+/* The reiser4 flush subsystem can be switched into "rapid flush mode", which
11425+ * means that the flusher should submit all prepped nodes immediately, without
11426+ * keeping them in flush queues for a long time. The reason for rapid flush
11427+ * mode is to free memory as fast as possible. */
11428+
11429+#if REISER4_USE_RAPID_FLUSH
11430+
11431+/**
11432+ * submit all prepped nodes if rapid flush mode is set,
11433+ * turn rapid flush mode off.
11434+ */
11435+
11436+static int rapid_flush(flush_pos_t * pos)
11437+{
11438+ if (!wbq_available())
11439+ return 0;
11440+
11441+ return write_prepped_nodes(pos);
11442+}
11443+
11444+#else
11445+
11446+#define rapid_flush(pos) (0)
11447+
11448+#endif /* REISER4_USE_RAPID_FLUSH */
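
/* Illustration: a minimal standalone sketch of the compile-time stub idiom
 * used above for rapid_flush(): when the feature is configured out, a macro
 * that evaluates to 0 keeps every call site free of #ifdefs. USE_FAST_PATH
 * and fast_path() are made-up names, not reiser4 interfaces. */
#include <stdio.h>

#define USE_FAST_PATH 1	/* flip to 0 to compile the stub instead */

#if USE_FAST_PATH
static int fast_path(int work)
{
	/* pretend to do the work and report how much was done */
	return work > 0 ? work : 0;
}
#else
/* stub: "no work done"; call sites stay unchanged */
#define fast_path(work) (0)
#endif

int main(void)
{
	/* the caller looks identical whichever variant was compiled */
	printf("fast_path did %d units of work\n", fast_path(3));
	return 0;
}
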
11449+
11450+static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom,
11451+ flush_queue_t *fq, int *nr_queued,
11452+ int flags)
11453+{
11454+ jnode * node;
11455+
11456+ if (start != NULL) {
11457+ spin_lock_jnode(start);
11458+ if (!jnode_is_flushprepped(start)) {
11459+ assert("zam-1056", start->atom == atom);
11460+ node = start;
11461+ goto enter;
11462+ }
11463+ spin_unlock_jnode(start);
11464+ }
11465+ /*
11466+	 * In this loop we process all nodes that were already prepped (RELOC or OVRWR)
11467+	 * and then dirtied again. The atom spin lock is not released until all dirty
11468+	 * nodes are processed or a not-yet-prepped node is found in the atom's dirty lists.
11469+ */
11470+ while ((node = find_first_dirty_jnode(atom, flags))) {
11471+ spin_lock_jnode(node);
11472+ enter:
11473+ assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11474+ assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11475+
11476+ if (JF_ISSET(node, JNODE_WRITEBACK)) {
11477+ /* move node to the end of atom's writeback list */
11478+ list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11479+
11480+ /*
11481+			 * the jnode is not necessarily on the dirty list: if it was
11482+			 * dirtied while on a flush queue, it does not get moved to the dirty list
11483+ */
11484+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11485+ WB_LIST, 1));
11486+
11487+ } else if (jnode_is_znode(node)
11488+ && znode_above_root(JZNODE(node))) {
11489+ /*
11490+ * A special case for znode-above-root. The above-root (fake)
11491+ * znode is captured and dirtied when the tree height changes or
11492+ * when the root node is relocated. This causes atoms to fuse so
11493+ * that changes at the root are serialized. However, this node is
11494+ * never flushed. This special case used to be in lock.c to
11495+ * prevent the above-root node from ever being captured, but now
11496+ * that it is captured we simply prevent it from flushing. The
11497+ * log-writer code relies on this to properly log superblock
11498+ * modifications of the tree height.
11499+ */
11500+ jnode_make_wander_nolock(node);
11501+ } else if (JF_ISSET(node, JNODE_RELOC)) {
11502+ queue_jnode(fq, node);
11503+ ++(*nr_queued);
11504+ } else
11505+ break;
11506+
11507+ spin_unlock_jnode(node);
11508+ }
11509+ return node;
11510+}
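
/* Illustration: find_flush_start_jnode() above jumps into the middle of its
 * scan loop ("goto enter") so that an explicitly supplied start node is
 * handled by the same per-node logic as nodes found by the generic search.
 * A standalone sketch of that control-flow idiom, with hypothetical names
 * and no reiser4 types: */
#include <stdio.h>

static int next_candidate(int prev)
{
	/* stand-in for find_first_dirty_jnode(): candidates 1..5, then none */
	return prev < 5 ? prev + 1 : 0;
}

static int scan(int start)
{
	int node = 0;

	if (start != 0) {
		node = start;
		goto enter;	/* skip the search for the first iteration */
	}
	while ((node = next_candidate(node))) {
	enter:
		printf("processing %d\n", node);
		if (node == 4)	/* stand-in for "not-yet-prepped node found" */
			return node;
	}
	return 0;
}

int main(void)
{
	scan(3);	/* handles 3 directly, then continues with 4 */
	return 0;
}
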
11511+
11512+/* Flush some nodes of the current atom, usually a slum. Return -E_REPEAT if there are
11513+ * more nodes to flush; return 0 if the atom's dirty lists are empty, keeping the current
11514+ * atom locked; return other errors as they are. */
11515+int
11516+flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11517+ txn_atom ** atom, jnode *start)
11518+{
11519+ reiser4_super_info_data *sinfo = get_current_super_private();
11520+ flush_queue_t *fq = NULL;
11521+ jnode *node;
11522+ int nr_queued;
11523+ int ret;
11524+
11525+ assert("zam-889", atom != NULL && *atom != NULL);
11526+ assert_spin_locked(&((*atom)->alock));
11527+ assert("zam-892", get_current_context()->trans->atom == *atom);
11528+
11529+ nr_to_write = LONG_MAX;
11530+ while (1) {
11531+ ret = reiser4_fq_by_atom(*atom, &fq);
11532+ if (ret != -E_REPEAT)
11533+ break;
11534+ *atom = get_current_atom_locked();
11535+ }
11536+ if (ret)
11537+ return ret;
11538+
11539+ assert_spin_locked(&((*atom)->alock));
11540+
11541+ /* parallel flushers limit */
11542+ if (sinfo->tmgr.atom_max_flushers != 0) {
11543+ while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
11544+			/* A reiser4_atom_send_event() call is inside
11545+ reiser4_fq_put_nolock() which is called when flush is
11546+ finished and nr_flushers is decremented. */
11547+ reiser4_atom_wait_event(*atom);
11548+ *atom = get_current_atom_locked();
11549+ }
11550+ }
11551+
11552+	/* count ourselves as a flusher */
11553+ (*atom)->nr_flushers++;
11554+
11555+ writeout_mode_enable();
11556+
11557+ nr_queued = 0;
11558+ node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
11559+
11560+ if (node == NULL) {
11561+ if (nr_queued == 0) {
11562+ (*atom)->nr_flushers--;
11563+ reiser4_fq_put_nolock(fq);
11564+ reiser4_atom_send_event(*atom);
11565+ /* current atom remains locked */
11566+ writeout_mode_disable();
11567+ return 0;
11568+ }
11569+ spin_unlock_atom(*atom);
11570+ } else {
11571+ jref(node);
11572+ BUG_ON((*atom)->super != node->tree->super);
11573+ spin_unlock_atom(*atom);
11574+ spin_unlock_jnode(node);
11575+ BUG_ON(nr_to_write == 0);
11576+ ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
11577+ jput(node);
11578+ }
11579+
11580+ ret =
11581+ reiser4_write_fq(fq, nr_submitted,
11582+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11583+
11584+ *atom = get_current_atom_locked();
11585+ (*atom)->nr_flushers--;
11586+ reiser4_fq_put_nolock(fq);
11587+ reiser4_atom_send_event(*atom);
11588+ spin_unlock_atom(*atom);
11589+
11590+ writeout_mode_disable();
11591+
11592+ if (ret == 0)
11593+ ret = -E_REPEAT;
11594+
11595+ return ret;
11596+}
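
/* Illustration: the call-again protocol of flush_current_atom() reduced to a
 * standalone sketch. -E_REPEAT means "progress was made, call me again"; 0
 * means "nothing left to flush". The E_REPEAT value and all names below are
 * illustrative only, not the reiser4 definitions. */
#include <stdio.h>

#define E_REPEAT 1024	/* hypothetical private error code */

static int pending = 3;	/* stand-in for the atom's dirty nodes */

static int flush_some(void)
{
	if (pending == 0)
		return 0;	/* dirty lists empty */
	pending--;
	return -E_REPEAT;	/* flushed a batch; more may remain */
}

int main(void)
{
	int ret;

	/* the caller loops until the repeat code stops coming back */
	while ((ret = flush_some()) == -E_REPEAT)
		printf("flushed a batch, %d left\n", pending);
	return ret;	/* 0 on success; other negatives would be real errors */
}
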
11597+
11598+/* REVERSE PARENT-FIRST RELOCATION POLICIES */
11599+
11600+/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
11601+ reverse parent-first relocate context. Here all we know is the preceder and the block
11602+ number. Since we are going in reverse, the preceder may still be relocated as well, so
11603+ we can't ask the block allocator "is there a closer block available to relocate?" here.
11604+ In the _forward_ parent-first relocate context (not here) we actually call the block
11605+ allocator to try and find a closer location. */
11606+static int
11607+reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11608+ const reiser4_block_nr * nblk)
11609+{
11610+ reiser4_block_nr dist;
11611+
11612+ assert("jmacd-7710", *pblk != 0 && *nblk != 0);
11613+ assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
11614+ assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
11615+
11616+ /* Distance is the absolute value. */
11617+ dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
11618+
11619+ /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder
11620+ block, do not relocate. */
11621+ if (dist <= get_current_super_private()->flush.relocate_distance) {
11622+ return 0;
11623+ }
11624+
11625+ return 1;
11626+}
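
/* Illustration: a standalone worked example of the distance test in
 * reverse_relocate_if_close_enough() above. With a relocate distance of 64
 * (a value chosen purely for illustration), a node at block 1100 whose
 * preceder is at block 1000 is 100 blocks away and would be relocated, while
 * a node at block 1040 would be left in place. */
#include <stdio.h>

typedef unsigned long long blocknr_t;

/* mirrors the test above: 1 = relocate, 0 = leave in place */
static int should_relocate(blocknr_t pblk, blocknr_t nblk,
			   blocknr_t relocate_distance)
{
	/* distance is the absolute value of the difference */
	blocknr_t dist = (pblk > nblk) ? (pblk - nblk) : (nblk - pblk);

	return dist > relocate_distance;
}

int main(void)
{
	printf("%d\n", should_relocate(1000, 1100, 64));	/* prints 1 */
	printf("%d\n", should_relocate(1000, 1040, 64));	/* prints 0 */
	return 0;
}
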
11627+
11628+/* This function is a predicate that tests for relocation. Always called in the
11629+ reverse-parent-first context, when we are asking whether the current node should be
11630+ relocated in order to expand the flush by dirtying the parent level (and thus
11631+ proceeding to flush that level). When traversing in the forward parent-first direction
11632+ (not here), relocation decisions are handled in two places: allocate_znode() and
11633+ extent_needs_allocation(). */
11634+static int
11635+reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11636+ flush_pos_t * pos)
11637+{
11638+ reiser4_block_nr pblk = 0;
11639+ reiser4_block_nr nblk = 0;
11640+
11641+ assert("jmacd-8989", !jnode_is_root(node));
11642+
11643+ /*
11644+	 * This function is called only from
11645+	 * reverse_relocate_check_dirty_parent() and only if the parent
11646+	 * node is clean. This implies that the parent has a real (i.e., not
11647+	 * fake) block number, and so does the child, because otherwise the
11648+ * parent would be dirty.
11649+ */
11650+
11651+ /* New nodes are treated as if they are being relocated. */
11652+ if (JF_ISSET (node, JNODE_CREATED) ||
11653+ (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
11654+ return 1;
11655+ }
11656+
11657+ /* Find the preceder. FIXME(B): When the child is an unformatted, previously
11658+ existing node, the coord may be leftmost even though the child is not the
11659+ parent-first preceder of the parent. If the first dirty node appears somewhere
11660+ in the middle of the first extent unit, this preceder calculation is wrong.
11661+ Needs more logic in here. */
11662+ if (coord_is_leftmost_unit(parent_coord)) {
11663+ pblk = *znode_get_block(parent_coord->node);
11664+ } else {
11665+ pblk = pos->preceder.blk;
11666+ }
11667+ check_preceder(pblk);
11668+
11669+ /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
11670+ if (pblk == 0) {
11671+ return 1;
11672+ }
11673+
11674+ nblk = *jnode_get_block(node);
11675+
11676+ if (reiser4_blocknr_is_fake(&nblk))
11677+ /* child is unallocated, mark parent dirty */
11678+ return 1;
11679+
11680+ return reverse_relocate_if_close_enough(&pblk, &nblk);
11681+}
11682+
11683+/* This function calls reverse_relocate_test to make a reverse-parent-first
11684+ relocation decision and then, if yes, it marks the parent dirty. */
11685+static int
11686+reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord,
11687+ flush_pos_t * pos)
11688+{
11689+ int ret;
11690+
11691+ if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
11692+
11693+ ret = reverse_relocate_test(node, parent_coord, pos);
11694+ if (ret < 0) {
11695+ return ret;
11696+ }
11697+
11698+ /* FIXME-ZAM
11699+ if parent is already relocated - we do not want to grab space, right? */
11700+ if (ret == 1) {
11701+ int grabbed;
11702+
11703+ grabbed = get_current_context()->grabbed_blocks;
11704+ if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
11705+ 0)
11706+ reiser4_panic("umka-1250",
11707+ "No space left during flush.");
11708+
11709+ assert("jmacd-18923",
11710+ znode_is_write_locked(parent_coord->node));
11711+ znode_make_dirty(parent_coord->node);
11712+ grabbed2free_mark(grabbed);
11713+ }
11714+ }
11715+
11716+ return 0;
11717+}
11718+
11719+/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
11720+ PARENT-FIRST LOOP BEGINS) */
11721+
11722+/* Get the leftmost child for given coord. */
11723+static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child)
11724+{
11725+ int ret;
11726+
11727+ ret = item_utmost_child(coord, LEFT_SIDE, child);
11728+
11729+ if (ret)
11730+ return ret;
11731+
11732+ if (IS_ERR(*child))
11733+ return PTR_ERR(*child);
11734+
11735+ return 0;
11736+}
11737+
11738+/* This step occurs after the left- and right-scans are completed, before starting the
11739+ forward parent-first traversal. Here we attempt to allocate ancestors of the starting
11740+ flush point, which means continuing in the reverse parent-first direction to the
11741+ parent, grandparent, and so on (as long as the child is a leftmost child). This
11742+ routine calls a recursive process, alloc_one_ancestor, which does the real work,
11743+ except there is special-case handling here for the first ancestor, which may be a twig.
11744+ At each level (here and alloc_one_ancestor), we check for relocation and then, if
11745+ the child is a leftmost child, repeat at the next level. On the way back down (the
11746+ recursion), we allocate the ancestors in parent-first order. */
11747+static int alloc_pos_and_ancestors(flush_pos_t * pos)
11748+{
11749+ int ret = 0;
11750+ lock_handle plock;
11751+ load_count pload;
11752+ coord_t pcoord;
11753+
11754+ if (znode_check_flushprepped(pos->lock.node))
11755+ return 0;
11756+
11757+ coord_init_invalid(&pcoord, NULL);
11758+ init_lh(&plock);
11759+ init_load_count(&pload);
11760+
11761+ if (pos->state == POS_ON_EPOINT) {
11762+ /* a special case for pos on twig level, where we already have
11763+ a lock on parent node. */
11764+ /* The parent may not be dirty, in which case we should decide
11765+ whether to relocate the child now. If decision is made to
11766+ relocate the child, the parent is marked dirty. */
11767+ ret =
11768+ reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
11769+ pos);
11770+ if (ret)
11771+ goto exit;
11772+
11773+ /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
11774+ is leftmost) and the leaf/child, so recursion is not needed.
11775+ Levels above the twig will be allocated for
11776+ write-optimization before the transaction commits. */
11777+
11778+ /* Do the recursive step, allocating zero or more of our
11779+ * ancestors. */
11780+ ret = alloc_one_ancestor(&pos->coord, pos);
11781+
11782+ } else {
11783+ if (!znode_is_root(pos->lock.node)) {
11784+ /* all formatted nodes except tree root */
11785+ ret =
11786+ reiser4_get_parent(&plock, pos->lock.node,
11787+ ZNODE_WRITE_LOCK);
11788+ if (ret)
11789+ goto exit;
11790+
11791+ ret = incr_load_count_znode(&pload, plock.node);
11792+ if (ret)
11793+ goto exit;
11794+
11795+ ret =
11796+ find_child_ptr(plock.node, pos->lock.node, &pcoord);
11797+ if (ret)
11798+ goto exit;
11799+
11800+ ret =
11801+ reverse_relocate_check_dirty_parent(ZJNODE
11802+ (pos->lock.
11803+ node), &pcoord,
11804+ pos);
11805+ if (ret)
11806+ goto exit;
11807+
11808+ ret = alloc_one_ancestor(&pcoord, pos);
11809+ if (ret)
11810+ goto exit;
11811+ }
11812+
11813+ ret = allocate_znode(pos->lock.node, &pcoord, pos);
11814+ }
11815+ exit:
11816+ done_load_count(&pload);
11817+ done_lh(&plock);
11818+ return ret;
11819+}
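
/* Illustration: the ascend-then-allocate shape of alloc_pos_and_ancestors()
 * and alloc_one_ancestor() in toy form: climb while the current node is its
 * parent's leftmost child, then allocate on the way back down the recursion,
 * which yields parent-first order. The array-encoded tree and all names are
 * hypothetical, not reiser4 structures. */
#include <stdio.h>

struct tnode {
	int parent;	/* index of the parent node, -1 for the root */
	int leftmost;	/* 1 if this node is the leftmost child of its parent */
};

/* toy tree: 0 is the root, 1 is its leftmost child, 2 is 1's leftmost child */
static struct tnode tree[] = {
	{ -1, 0 }, { 0, 1 }, { 1, 1 },
};

static void alloc_ancestors(int n)
{
	/* recurse first: ancestors get allocated before descendants */
	if (tree[n].parent != -1 && tree[n].leftmost)
		alloc_ancestors(tree[n].parent);
	printf("allocate node %d\n", n);	/* done on the unwind */
}

int main(void)
{
	alloc_ancestors(2);	/* prints 0, 1, 2 — parent-first order */
	return 0;
}
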
11820+
11821+/* This is the recursive step described in alloc_pos_and_ancestors, above. Ignoring the
11822+ call to set_preceder, which is the next function described, this checks if the
11823+ child is a leftmost child and returns if it is not. If the child is a leftmost child
11824+ it checks for relocation, possibly dirtying the parent. Then it performs the recursive
11825+ step. */
11826+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
11827+{
11828+ int ret = 0;
11829+ lock_handle alock;
11830+ load_count aload;
11831+ coord_t acoord;
11832+
11833+ /* As we ascend at the left-edge of the region to flush, take this opportunity at
11834+ the twig level to find our parent-first preceder unless we have already set
11835+ it. */
11836+ if (pos->preceder.blk == 0) {
11837+ ret = set_preceder(coord, pos);
11838+ if (ret != 0)
11839+ return ret;
11840+ }
11841+
11842+ /* If the ancestor is clean or already allocated, or if the child is not a
11843+ leftmost child, stop going up, even leaving coord->node not flushprepped. */
11844+ if (znode_check_flushprepped(coord->node)
11845+ || !coord_is_leftmost_unit(coord))
11846+ return 0;
11847+
11848+ init_lh(&alock);
11849+ init_load_count(&aload);
11850+ coord_init_invalid(&acoord, NULL);
11851+
11852+ /* Only ascend to the next level if it is a leftmost child, but write-lock the
11853+ parent in case we will relocate the child. */
11854+ if (!znode_is_root(coord->node)) {
11855+
11856+ ret =
11857+ jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
11858+ &alock, &aload, ZNODE_WRITE_LOCK,
11859+ 0);
11860+ if (ret != 0) {
11861+ /* FIXME(C): check EINVAL, E_DEADLOCK */
11862+ goto exit;
11863+ }
11864+
11865+ ret =
11866+ reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
11867+ &acoord, pos);
11868+ if (ret != 0) {
11869+ goto exit;
11870+ }
11871+
11872+ /* Recursive call. */
11873+ if (!znode_check_flushprepped(acoord.node)) {
11874+ ret = alloc_one_ancestor(&acoord, pos);
11875+ if (ret)
11876+ goto exit;
11877+ }
11878+ }
11879+
11880+ /* Note: we call allocate with the parent write-locked (except at the root) in
11881+ case we relocate the child, in which case it will modify the parent during this
11882+ call. */
11883+ ret = allocate_znode(coord->node, &acoord, pos);
11884+
11885+ exit:
11886+ done_load_count(&aload);
11887+ done_lh(&alock);
11888+ return ret;
11889+}
11890+
11891+/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
11892+ a call to this function at the twig level. During alloc_pos_and_ancestors we may ask:
11893+ should this node be relocated (in reverse parent-first context)? We repeat this
11894+ process as long as the child is the leftmost child, eventually reaching an ancestor of
11895+ the flush point that is not a leftmost child. The preceder of that ancestor, which is
11896+ not a leftmost child, is actually on the leaf level: it is the left neighbor of the
11897+ flush point, and the preceder of that block is the rightmost child of
11898+ the twig on the left. So, when alloc_pos_and_ancestors passes upward through the twig
11899+ level, it stops momentarily to remember the block of the rightmost child of the twig on
11900+ the left and sets it to the flush_position's preceder_hint.
11901+
11902+ There is one other place where we may set the flush_position's preceder hint, which is
11903+ during scan-left.
11904+*/
11905+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos)
11906+{
11907+ int ret;
11908+ coord_t coord;
11909+ lock_handle left_lock;
11910+ load_count left_load;
11911+
11912+ coord_dup(&coord, coord_in);
11913+
11914+ init_lh(&left_lock);
11915+ init_load_count(&left_load);
11916+
11917+ /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
11918+ coord_is_leftmost_unit is not the right test if the unformatted child is in the
11919+ middle of the first extent unit. */
11920+ if (!coord_is_leftmost_unit(&coord)) {
11921+ coord_prev_unit(&coord);
11922+ } else {
11923+ ret =
11924+ reiser4_get_left_neighbor(&left_lock, coord.node,
11925+ ZNODE_READ_LOCK, GN_SAME_ATOM);
11926+ if (ret) {
11927+ /* If we fail for any reason it doesn't matter because the
11928+ preceder is only a hint. We are low-priority at this point, so
11929+ this must be the case. */
11930+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
11931+ ret == -ENOENT || ret == -EINVAL
11932+ || ret == -E_DEADLOCK) {
11933+ ret = 0;
11934+ }
11935+ goto exit;
11936+ }
11937+
11938+ ret = incr_load_count_znode(&left_load, left_lock.node);
11939+ if (ret)
11940+ goto exit;
11941+
11942+ coord_init_last_unit(&coord, left_lock.node);
11943+ }
11944+
11945+ ret =
11946+ item_utmost_child_real_block(&coord, RIGHT_SIDE,
11947+ &pos->preceder.blk);
11948+ exit:
11949+ check_preceder(pos->preceder.blk);
11950+ done_load_count(&left_load);
11951+ done_lh(&left_lock);
11952+ return ret;
11953+}
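
/* Illustration: set_preceder() above deliberately swallows a list of "soft"
 * failures because the preceder is only a hint; an unset hint costs layout
 * quality, never correctness. The same pattern in standalone form; the error
 * codes and names are illustrative only. */
#include <stdio.h>

#define E_REPEAT      1024
#define E_NO_NEIGHBOR 1025

static int lookup_hint(int *hint)
{
	(void)hint;
	return -E_NO_NEIGHBOR;	/* pretend the left neighbor was not found */
}

static int set_hint(int *hint)
{
	int ret = lookup_hint(hint);

	switch (ret) {
	case -E_REPEAT:
	case -E_NO_NEIGHBOR:
		ret = 0;	/* soft failure: leave the hint unset, carry on */
	}
	return ret;
}

int main(void)
{
	int hint = 0;

	printf("set_hint -> %d, hint = %d\n", set_hint(&hint), hint);
	return 0;
}
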
11954+
11955+/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
11956+
11957+/* This procedure implements the outer loop of the flush algorithm. To put this in
11958+ context, here is the general list of steps taken by the flush routine as a whole:
11959+
11960+ 1. Scan-left
11961+ 2. Scan-right (maybe)
11962+ 3. Allocate initial flush position and its ancestors
11963+ 4. <handle extents>
11964+ 5. <squeeze and next position and its ancestors to-the-right,
11965+ then update position to-the-right>
11966+ 6. <repeat from #4 until flush is stopped>
11967+
11968+ This procedure implements the loop in steps 4 through 6 in the above listing.
11969+
11970+ Step 4: if the current flush position is an extent item (position on the twig level),
11971+ it allocates the extent (allocate_extent_item_in_place) then shifts to the next
11972+ coordinate. If the next coordinate's leftmost child needs flushprep, we will continue.
11973+ If the next coordinate is an internal item, we descend back to the leaf level,
11974+ otherwise we repeat a step #4 (labeled ALLOC_EXTENTS below). If the "next coordinate"
11975+ brings us past the end of the twig level, then we call
11976+ reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
11977+ step #5 which moves to the right.
11978+
11979+ Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
11980+ tree to allocate any ancestors of the next-right flush position that are not also
11981+ ancestors of the current position. Those ancestors (in top-down order) are the next in
11982+ parent-first order. We squeeze adjacent nodes on the way up until the right node and
11983+ current node share the same parent, then allocate on the way back down. Finally, this
11984+ step sets the flush position to the next-right node. Then repeat steps 4 and 5.
11985+*/
11986+
11987+/* SQUEEZE CODE */
11988+
11989+/* squalloc_right_twig helper function: cut a range of extent items from
11990+   node @to->node, from the beginning up to coord @to. */
11991+static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key,
11992+ znode * left)
11993+{
11994+ coord_t from;
11995+ reiser4_key from_key;
11996+
11997+ coord_init_first_unit(&from, to->node);
11998+ item_key_by_coord(&from, &from_key);
11999+
12000+ return cut_node_content(&from, to, &from_key, to_key, NULL);
12001+}
12002+
12003+/* Copy as many of the leading extents as possible from @right to @left, allocating
12004+ unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
12005+ SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
12006+ internal item it calls shift_one_internal_unit and may then return
12007+ SUBTREE_MOVED. */
12008+static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
12009+{
12010+ int ret = SUBTREE_MOVED;
12011+ coord_t coord; /* used to iterate over items */
12012+ reiser4_key stop_key;
12013+
12014+ assert("jmacd-2008", !node_is_empty(right));
12015+ coord_init_first_unit(&coord, right);
12016+
12017+ /* FIXME: can be optimized to cut once */
12018+ while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
12019+ ON_DEBUG(void *vp);
12020+
12021+ assert("vs-1468", coord_is_leftmost_unit(&coord));
12022+ ON_DEBUG(vp = shift_check_prepare(left, coord.node));
12023+
12024+ /* stop_key is used to find what was copied and what to cut */
12025+ stop_key = *reiser4_min_key();
12026+ ret = squalloc_extent(left, &coord, pos, &stop_key);
12027+ if (ret != SQUEEZE_CONTINUE) {
12028+ ON_DEBUG(kfree(vp));
12029+ break;
12030+ }
12031+ assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
12032+
12033+ /* Helper function to do the cutting. */
12034+ set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
12035+ check_me("vs-1466",
12036+ squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
12037+
12038+ ON_DEBUG(shift_check(vp, left, coord.node));
12039+ }
12040+
12041+ if (node_is_empty(coord.node))
12042+ ret = SQUEEZE_SOURCE_EMPTY;
12043+
12044+ if (ret == SQUEEZE_TARGET_FULL) {
12045+ goto out;
12046+ }
12047+
12048+ if (node_is_empty(right)) {
12049+ /* The whole right node was copied into @left. */
12050+ assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
12051+ goto out;
12052+ }
12053+
12054+ coord_init_first_unit(&coord, right);
12055+
12056+ if (!item_is_internal(&coord)) {
12057+		/* we do not want to squeeze anything else to the left neighbor
12058+		   because the "slum" is over */
12059+ ret = SQUEEZE_TARGET_FULL;
12060+ goto out;
12061+ }
12062+ assert("jmacd-433", item_is_internal(&coord));
12063+
12064+ /* Shift an internal unit. The child must be allocated before shifting any more
12065+ extents, so we stop here. */
12066+ ret = shift_one_internal_unit(left, right);
12067+
12068+ out:
12069+ assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
12070+ || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
12071+
12072+ if (ret == SQUEEZE_TARGET_FULL) {
12073+ /* We submit prepped nodes here and expect that this @left twig
12074+ * will not be modified again during this jnode_flush() call. */
12075+ int ret1;
12076+
12077+ /* NOTE: seems like io is done under long term locks. */
12078+ ret1 = write_prepped_nodes(pos);
12079+ if (ret1 < 0)
12080+ return ret1;
12081+ }
12082+
12083+ return ret;
12084+}
12085+
12086+#if REISER4_DEBUG
12087+static void item_convert_invariant(flush_pos_t * pos)
12088+{
12089+ assert("edward-1225", coord_is_existing_item(&pos->coord));
12090+ if (chaining_data_present(pos)) {
12091+ item_plugin *iplug = item_convert_plug(pos);
12092+
12093+ assert("edward-1000",
12094+ iplug == item_plugin_by_coord(&pos->coord));
12095+ assert("edward-1001", iplug->f.convert != NULL);
12096+ } else
12097+ assert("edward-1226", pos->child == NULL);
12098+}
12099+#else
12100+
12101+#define item_convert_invariant(pos) noop
12102+
12103+#endif
12104+
12105+/* Scan the node's items starting from the first one and apply to each
12106+   item its flush ->convert() method (if any). This method may
12107+   resize or kill the item, so the tree will be changed.
12108+*/
12109+static int convert_node(flush_pos_t * pos, znode * node)
12110+{
12111+ int ret = 0;
12112+ item_plugin *iplug;
12113+
12114+ assert("edward-304", pos != NULL);
12115+ assert("edward-305", pos->child == NULL);
12116+ assert("edward-475", znode_convertible(node));
12117+ assert("edward-669", znode_is_wlocked(node));
12118+ assert("edward-1210", !node_is_empty(node));
12119+
12120+ if (znode_get_level(node) != LEAF_LEVEL)
12121+ /* unsupported */
12122+ goto exit;
12123+
12124+ coord_init_first_unit(&pos->coord, node);
12125+
12126+ while (1) {
12127+ ret = 0;
12128+ coord_set_to_left(&pos->coord);
12129+ item_convert_invariant(pos);
12130+
12131+ iplug = item_plugin_by_coord(&pos->coord);
12132+ assert("edward-844", iplug != NULL);
12133+
12134+ if (iplug->f.convert) {
12135+ ret = iplug->f.convert(pos);
12136+ if (ret)
12137+ goto exit;
12138+ }
12139+ assert("edward-307", pos->child == NULL);
12140+
12141+ if (coord_next_item(&pos->coord)) {
12142+ /* node is over */
12143+
12144+ if (!chaining_data_present(pos))
12145+ /* finished this node */
12146+ break;
12147+ if (should_chain_next_node(pos)) {
12148+ /* go to next node */
12149+ move_chaining_data(pos, 0 /* to next node */ );
12150+ break;
12151+ }
12152+ /* repeat this node */
12153+ move_chaining_data(pos, 1 /* this node */ );
12154+ continue;
12155+ }
12156+		/* The node is not over.
12157+		   Check if there is attached convert data.
12158+		   If so, roll back one item position and repeat
12159+		   on this node.
12160+		 */
12161+ if (chaining_data_present(pos)) {
12162+
12163+ if (iplug != item_plugin_by_coord(&pos->coord))
12164+ set_item_convert_count(pos, 0);
12165+
12166+ ret = coord_prev_item(&pos->coord);
12167+ assert("edward-1003", !ret);
12168+
12169+ move_chaining_data(pos, 1 /* this node */ );
12170+ }
12171+ }
12172+ JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
12173+ znode_make_dirty(node);
12174+ exit:
12175+ assert("edward-1004", !ret);
12176+ return ret;
12177+}
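
/* Illustration: convert_node() above walks a node's items and invokes an
 * optional per-item-plugin ->convert() hook. The dispatch shape, reduced to
 * a standalone sketch; the plugin struct, item names, and shrink() are
 * hypothetical, not reiser4 definitions. */
#include <stdio.h>

struct item;
struct item_ops {
	int (*convert)(struct item *);	/* optional hook, may be NULL */
};

struct item {
	const char *name;
	const struct item_ops *ops;
};

static int shrink(struct item *it)
{
	printf("converting %s\n", it->name);
	return 0;
}

static const struct item_ops plain_ops = { .convert = NULL };
static const struct item_ops ctail_ops = { .convert = shrink };

int main(void)
{
	struct item items[] = {
		{ "extent", &plain_ops },
		{ "ctail",  &ctail_ops },
	};
	unsigned i;

	/* apply each item's convert hook if its plugin provides one */
	for (i = 0; i < sizeof(items) / sizeof(items[0]); i++)
		if (items[i].ops->convert && items[i].ops->convert(&items[i]))
			return 1;	/* abort on error, as convert_node() does */
	return 0;
}
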
12178+
12179+/* Squeeze and allocate the right neighbor. This is called after @left and
12180+ its current children have been squeezed and allocated already. This
12181+   procedure's job is to squeeze items from @right to @left.
12182+
12183+ If at the leaf level, use the shift_everything_left memcpy-optimized
12184+ version of shifting (squeeze_right_leaf).
12185+
12186+ If at the twig level, extents are allocated as they are shifted from @right
12187+ to @left (squalloc_right_twig).
12188+
12189+ At any other level, shift one internal item and return to the caller
12190+ (squalloc_parent_first) so that the shifted-subtree can be processed in
12191+ parent-first order.
12192+
12193+ When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is
12194+ returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
12195+ returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
12196+ is returned.
12197+*/
12198+
12199+static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
12200+ znode * right)
12201+{
12202+ int ret;
12203+
12204+	/* FIXME: it is possible to see an empty hasn't-heard-banshee node in the
12205+	 * tree owing to an error (for example, ENOSPC) in write */
12206+ /* assert("jmacd-9321", !node_is_empty(left)); */
12207+ assert("jmacd-9322", !node_is_empty(right));
12208+ assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12209+
12210+ switch (znode_get_level(left)) {
12211+ case TWIG_LEVEL:
12212+ /* Shift with extent allocating until either an internal item
12213+ is encountered or everything is shifted or no free space
12214+ left in @left */
12215+ ret = squeeze_right_twig(left, right, pos);
12216+ break;
12217+
12218+ default:
12219+ /* All other levels can use shift_everything until we implement per-item
12220+ flush plugins. */
12221+ ret = squeeze_right_non_twig(left, right);
12222+ break;
12223+ }
12224+
12225+ assert("jmacd-2011", (ret < 0 ||
12226+ ret == SQUEEZE_SOURCE_EMPTY
12227+ || ret == SQUEEZE_TARGET_FULL
12228+ || ret == SUBTREE_MOVED));
12229+ return ret;
12230+}
12231+
12232+static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
12233+ znode * right)
12234+{
12235+ int ret;
12236+
12237+ ret = squeeze_right_twig(pos->lock.node, right, pos);
12238+ if (ret < 0)
12239+ return ret;
12240+ if (ret > 0) {
12241+ coord_init_after_last_item(&pos->coord, pos->lock.node);
12242+ return ret;
12243+ }
12244+
12245+ coord_init_last_unit(&pos->coord, pos->lock.node);
12246+ return 0;
12247+}
12248+
12249+/* forward declaration */
12250+static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12251+
12252+/* do a fast check for "same parents" condition before calling
12253+ * squalloc_upper_levels() */
12254+static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
12255+ znode * left,
12256+ znode * right)
12257+{
12258+ if (znode_same_parents(left, right))
12259+ return 0;
12260+
12261+ return squalloc_upper_levels(pos, left, right);
12262+}
12263+
12264+/* Check whether the parent of the given @right node needs to be processed
12265+   ((re)allocated) prior to processing of the child. If @left and @right do not
12266+   share a parent, the parent of @right comes after @left but before
12267+   @right in parent-first order, so we have to (re)allocate it before @right
12268+   gets (re)allocated. */
12269+static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
12270+{
12271+ int ret;
12272+
12273+ lock_handle left_parent_lock;
12274+ lock_handle right_parent_lock;
12275+
12276+ load_count left_parent_load;
12277+ load_count right_parent_load;
12278+
12279+ init_lh(&left_parent_lock);
12280+ init_lh(&right_parent_lock);
12281+
12282+ init_load_count(&left_parent_load);
12283+ init_load_count(&right_parent_load);
12284+
12285+ ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12286+ if (ret)
12287+ goto out;
12288+
12289+ ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12290+ if (ret)
12291+ goto out;
12292+
12293+ /* Check for same parents */
12294+ if (left_parent_lock.node == right_parent_lock.node)
12295+ goto out;
12296+
12297+ if (znode_check_flushprepped(right_parent_lock.node)) {
12298+		/* Keep parent-first order. In that order, the right parent node stands
12299+		   before the @right node. If it is already allocated, we set the
12300+		   preceder (the starting point of the next block search) to its block
12301+		   number; the @right node should be allocated after it.
12302+
12303+		   However, the preceder is set only if the right parent is on the twig level.
12304+		   The explanation is the following: new branch nodes are allocated over
12305+		   already allocated children while the tree grows, so it is difficult to
12306+		   keep the tree ordered; we assume that only leaves and twigs are correctly
12307+		   allocated. So, only twigs are used as a preceder for allocating the
12308+		   rest of the slum. */
12309+ if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12310+ pos->preceder.blk =
12311+ *znode_get_block(right_parent_lock.node);
12312+ check_preceder(pos->preceder.blk);
12313+ }
12314+ goto out;
12315+ }
12316+
12317+ ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12318+ if (ret)
12319+ goto out;
12320+
12321+ ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12322+ if (ret)
12323+ goto out;
12324+
12325+ ret =
12326+ squeeze_right_neighbor(pos, left_parent_lock.node,
12327+ right_parent_lock.node);
12328+	/* We stop on error. We also stop if some items/units were shifted (ret == 0)
12329+	 * and thus @right changed its parent; it means we did not process the
12330+	 * right_parent node prior to processing of @right. Positive return
12331+	 * values say that no items were shifted because of the "empty
12332+	 * source" or "target full" conditions. */
12333+ if (ret <= 0)
12334+ goto out;
12335+
12336+ /* parent(@left) and parent(@right) may have different parents also. We
12337+ * do a recursive call for checking that. */
12338+ ret =
12339+ check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12340+ right_parent_lock.node);
12341+ if (ret)
12342+ goto out;
12343+
12344+ /* allocate znode when going down */
12345+ ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12346+
12347+ out:
12348+ done_load_count(&left_parent_load);
12349+ done_load_count(&right_parent_load);
12350+
12351+ done_lh(&left_parent_lock);
12352+ done_lh(&right_parent_lock);
12353+
12354+ return ret;
12355+}
12356+
12357+/* Check the leftmost child's "flushprepped" status; also returns true if the
12358+ * child node was not found in the cache. */
12359+static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
12360+{
12361+ int ret;
12362+ int prepped;
12363+
12364+ jnode *child;
12365+
12366+ ret = get_leftmost_child_of_unit(coord, &child);
12367+
12368+ if (ret)
12369+ return ret;
12370+
12371+ if (child) {
12372+ prepped = jnode_check_flushprepped(child);
12373+ jput(child);
12374+ } else {
12375+		/* We treat a non-existent child as a node to which slum
12376+		   processing should not continue. A node that is not cached
12377+		   is clean, so it is flushprepped. */
12378+ prepped = 1;
12379+ }
12380+
12381+ return prepped;
12382+}
12383+
12384+/* (re)allocate znode with automated getting parent node */
12385+static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
12386+{
12387+ int ret;
12388+ lock_handle parent_lock;
12389+ load_count parent_load;
12390+ coord_t pcoord;
12391+
12392+ assert("zam-851", znode_is_write_locked(node));
12393+
12394+ init_lh(&parent_lock);
12395+ init_load_count(&parent_load);
12396+
12397+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12398+ if (ret)
12399+ goto out;
12400+
12401+ ret = incr_load_count_znode(&parent_load, parent_lock.node);
12402+ if (ret)
12403+ goto out;
12404+
12405+ ret = find_child_ptr(parent_lock.node, node, &pcoord);
12406+ if (ret)
12407+ goto out;
12408+
12409+ ret = allocate_znode(node, &pcoord, pos);
12410+
12411+ out:
12412+ done_load_count(&parent_load);
12413+ done_lh(&parent_lock);
12414+ return ret;
12415+}
12416+
12417+/* Process formatted nodes until an unformatted node or the rightmost node in
12418+ * the slum is reached. */
12419+static int handle_pos_on_formatted(flush_pos_t * pos)
12420+{
12421+ int ret;
12422+ lock_handle right_lock;
12423+ load_count right_load;
12424+
12425+ init_lh(&right_lock);
12426+ init_load_count(&right_load);
12427+
12428+ if (should_convert_node(pos, pos->lock.node)) {
12429+ ret = convert_node(pos, pos->lock.node);
12430+ if (ret)
12431+ return ret;
12432+ }
12433+
12434+ while (1) {
12435+ int expected;
12436+ expected = should_convert_next_node(pos);
12437+ ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12438+ ZNODE_WRITE_LOCK, !expected, expected);
12439+ if (ret) {
12440+ if (expected)
12441+ warning("edward-1495",
12442+ "Expected neighbor not found (ret = %d). Fsck?",
12443+ ret);
12444+ break;
12445+ }
12446+
12447+ /* we don't prep(allocate) nodes for flushing twice. This can be suboptimal, or it
12448+ * can be optimal. For now we choose to live with the risk that it will
12449+ * be suboptimal because it would be quite complex to code it to be
12450+ * smarter. */
12451+ if (znode_check_flushprepped(right_lock.node)
12452+ && !znode_convertible(right_lock.node)) {
12453+ assert("edward-1005", !should_convert_next_node(pos));
12454+ pos_stop(pos);
12455+ break;
12456+ }
12457+
12458+ ret = incr_load_count_znode(&right_load, right_lock.node);
12459+ if (ret)
12460+ break;
12461+ if (should_convert_node(pos, right_lock.node)) {
12462+ ret = convert_node(pos, right_lock.node);
12463+ if (ret)
12464+ break;
12465+ if (node_is_empty(right_lock.node)) {
12466+ /* node became empty after converting, repeat */
12467+ done_load_count(&right_load);
12468+ done_lh(&right_lock);
12469+ continue;
12470+ }
12471+ }
12472+
12473+ /* squeeze _before_ going upward. */
12474+ ret =
12475+ squeeze_right_neighbor(pos, pos->lock.node,
12476+ right_lock.node);
12477+ if (ret < 0)
12478+ break;
12479+
12480+ if (znode_check_flushprepped(right_lock.node)) {
12481+ if (should_convert_next_node(pos)) {
12482+ /* in spite of flushprepped status of the node,
12483+ its right slum neighbor should be converted */
12484+ assert("edward-953", convert_data(pos));
12485+ assert("edward-954", item_convert_data(pos));
12486+
12487+ if (node_is_empty(right_lock.node)) {
12488+ done_load_count(&right_load);
12489+ done_lh(&right_lock);
12490+ } else
12491+ move_flush_pos(pos, &right_lock,
12492+ &right_load, NULL);
12493+ continue;
12494+ }
12495+ pos_stop(pos);
12496+ break;
12497+ }
12498+
12499+ if (node_is_empty(right_lock.node)) {
12500+ /* repeat if right node was squeezed completely */
12501+ done_load_count(&right_load);
12502+ done_lh(&right_lock);
12503+ continue;
12504+ }
12505+
12506+ /* parent(right_lock.node) has to be processed before
12507+ * (right_lock.node) due to "parent-first" allocation order. */
12508+ ret =
12509+ check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12510+ right_lock.node);
12511+ if (ret)
12512+ break;
12513+ /* (re)allocate _after_ going upward */
12514+ ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12515+ if (ret)
12516+ break;
12517+ if (should_terminate_squalloc(pos)) {
12518+ set_item_convert_count(pos, 0);
12519+ break;
12520+ }
12521+
12522+ /* advance the flush position to the right neighbor */
12523+ move_flush_pos(pos, &right_lock, &right_load, NULL);
12524+
12525+ ret = rapid_flush(pos);
12526+ if (ret)
12527+ break;
12528+ }
12529+ check_convert_info(pos);
12530+ done_load_count(&right_load);
12531+ done_lh(&right_lock);
12532+
12533+	/* This function indicates via pos whether to stop, go to the twig level, or
12534+	 * continue on the current level. */
12535+ return ret;
12536+
12537+}
12538+
12539+/* Process nodes on the leaf level until an unformatted node or the rightmost
12540+ * node in the slum is reached. */
12541+static int handle_pos_on_leaf(flush_pos_t * pos)
12542+{
12543+ int ret;
12544+
12545+ assert("zam-845", pos->state == POS_ON_LEAF);
12546+
12547+ ret = handle_pos_on_formatted(pos);
12548+
12549+ if (ret == -E_NO_NEIGHBOR) {
12550+ /* cannot get right neighbor, go process extents. */
12551+ pos->state = POS_TO_TWIG;
12552+ return 0;
12553+ }
12554+
12555+ return ret;
12556+}
12557+
12558+/* Process slum on level > 1 */
12559+static int handle_pos_on_internal(flush_pos_t * pos)
12560+{
12561+ assert("zam-850", pos->state == POS_ON_INTERNAL);
12562+ return handle_pos_on_formatted(pos);
12563+}
12564+
12565+/* check whether squalloc should stop before processing given extent */
12566+static int squalloc_extent_should_stop(flush_pos_t * pos)
12567+{
12568+ assert("zam-869", item_is_extent(&pos->coord));
12569+
12570+	/* pos->child is the jnode handle_pos_on_extent() should start with,
12571+	 * instead of the first child of the first extent unit. */
12572+ if (pos->child) {
12573+ int prepped;
12574+
12575+ assert("vs-1383", jnode_is_unformatted(pos->child));
12576+ prepped = jnode_check_flushprepped(pos->child);
12577+ pos->pos_in_unit =
12578+ jnode_get_index(pos->child) -
12579+ extent_unit_index(&pos->coord);
12580+ assert("vs-1470",
12581+ pos->pos_in_unit < extent_unit_width(&pos->coord));
12582+ assert("nikita-3434",
12583+ ergo(extent_is_unallocated(&pos->coord),
12584+ pos->pos_in_unit == 0));
12585+ jput(pos->child);
12586+ pos->child = NULL;
12587+
12588+ return prepped;
12589+ }
12590+
12591+ pos->pos_in_unit = 0;
12592+ if (extent_is_unallocated(&pos->coord))
12593+ return 0;
12594+
12595+ return leftmost_child_of_unit_check_flushprepped(&pos->coord);
12596+}
12597+
12598+/* Handle the case when the regular reiser4 tree (znodes connected to their
12599+ * neighbors by sibling pointers) is interrupted on the leaf level by one or more
12600+ * unformatted nodes. By holding a lock on the twig level and using extent code
12601+ * routines to process unformatted nodes, we swim around an irregular part of
12602+ * the reiser4 tree. */
12603+static int handle_pos_on_twig(flush_pos_t * pos)
12604+{
12605+ int ret;
12606+
12607+ assert("zam-844", pos->state == POS_ON_EPOINT);
12608+ assert("zam-843", item_is_extent(&pos->coord));
12609+
12610+	/* We decide whether to continue slum processing with the current extent
12611+	   unit: if the leftmost child of the current extent unit is flushprepped
12612+	   (i.e. clean or already processed by flush) we stop squalloc(). There
12613+	   is a fast check for unallocated extents, which we assume contain only
12614+	   not-yet-flushprepped nodes. */
12615+	/* FIXME: Here we implement a simple check; we only look at the
12616+	   leftmost child. */
12617+ ret = squalloc_extent_should_stop(pos);
12618+ if (ret != 0) {
12619+ pos_stop(pos);
12620+ return ret;
12621+ }
12622+
12623+ while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
12624+ && item_is_extent(&pos->coord)) {
12625+ ret = reiser4_alloc_extent(pos);
12626+ if (ret) {
12627+ break;
12628+ }
12629+ coord_next_unit(&pos->coord);
12630+ }
12631+
12632+ if (coord_is_after_rightmost(&pos->coord)) {
12633+ pos->state = POS_END_OF_TWIG;
12634+ return 0;
12635+ }
12636+ if (item_is_internal(&pos->coord)) {
12637+ pos->state = POS_TO_LEAF;
12638+ return 0;
12639+ }
12640+
12641+ assert("zam-860", item_is_extent(&pos->coord));
12642+
12643+ /* "slum" is over */
12644+ pos->state = POS_INVALID;
12645+ return 0;
12646+}
12647+
12648+/* When we are about to return the flush position from the twig to the leaf level,
12649+ * we can either process the right twig node or move the position to the leaf. This
12650+ * processes the right twig if possible and jumps to the leaf level if not. */
12651+static int handle_pos_end_of_twig(flush_pos_t * pos)
12652+{
12653+ int ret;
12654+ lock_handle right_lock;
12655+ load_count right_load;
12656+ coord_t at_right;
12657+ jnode *child = NULL;
12658+
12659+ assert("zam-848", pos->state == POS_END_OF_TWIG);
12660+ assert("zam-849", coord_is_after_rightmost(&pos->coord));
12661+
12662+ init_lh(&right_lock);
12663+ init_load_count(&right_load);
12664+
12665+	/* We get a lock on the right twig node even if it is not dirty, because
12666+	 * the slum continues or discontinues on the leaf level, not on the next
12667+	 * twig. This lock on the right twig is needed for getting its leftmost child. */
12668+ ret =
12669+ reiser4_get_right_neighbor(&right_lock, pos->lock.node,
12670+ ZNODE_WRITE_LOCK, GN_SAME_ATOM);
12671+ if (ret)
12672+ goto out;
12673+
12674+ ret = incr_load_count_znode(&right_load, right_lock.node);
12675+ if (ret)
12676+ goto out;
12677+
12678+ /* right twig could be not dirty */
12679+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
12680+ /* If right twig node is dirty we always attempt to squeeze it
12681+ * content to the left... */
12682+ became_dirty:
12683+ ret =
12684+ squeeze_right_twig_and_advance_coord(pos, right_lock.node);
12685+ if (ret <= 0) {
12686+ /* pos->coord is on internal item, go to leaf level, or
12687+ * we have an error which will be caught in squalloc() */
12688+ pos->state = POS_TO_LEAF;
12689+ goto out;
12690+ }
12691+
12692+		/* If the right twig was squeezed completely we have to re-lock
12693+		 * the right twig. Now it is done through the top-level squalloc
12694+		 * routine. */
12695+ if (node_is_empty(right_lock.node))
12696+ goto out;
12697+
12698+ /* ... and prep it if it is not yet prepped */
12699+ if (!znode_check_flushprepped(right_lock.node)) {
12700+ /* As usual, process parent before ... */
12701+ ret =
12702+ check_parents_and_squalloc_upper_levels(pos,
12703+ pos->lock.
12704+ node,
12705+ right_lock.
12706+ node);
12707+ if (ret)
12708+ goto out;
12709+
12710+ /* ... processing the child */
12711+ ret =
12712+ lock_parent_and_allocate_znode(right_lock.node,
12713+ pos);
12714+ if (ret)
12715+ goto out;
12716+ }
12717+ } else {
12718+ coord_init_first_unit(&at_right, right_lock.node);
12719+
12720+ /* check first child of next twig, should we continue there ? */
12721+ ret = get_leftmost_child_of_unit(&at_right, &child);
12722+ if (ret || child == NULL || jnode_check_flushprepped(child)) {
12723+ pos_stop(pos);
12724+ goto out;
12725+ }
12726+
12727+ /* check clean twig for possible relocation */
12728+ if (!znode_check_flushprepped(right_lock.node)) {
12729+ ret =
12730+ reverse_relocate_check_dirty_parent(child,
12731+ &at_right, pos);
12732+ if (ret)
12733+ goto out;
12734+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
12735+ goto became_dirty;
12736+ }
12737+ }
12738+
12739+ assert("zam-875", znode_check_flushprepped(right_lock.node));
12740+
12741+	/* Update the preceder with the block number of the just-processed right
12742+	 * twig node. The code above could miss updating the preceder because
12743+	 * allocate_znode() might not have been called for this node. */
12744+ pos->preceder.blk = *znode_get_block(right_lock.node);
12745+ check_preceder(pos->preceder.blk);
12746+
12747+ coord_init_first_unit(&at_right, right_lock.node);
12748+ assert("zam-868", coord_is_existing_unit(&at_right));
12749+
12750+ pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
12751+ move_flush_pos(pos, &right_lock, &right_load, &at_right);
12752+
12753+ out:
12754+ done_load_count(&right_load);
12755+ done_lh(&right_lock);
12756+
12757+ if (child)
12758+ jput(child);
12759+
12760+ return ret;
12761+}
12762+
12763+/* Move pos->lock to the leaf node pointed to by pos->coord, and check
12764+ * whether we should continue there. */
12765+static int handle_pos_to_leaf(flush_pos_t * pos)
12766+{
12767+ int ret;
12768+ lock_handle child_lock;
12769+ load_count child_load;
12770+ jnode *child;
12771+
12772+ assert("zam-846", pos->state == POS_TO_LEAF);
12773+ assert("zam-847", item_is_internal(&pos->coord));
12774+
12775+ init_lh(&child_lock);
12776+ init_load_count(&child_load);
12777+
12778+ ret = get_leftmost_child_of_unit(&pos->coord, &child);
12779+ if (ret)
12780+ return ret;
12781+ if (child == NULL) {
12782+ pos_stop(pos);
12783+ return 0;
12784+ }
12785+
12786+ if (jnode_check_flushprepped(child)) {
12787+ pos->state = POS_INVALID;
12788+ goto out;
12789+ }
12790+
12791+ ret =
12792+ longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
12793+ ZNODE_LOCK_LOPRI);
12794+ if (ret)
12795+ goto out;
12796+
12797+ ret = incr_load_count_znode(&child_load, JZNODE(child));
12798+ if (ret)
12799+ goto out;
12800+
12801+ ret = allocate_znode(JZNODE(child), &pos->coord, pos);
12802+ if (ret)
12803+ goto out;
12804+
12805+ /* move flush position to leaf level */
12806+ pos->state = POS_ON_LEAF;
12807+ move_flush_pos(pos, &child_lock, &child_load, NULL);
12808+
12809+ if (node_is_empty(JZNODE(child))) {
12810+ ret = delete_empty_node(JZNODE(child));
12811+ pos->state = POS_INVALID;
12812+ }
12813+ out:
12814+ done_load_count(&child_load);
12815+ done_lh(&child_lock);
12816+ jput(child);
12817+
12818+ return ret;
12819+}
12820+
12821+/* Move pos from the leaf to the twig level: move pos->lock to the
12822+ * upper (twig) level. */
12823+static int handle_pos_to_twig(flush_pos_t * pos)
12824+{
12825+ int ret;
12826+
12827+ lock_handle parent_lock;
12828+ load_count parent_load;
12829+ coord_t pcoord;
12830+
12831+ assert("zam-852", pos->state == POS_TO_TWIG);
12832+
12833+ init_lh(&parent_lock);
12834+ init_load_count(&parent_load);
12835+
12836+ ret =
12837+ reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
12838+ if (ret)
12839+ goto out;
12840+
12841+ ret = incr_load_count_znode(&parent_load, parent_lock.node);
12842+ if (ret)
12843+ goto out;
12844+
12845+ ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
12846+ if (ret)
12847+ goto out;
12848+
12849+ assert("zam-870", item_is_internal(&pcoord));
12850+ coord_next_item(&pcoord);
12851+
12852+ if (coord_is_after_rightmost(&pcoord))
12853+ pos->state = POS_END_OF_TWIG;
12854+ else if (item_is_extent(&pcoord))
12855+ pos->state = POS_ON_EPOINT;
12856+ else {
12857+		/* Here we understand that getting -E_NO_NEIGHBOR in
12858+		 * handle_pos_on_leaf() was just because we reached the edge of
12859+		 * the slum */
12860+ pos_stop(pos);
12861+ goto out;
12862+ }
12863+
12864+ move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
12865+
12866+ out:
12867+ done_load_count(&parent_load);
12868+ done_lh(&parent_lock);
12869+
12870+ return ret;
12871+}
12872+
12873+typedef int (*pos_state_handle_t) (flush_pos_t *);
12874+static pos_state_handle_t flush_pos_handlers[] = {
12875+ /* process formatted nodes on leaf level, keep lock on a leaf node */
12876+ [POS_ON_LEAF] = handle_pos_on_leaf,
12877+ /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently
12878+ * being processed */
12879+ [POS_ON_EPOINT] = handle_pos_on_twig,
12880+ /* move a lock from leaf node to its parent for further processing of unformatted nodes */
12881+ [POS_TO_TWIG] = handle_pos_to_twig,
12882+	/* move a lock from twig to leaf level when processing of unformatted nodes finishes,
12883+ * pos->coord points to the leaf node we jump to */
12884+ [POS_TO_LEAF] = handle_pos_to_leaf,
12885+	/* after processing the last extent in the twig node, attempt to shift items from the twig's
12886+	 * right neighbor and process them while shifting */
12887+ [POS_END_OF_TWIG] = handle_pos_end_of_twig,
12888+ /* process formatted nodes on internal level, keep lock on an internal node */
12889+ [POS_ON_INTERNAL] = handle_pos_on_internal
12890+};
12891+
12892+/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze,
12893+ * encrypt) nodes and their ancestors in "parent-first" order */
12894+static int squalloc(flush_pos_t * pos)
12895+{
12896+ int ret = 0;
12897+
12898+ /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
12899+ * greater CPU efficiency? Measure and see.... -Hans */
12900+ while (pos_valid(pos)) {
12901+ ret = flush_pos_handlers[pos->state] (pos);
12902+ if (ret < 0)
12903+ break;
12904+
12905+ ret = rapid_flush(pos);
12906+ if (ret)
12907+ break;
12908+ }
12909+
12910+	/* any positive value or -E_NO_NEIGHBOR is a legal return code for the handle_pos*
12911+	   routines; -E_NO_NEIGHBOR means that the slum edge was reached */
12912+ if (ret > 0 || ret == -E_NO_NEIGHBOR)
12913+ ret = 0;
12914+
12915+ return ret;
12916+}
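/* The loop above is a plain table-driven state machine: each handler processes
 * the current position and stores the next state in pos->state before
 * returning. A minimal self-contained sketch of the same dispatch pattern
 * (all names are toy stand-ins, not the reiser4 API): */
#include <stdio.h>

enum toy_state { ST_LEAF, ST_TWIG, ST_DONE, ST_NR };

struct toy_pos { enum toy_state state; int work_left; };

static int toy_on_leaf(struct toy_pos *p)
{
	/* pretend we processed one leaf; stop when the work runs out */
	p->state = (--p->work_left <= 0) ? ST_DONE : ST_TWIG;
	return 0;
}

static int toy_on_twig(struct toy_pos *p)
{
	p->state = ST_LEAF;	/* drop back down to the leaf level */
	return 0;
}

static int (*toy_handlers[ST_NR])(struct toy_pos *) = {
	[ST_LEAF] = toy_on_leaf,
	[ST_TWIG] = toy_on_twig,
};

int main(void)
{
	struct toy_pos p = { ST_LEAF, 4 };
	int ret = 0;

	while (p.state != ST_DONE && ret >= 0)
		ret = toy_handlers[p.state](&p);
	printf("finished with %d\n", ret);
	return 0;
}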
12917+
12918+static void update_ldkey(znode * node)
12919+{
12920+ reiser4_key ldkey;
12921+
12922+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
12923+ if (node_is_empty(node))
12924+ return;
12925+
12926+ znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
12927+}
12928+
12929+/* This is to be called after the node plugin's shift method has shifted data from @right
12930+   to @left. It sets the left delimiting keys of @left and @right to the keys of their
12931+   respective first items, and sets the right delimiting key of @left to the first key of @right */
12932+static void update_znode_dkeys(znode * left, znode * right)
12933+{
12934+ assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
12935+ assert("vs-1629", (znode_is_write_locked(left) &&
12936+ znode_is_write_locked(right)));
12937+
12938+	/* we need to update the left delimiting key of @left if it was empty before the shift */
12939+ update_ldkey(left);
12940+ update_ldkey(right);
12941+ if (node_is_empty(right))
12942+ znode_set_rd_key(left, znode_get_rd_key(right));
12943+ else
12944+ znode_set_rd_key(left, znode_get_ld_key(right));
12945+}
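/* The key bookkeeping above can be modeled with plain integers: after a left
 * shift, each node's left delimiting key is re-derived from its first item,
 * and @left's right delimiting key absorbs @right's whole range if @right
 * drained completely. A sketch with toy nodes (not reiser4 types): */
#include <assert.h>

struct toy_node { int ld, rd; int nr_items; int first_key; };

static void toy_update_dkeys(struct toy_node *left, struct toy_node *right)
{
	if (left->nr_items)
		left->ld = left->first_key;
	if (right->nr_items)
		right->ld = right->first_key;
	/* mirrors update_znode_dkeys(): left's rd becomes right's ld, or
	 * right's rd when right became empty */
	left->rd = right->nr_items ? right->ld : right->rd;
}

int main(void)
{
	struct toy_node l = { 0, 10, 3, 0 };
	struct toy_node r = { 10, 20, 0, 0 };	/* everything was shifted out */

	toy_update_dkeys(&l, &r);
	assert(l.rd == 20);	/* l now covers r's old key range */
	return 0;
}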
12946+
12947+/* Try to shift everything from @right to @left. If everything was shifted,
12948+   @right is removed from the tree. The result is the number of bytes shifted. */
12949+static int
12950+shift_everything_left(znode * right, znode * left, carry_level * todo)
12951+{
12952+ coord_t from;
12953+ node_plugin *nplug;
12954+ carry_plugin_info info;
12955+
12956+ coord_init_after_last_item(&from, right);
12957+
12958+ nplug = node_plugin_by_node(right);
12959+ info.doing = NULL;
12960+ info.todo = todo;
12961+ return nplug->shift(&from, left, SHIFT_LEFT,
12962+ 1 /* delete @right if it becomes empty */ ,
12963+ 1
12964+ /* move coord @from to node @left if everything will be shifted */
12965+ ,
12966+ &info);
12967+}
12968+
12969+/* Shift as much as possible from @right to @left using the memcpy-optimized
12970+ shift_everything_left. @left and @right are formatted neighboring nodes on
12971+ leaf level. */
12972+static int squeeze_right_non_twig(znode * left, znode * right)
12973+{
12974+ int ret;
12975+ carry_pool *pool;
12976+ carry_level *todo;
12977+
12978+ assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
12979+
12980+ if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
12981+ !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
12982+ return SQUEEZE_TARGET_FULL;
12983+
12984+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
12985+ if (IS_ERR(pool))
12986+ return PTR_ERR(pool);
12987+ todo = (carry_level *) (pool + 1);
12988+ init_carry_level(todo, pool);
12989+
12990+ ret = shift_everything_left(right, left, todo);
12991+ if (ret > 0) {
12992+ /* something was shifted */
12993+ reiser4_tree *tree;
12994+ __u64 grabbed;
12995+
12996+ znode_make_dirty(left);
12997+ znode_make_dirty(right);
12998+
12999+		/* update delimiting keys of nodes which participated in
13000+		   the shift. FIXME: it would be better to have this in the
13001+		   node plugin's shift operation, but it cannot be done
13002+		   there; nobody remembers why */
13003+ tree = znode_get_tree(left);
13004+ write_lock_dk(tree);
13005+ update_znode_dkeys(left, right);
13006+ write_unlock_dk(tree);
13007+
13008+ /* Carry is called to update delimiting key and, maybe, to remove empty
13009+ node. */
13010+ grabbed = get_current_context()->grabbed_blocks;
13011+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13012+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13013+ ret = reiser4_carry(todo, NULL /* previous level */ );
13014+ grabbed2free_mark(grabbed);
13015+ } else {
13016+		/* Shifting is impossible; return the appropriate result code */
13017+ ret =
13018+ node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
13019+ SQUEEZE_TARGET_FULL;
13020+ }
13021+
13022+ done_carry_pool(pool);
13023+
13024+ return ret;
13025+}
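/* init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo)) above uses a common
 * kernel idiom: a single allocation holds a header plus trailing objects,
 * which are reached by pointer arithmetic past the header. A userspace
 * sketch of the idiom (it assumes the trailing type needs no stricter
 * alignment than the header provides, which holds for these toy types): */
#include <stdio.h>
#include <stdlib.h>

struct toy_pool { int id; };
struct toy_level { int depth; };

int main(void)
{
	struct toy_pool *pool = malloc(sizeof(*pool) + 3 * sizeof(struct toy_level));
	struct toy_level *levels;
	int i;

	if (!pool)
		return 1;
	levels = (struct toy_level *)(pool + 1);	/* first byte past the header */
	for (i = 0; i < 3; i++)
		levels[i].depth = i;
	printf("%d %d %d\n", levels[0].depth, levels[1].depth, levels[2].depth);
	free(pool);	/* one free releases the header and all trailing objects */
	return 0;
}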
13026+
13027+#if REISER4_DEBUG
13028+static int sibling_link_is_ok(const znode *left, const znode *right)
13029+{
13030+ int result;
13031+
13032+ read_lock_tree(znode_get_tree(left));
13033+ result = (left->right == right && left == right->left);
13034+ read_unlock_tree(znode_get_tree(left));
13035+ return result;
13036+}
13037+#endif
13038+
13039+/* Shift first unit of first item if it is an internal one. Return
13040+ SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
13041+ SUBTREE_MOVED. */
13042+static int shift_one_internal_unit(znode * left, znode * right)
13043+{
13044+ int ret;
13045+ carry_pool *pool;
13046+ carry_level *todo;
13047+ coord_t *coord;
13048+ carry_plugin_info *info;
13049+ int size, moved;
13050+
13051+ assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
13052+ assert("nikita-2435", znode_is_write_locked(left));
13053+ assert("nikita-2436", znode_is_write_locked(right));
13054+ assert("nikita-2434", sibling_link_is_ok(left, right));
13055+
13056+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
13057+ sizeof(*coord) + sizeof(*info)
13058+#if REISER4_DEBUG
13059+ + sizeof(*coord) + 2 * sizeof(reiser4_key)
13060+#endif
13061+ );
13062+ if (IS_ERR(pool))
13063+ return PTR_ERR(pool);
13064+ todo = (carry_level *) (pool + 1);
13065+ init_carry_level(todo, pool);
13066+
13067+ coord = (coord_t *) (todo + 3);
13068+ coord_init_first_unit(coord, right);
13069+ info = (carry_plugin_info *) (coord + 1);
13070+
13071+#if REISER4_DEBUG
13072+ if (!node_is_empty(left)) {
13073+ coord_t *last;
13074+ reiser4_key *right_key;
13075+ reiser4_key *left_key;
13076+
13077+ last = (coord_t *) (info + 1);
13078+ right_key = (reiser4_key *) (last + 1);
13079+ left_key = right_key + 1;
13080+ coord_init_last_unit(last, left);
13081+
13082+ assert("nikita-2463",
13083+ keyle(item_key_by_coord(last, left_key),
13084+ item_key_by_coord(coord, right_key)));
13085+ }
13086+#endif
13087+
13088+ assert("jmacd-2007", item_is_internal(coord));
13089+
13090+ size = item_length_by_coord(coord);
13091+ info->todo = todo;
13092+ info->doing = NULL;
13093+
13094+ ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13095+ 1
13096+ /* delete @right if it becomes empty */
13097+ ,
13098+ 0
13099+ /* do not move coord @coord to node @left */
13100+ ,
13101+ info);
13102+
13103+ /* If shift returns positive, then we shifted the item. */
13104+ assert("vs-423", ret <= 0 || size == ret);
13105+ moved = (ret > 0);
13106+
13107+ if (moved) {
13108+ /* something was moved */
13109+ reiser4_tree *tree;
13110+ int grabbed;
13111+
13112+ znode_make_dirty(left);
13113+ znode_make_dirty(right);
13114+ tree = znode_get_tree(left);
13115+ write_lock_dk(tree);
13116+ update_znode_dkeys(left, right);
13117+ write_unlock_dk(tree);
13118+
13119+ /* reserve space for delimiting keys after shifting */
13120+ grabbed = get_current_context()->grabbed_blocks;
13121+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13122+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13123+
13124+ ret = reiser4_carry(todo, NULL /* previous level */ );
13125+ grabbed2free_mark(grabbed);
13126+ }
13127+
13128+ done_carry_pool(pool);
13129+
13130+ if (ret != 0) {
13131+ /* Shift or carry operation failed. */
13132+ assert("jmacd-7325", ret < 0);
13133+ return ret;
13134+ }
13135+
13136+ return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13137+}
13138+
13139+/* Make the final relocate/wander decision during forward parent-first squalloc for a
13140+ znode. For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */
13141+static int
13142+allocate_znode_loaded(znode * node,
13143+ const coord_t * parent_coord, flush_pos_t * pos)
13144+{
13145+ int ret;
13146+ reiser4_super_info_data *sbinfo = get_current_super_private();
13147+ /* FIXME(D): We have the node write-locked and should have checked for !
13148+ allocated() somewhere before reaching this point, but there can be a race, so
13149+ this assertion is bogus. */
13150+ assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13151+ assert("jmacd-7988", znode_is_write_locked(node));
13152+ assert("jmacd-7989", coord_is_invalid(parent_coord)
13153+ || znode_is_write_locked(parent_coord->node));
13154+
13155+ if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13156+ znode_is_root(node) ||
13157+ /* We have enough nodes to relocate no matter what. */
13158+ (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13159+		/* There is no decision to make for new nodes; they are treated
13160+		   the same as relocate. If the root node is dirty, relocate. */
13161+ if (pos->preceder.blk == 0) {
13162+			/* the preceder is unknown and we have decided to relocate the
13163+			   node -- using the default value for the search start is better
13164+			   than searching from block #0. */
13165+ get_blocknr_hint_default(&pos->preceder.blk);
13166+ check_preceder(pos->preceder.blk);
13167+ }
13168+
13169+ goto best_reloc;
13170+
13171+ } else if (pos->preceder.blk == 0) {
13172+ /* If we don't know the preceder, leave it where it is. */
13173+ jnode_make_wander(ZJNODE(node));
13174+ } else {
13175+ /* Make a decision based on block distance. */
13176+ reiser4_block_nr dist;
13177+ reiser4_block_nr nblk = *znode_get_block(node);
13178+
13179+ assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
13180+ assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13181+ assert("jmacd-6174", pos->preceder.blk != 0);
13182+
13183+ if (pos->preceder.blk == nblk - 1) {
13184+ /* Ideal. */
13185+ jnode_make_wander(ZJNODE(node));
13186+ } else {
13187+
13188+			dist = (nblk < pos->preceder.blk) ?
13189+			    (pos->preceder.blk - nblk) :
13190+			    (nblk - pos->preceder.blk);
13193+
13194+ /* See if we can find a closer block (forward direction only). */
13195+ pos->preceder.max_dist =
13196+ min((reiser4_block_nr) sbinfo->flush.
13197+ relocate_distance, dist);
13198+ pos->preceder.level = znode_get_level(node);
13199+
13200+ ret = allocate_znode_update(node, parent_coord, pos);
13201+
13202+ pos->preceder.max_dist = 0;
13203+
13204+ if (ret && (ret != -ENOSPC))
13205+ return ret;
13206+
13207+ if (ret == 0) {
13208+ /* Got a better allocation. */
13209+ znode_make_reloc(node, pos->fq);
13210+ } else if (dist < sbinfo->flush.relocate_distance) {
13211+ /* The present allocation is good enough. */
13212+ jnode_make_wander(ZJNODE(node));
13213+ } else {
13214+ /* Otherwise, try to relocate to the best position. */
13215+ best_reloc:
13216+ ret =
13217+ allocate_znode_update(node, parent_coord,
13218+ pos);
13219+ if (ret != 0)
13220+ return ret;
13221+
13222+ /* set JNODE_RELOC bit _after_ node gets allocated */
13223+ znode_make_reloc(node, pos->fq);
13224+ }
13225+ }
13226+ }
13227+
13228+ /* This is the new preceder. */
13229+ pos->preceder.blk = *znode_get_block(node);
13230+ check_preceder(pos->preceder.blk);
13231+ pos->alloc_cnt += 1;
13232+
13233+ assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13234+
13235+ return 0;
13236+}
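/* The relocate/wander policy above boils down to a distance test against the
 * preceder. A simplified pure-function sketch of that policy (it omits the
 * trial allocation with max_dist that the real code performs first, and
 * assumes a nonzero block number; all names are toy stand-ins): */
#include <stdio.h>

enum toy_fate { TOY_WANDER, TOY_RELOCATE };

static enum toy_fate toy_decide(unsigned long long preceder,
				unsigned long long blk,
				unsigned long long relocate_distance)
{
	unsigned long long dist;

	if (preceder == 0)
		return TOY_WANDER;	/* unknown preceder: leave the block alone */
	if (preceder == blk - 1)
		return TOY_WANDER;	/* ideal: already contiguous with the preceder */
	dist = blk < preceder ? preceder - blk : blk - preceder;
	return dist < relocate_distance ? TOY_WANDER : TOY_RELOCATE;
}

int main(void)
{
	printf("%d\n", toy_decide(100, 101, 64));	/* 0: contiguous, wander */
	printf("%d\n", toy_decide(100, 5000, 64));	/* 1: far away, relocate */
	return 0;
}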
13237+
13238+static int
13239+allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
13240+{
13241+ /*
13242+ * perform znode allocation with znode pinned in memory to avoid races
13243+ * with asynchronous emergency flush (which plays with
13244+ * JNODE_FLUSH_RESERVED bit).
13245+ */
13246+ return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13247+}
13248+
13249+/* A subroutine of allocate_znode, this is called first to see if there is a close
13250+   position to relocate to. It may return -ENOSPC if there is no close position, in
13251+   which case no relocation is done. It also takes care of updating the parent node
13252+   with the relocated block address. */
13253+static int
13254+allocate_znode_update(znode * node, const coord_t * parent_coord,
13255+ flush_pos_t * pos)
13256+{
13257+ int ret;
13258+ reiser4_block_nr blk;
13259+ lock_handle uber_lock;
13260+ int flush_reserved_used = 0;
13261+ int grabbed;
13262+ reiser4_context *ctx;
13263+ reiser4_super_info_data *sbinfo;
13264+
13265+ init_lh(&uber_lock);
13266+
13267+ ctx = get_current_context();
13268+ sbinfo = get_super_private(ctx->super);
13269+
13270+ grabbed = ctx->grabbed_blocks;
13271+
13272+ /* discard e-flush allocation */
13273+ ret = zload(node);
13274+ if (ret)
13275+ return ret;
13276+
13277+ if (ZF_ISSET(node, JNODE_CREATED)) {
13278+ assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node)));
13279+ pos->preceder.block_stage = BLOCK_UNALLOCATED;
13280+ } else {
13281+ pos->preceder.block_stage = BLOCK_GRABBED;
13282+
13283+		/* The disk space for relocating @node is already reserved in the "flush
13284+		 * reserved" counter if @node is a leaf; otherwise we grab space using
13285+		 * BA_RESERVED (which means grabbing from the whole disk, not from only 95% of it). */
13286+ if (znode_get_level(node) == LEAF_LEVEL) {
13287+ /*
13288+ * earlier (during do_jnode_make_dirty()) we decided
13289+ * that @node can possibly go into overwrite set and
13290+ * reserved block for its wandering location.
13291+ */
13292+ txn_atom *atom = get_current_atom_locked();
13293+ assert("nikita-3449",
13294+ ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13295+ flush_reserved2grabbed(atom, (__u64) 1);
13296+ spin_unlock_atom(atom);
13297+ /*
13298+ * we are trying to move node into relocate
13299+ * set. Allocation of relocated position "uses"
13300+ * reserved block.
13301+ */
13302+ ZF_CLR(node, JNODE_FLUSH_RESERVED);
13303+ flush_reserved_used = 1;
13304+ } else {
13305+ ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13306+ if (ret != 0)
13307+ goto exit;
13308+ }
13309+ }
13310+
13311+	/* We might not use the 5% of reserved disk space here, in which case flush will not pack tightly. */
13312+ ret = reiser4_alloc_block(&pos->preceder, &blk,
13313+ BA_FORMATTED | BA_PERMANENT);
13314+ if (ret)
13315+ goto exit;
13316+
13317+ if (!ZF_ISSET(node, JNODE_CREATED) &&
13318+ (ret =
13319+ reiser4_dealloc_block(znode_get_block(node), 0,
13320+ BA_DEFER | BA_FORMATTED)))
13321+ goto exit;
13322+
13323+ if (likely(!znode_is_root(node))) {
13324+ item_plugin *iplug;
13325+
13326+ iplug = item_plugin_by_coord(parent_coord);
13327+ assert("nikita-2954", iplug->f.update != NULL);
13328+ iplug->f.update(parent_coord, &blk);
13329+
13330+ znode_make_dirty(parent_coord->node);
13331+
13332+ } else {
13333+ reiser4_tree *tree = znode_get_tree(node);
13334+ znode *uber;
13335+
13336+ /* We take a longterm lock on the fake node in order to change
13337+ the root block number. This may cause atom fusion. */
13338+ ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13339+ &uber_lock);
13340+		/* The fake node cannot be deleted, we must have priority here,
13341+		   and the result must not be confused with ENOSPC. */
13342+ assert("jmacd-74412",
13343+ ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13344+
13345+ if (ret)
13346+ goto exit;
13347+
13348+ uber = uber_lock.node;
13349+
13350+ write_lock_tree(tree);
13351+ tree->root_block = blk;
13352+ write_unlock_tree(tree);
13353+
13354+ znode_make_dirty(uber);
13355+ }
13356+
13357+ ret = znode_rehash(node, &blk);
13358+ exit:
13359+ if (ret) {
13360+		/* Get the flush reserved block back if something fails, because
13361+		 * callers assume that on error the block wasn't relocated and its
13362+		 * flush reserved block wasn't used. */
13363+ if (flush_reserved_used) {
13364+ /*
13365+ * ok, we failed to move node into relocate
13366+ * set. Restore status quo.
13367+ */
13368+ grabbed2flush_reserved((__u64) 1);
13369+ ZF_SET(node, JNODE_FLUSH_RESERVED);
13370+ }
13371+ }
13372+ zrelse(node);
13373+ done_lh(&uber_lock);
13374+ grabbed2free_mark(grabbed);
13375+ return ret;
13376+}
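/* The error path above restores the flush-reserved credit so that a failed
 * relocation leaves the accounting exactly as it was. A sketch of that
 * spend-then-rollback shape (toy state and toy allocator, not the reiser4
 * accounting): */
#include <stdio.h>

struct toy_state { int reserved; long blk; };

static int toy_try_alloc(long *blk)
{
	*blk = 42;	/* pretend the allocator found a better position */
	return 0;
}

static int toy_relocate(struct toy_state *s)
{
	int used_reserve = 0;
	long newblk;
	int ret;

	if (s->reserved > 0) {
		s->reserved--;		/* spend the flush-reserved credit */
		used_reserve = 1;
	}
	ret = toy_try_alloc(&newblk);
	if (ret) {
		if (used_reserve)
			s->reserved++;	/* rollback: restore status quo */
		return ret;
	}
	s->blk = newblk;
	return 0;
}

int main(void)
{
	struct toy_state s = { 1, 0 };
	int ret = toy_relocate(&s);

	printf("ret=%d blk=%ld reserved=%d\n", ret, s.blk, s.reserved);
	return 0;
}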
13377+
13378+/* JNODE INTERFACE */
13379+
13380+/* Lock a node (if formatted) and then get its parent locked, set the child's
13381+ coordinate in the parent. If the child is the root node, the above_root
13382+ znode is returned but the coord is not set. This function may cause atom
13383+ fusion, but it is only used for read locks (at this point) and therefore
13384+ fusion only occurs when the parent is already dirty. */
13385+/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
13386+ pointer in jnodes. */
13387+static int
13388+jnode_lock_parent_coord(jnode * node,
13389+ coord_t * coord,
13390+ lock_handle * parent_lh,
13391+ load_count * parent_zh,
13392+ znode_lock_mode parent_mode, int try)
13393+{
13394+ int ret;
13395+
13396+ assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13397+ assert("edward-54", jnode_is_unformatted(node)
13398+ || znode_is_any_locked(JZNODE(node)));
13399+
13400+ if (!jnode_is_znode(node)) {
13401+ reiser4_key key;
13402+ tree_level stop_level = TWIG_LEVEL;
13403+ lookup_bias bias = FIND_EXACT;
13404+
13405+ assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13406+
13407+		/* The case when the node is not a znode but can have a parent
13408+		   coord (an unformatted node, a node which represents a cluster
13409+		   page, etc.). Generate a key for the appropriate entry and
13410+		   search in the tree using coord_by_key, which handles locking
13411+		   for us. */
13412+
13413+		/*
13414+		 * nothing is locked at this moment, so nothing prevents a
13415+		 * concurrent truncate from removing the jnode from its inode.
13416+		 * To prevent this, spin-lock the jnode. The jnode can still be
13417+		 * truncated just after the call to jnode_build_key(), but that
13418+		 * is ok, because coord_by_key() will simply fail to find the
13419+		 * appropriate extent.
13420+		 */
13421+ spin_lock_jnode(node);
13422+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13423+ jnode_build_key(node, &key);
13424+ ret = 0;
13425+ } else
13426+ ret = RETERR(-ENOENT);
13427+ spin_unlock_jnode(node);
13428+
13429+ if (ret != 0)
13430+ return ret;
13431+
13432+ if (jnode_is_cluster_page(node))
13433+ stop_level = LEAF_LEVEL;
13434+
13435+ assert("jmacd-1812", coord != NULL);
13436+
13437+ ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13438+ parent_mode, bias, stop_level, stop_level,
13439+ CBK_UNIQUE, NULL /*ra_info */ );
13440+ switch (ret) {
13441+ case CBK_COORD_NOTFOUND:
13442+ assert("edward-1038",
13443+ ergo(jnode_is_cluster_page(node),
13444+ JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13445+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13446+ warning("nikita-3177", "Parent not found");
13447+ return ret;
13448+ case CBK_COORD_FOUND:
13449+ if (coord->between != AT_UNIT) {
13450+ /* FIXME: comment needed */
13451+ done_lh(parent_lh);
13452+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13453+ warning("nikita-3178",
13454+ "Found but not happy: %i",
13455+ coord->between);
13456+ }
13457+ return RETERR(-ENOENT);
13458+ }
13459+ ret = incr_load_count_znode(parent_zh, parent_lh->node);
13460+ if (ret != 0)
13461+ return ret;
13462+ /* if (jnode_is_cluster_page(node)) {
13463+ races with write() are possible
13464+ check_child_cluster (parent_lh->node);
13465+ }
13466+ */
13467+ break;
13468+ default:
13469+ return ret;
13470+ }
13471+
13472+ } else {
13473+ int flags;
13474+ znode *z;
13475+
13476+ z = JZNODE(node);
13477+ /* Formatted node case: */
13478+ assert("jmacd-2061", !znode_is_root(z));
13479+
13480+ flags = GN_ALLOW_NOT_CONNECTED;
13481+ if (try)
13482+ flags |= GN_TRY_LOCK;
13483+
13484+ ret =
13485+ reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
13486+ if (ret != 0)
13487+ /* -E_REPEAT is ok here, it is handled by the caller. */
13488+ return ret;
13489+
13490+ /* Make the child's position "hint" up-to-date. (Unless above
13491+ root, which caller must check.) */
13492+ if (coord != NULL) {
13493+
13494+ ret = incr_load_count_znode(parent_zh, parent_lh->node);
13495+ if (ret != 0) {
13496+ warning("jmacd-976812386",
13497+ "incr_load_count_znode failed: %d",
13498+ ret);
13499+ return ret;
13500+ }
13501+
13502+ ret = find_child_ptr(parent_lh->node, z, coord);
13503+ if (ret != 0) {
13504+ warning("jmacd-976812",
13505+ "find_child_ptr failed: %d", ret);
13506+ return ret;
13507+ }
13508+ }
13509+ }
13510+
13511+ return 0;
13512+}
13513+
13514+/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
13515+   If there is no next neighbor, or the neighbor is not in memory, or there is a
13516+   neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned.
13517+   In some cases the slum may include nodes which are not dirty; if so, @check_dirty should be 0 */
13518+static int neighbor_in_slum(znode * node, /* starting point */
13519+ lock_handle * lock, /* lock on starting point */
13520+ sideof side, /* left or right direction we seek the next node in */
13521+ znode_lock_mode mode, /* kind of lock we want */
13522+ int check_dirty, /* true if the neighbor should be dirty */
13523+			    int use_upper_levels	/* get neighbor by going through
13524+						   upper levels */)
13525+{
13526+ int ret;
13527+ int flags;
13528+
13529+ assert("jmacd-6334", znode_is_connected(node));
13530+
13531+ flags = GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0);
13532+ if (use_upper_levels)
13533+ flags |= GN_CAN_USE_UPPER_LEVELS;
13534+
13535+ ret = reiser4_get_neighbor(lock, node, mode, flags);
13536+ if (ret) {
13537+ /* May return -ENOENT or -E_NO_NEIGHBOR. */
13538+ /* FIXME(C): check EINVAL, E_DEADLOCK */
13539+ if (ret == -ENOENT) {
13540+ ret = RETERR(-E_NO_NEIGHBOR);
13541+ }
13542+ return ret;
13543+ }
13544+ if (!check_dirty)
13545+ return 0;
13546+ /* Check dirty bit of locked znode, no races here */
13547+ if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
13548+ return 0;
13549+
13550+ done_lh(lock);
13551+ return RETERR(-E_NO_NEIGHBOR);
13552+}
13553+
13554+/* Return true if two znodes have the same parent. This is called with both nodes
13555+ write-locked (for squeezing) so no tree lock is needed. */
13556+static int znode_same_parents(znode * a, znode * b)
13557+{
13558+ int result;
13559+
13560+ assert("jmacd-7011", znode_is_write_locked(a));
13561+ assert("jmacd-7012", znode_is_write_locked(b));
13562+
13563+ /* We lock the whole tree for this check.... I really don't like whole tree
13564+ * locks... -Hans */
13565+ read_lock_tree(znode_get_tree(a));
13566+ result = (znode_parent(a) == znode_parent(b));
13567+ read_unlock_tree(znode_get_tree(a));
13568+ return result;
13569+}
13570+
13571+/* FLUSH SCAN */
13572+
13573+/* Initialize the flush_scan data structure. */
13574+static void scan_init(flush_scan * scan)
13575+{
13576+ memset(scan, 0, sizeof(*scan));
13577+ init_lh(&scan->node_lock);
13578+ init_lh(&scan->parent_lock);
13579+ init_load_count(&scan->parent_load);
13580+ init_load_count(&scan->node_load);
13581+ coord_init_invalid(&scan->parent_coord, NULL);
13582+}
13583+
13584+/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */
13585+static void scan_done(flush_scan * scan)
13586+{
13587+ done_load_count(&scan->node_load);
13588+ if (scan->node != NULL) {
13589+ jput(scan->node);
13590+ scan->node = NULL;
13591+ }
13592+ done_load_count(&scan->parent_load);
13593+ done_lh(&scan->parent_lock);
13594+ done_lh(&scan->node_lock);
13595+}
13596+
13597+/* Returns true if flush scanning is finished. */
13598+int reiser4_scan_finished(flush_scan * scan)
13599+{
13600+ return scan->stop || (scan->direction == RIGHT_SIDE &&
13601+ scan->count >= scan->max_count);
13602+}
13603+
13604+/* Return true if the scan should continue to @tonode, i.e., if the node meets the
13605+   same_slum_check condition. If not, release the reference to @tonode and stop the scan. */
13606+int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
13607+{
13608+ int go = same_slum_check(scan->node, tonode, 1, 0);
13609+
13610+ if (!go) {
13611+ scan->stop = 1;
13612+ jput(tonode);
13613+ }
13614+
13615+ return go;
13616+}
13617+
13618+/* Set the current scan->node, refcount it, increment count by the @add_count (number to
13619+ count, e.g., skipped unallocated nodes), deref previous current, and copy the current
13620+ parent coordinate. */
13621+int
13622+scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
13623+ const coord_t * parent)
13624+{
13625+ /* Release the old references, take the new reference. */
13626+ done_load_count(&scan->node_load);
13627+
13628+ if (scan->node != NULL) {
13629+ jput(scan->node);
13630+ }
13631+ scan->node = node;
13632+ scan->count += add_count;
13633+
13634+	/* This next statement is somewhat inefficient. The reiser4_scan_extent() code
13635+	   could delay this update step until it finishes and update the parent_coord
13636+	   only once. It did that before, but there was a bug and this was the easiest
13637+	   way to make it correct. */
13638+ if (parent != NULL) {
13639+ coord_dup(&scan->parent_coord, parent);
13640+ }
13641+
13642+ /* Failure may happen at the incr_load_count call, but the caller can assume the reference
13643+ is safely taken. */
13644+ return incr_load_count_jnode(&scan->node_load, node);
13645+}
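/* The reference hand-off above follows a simple rule: the caller passes in an
 * already-taken reference, and the scan drops its old one before adopting the
 * new. A refcounting sketch with toy nodes (locking and load counts omitted;
 * all names are stand-ins): */
#include <stdio.h>

struct toy_rnode { int refs; };

static struct toy_rnode *toy_get(struct toy_rnode *n) { n->refs++; return n; }
static void toy_put(struct toy_rnode *n) { n->refs--; }

struct toy_scan { struct toy_rnode *node; unsigned count; };

static void toy_set_current(struct toy_scan *s,
			    struct toy_rnode *node /* already referenced */,
			    unsigned add)
{
	if (s->node)
		toy_put(s->node);	/* release the previous position */
	s->node = node;			/* adopt the caller's reference */
	s->count += add;
}

int main(void)
{
	struct toy_rnode a = { 0 }, b = { 0 };
	struct toy_scan s = { NULL, 0 };

	toy_set_current(&s, toy_get(&a), 1);
	toy_set_current(&s, toy_get(&b), 1);	/* drops the reference on a */
	printf("a=%d b=%d count=%u\n", a.refs, b.refs, s.count);	/* a=0 b=1 count=2 */
	return 0;
}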
13646+
13647+/* Return true if scanning in the leftward direction. */
13648+int reiser4_scanning_left(flush_scan * scan)
13649+{
13650+ return scan->direction == LEFT_SIDE;
13651+}
13652+
13653+/* Performs leftward scanning starting from either kind of node. Counts the starting
13654+ node. The right-scan object is passed in for the left-scan in order to copy the parent
13655+ of an unformatted starting position. This way we avoid searching for the unformatted
13656+   node's parent when scanning in each direction. If we search for the parent once, it is
13657+   set in both scan objects. The limit parameter tells flush-scan when to stop.
13658+
13659+ Rapid scanning is used only during scan_left, where we are interested in finding the
13660+ 'leftpoint' where we begin flushing. We are interested in stopping at the left child
13661+ of a twig that does not have a dirty left neighbor. THIS IS A SPECIAL CASE. The
13662+ problem is finding a way to flush only those nodes without unallocated children, and it
13663+ is difficult to solve in the bottom-up flushing algorithm we are currently using. The
13664+ problem can be solved by scanning left at every level as we go upward, but this would
13665+ basically bring us back to using a top-down allocation strategy, which we already tried
13666+ (see BK history from May 2002), and has a different set of problems. The top-down
13667+ strategy makes avoiding unallocated children easier, but makes it difficult to
13668+   properly flush dirty children with clean parents that would otherwise stop the
13669+ top-down flush, only later to dirty the parent once the children are flushed. So we
13670+ solve the problem in the bottom-up algorithm with a special case for twigs and leaves
13671+ only.
13672+
13673+ The first step in solving the problem is this rapid leftward scan. After we determine
13674+ that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we
13675+   are no longer interested in the exact count, we are only interested in finding the
13676+ best place to start the flush. We could choose one of two possibilities:
13677+
13678+ 1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
13679+ This requires checking one leaf per rapid-scan twig
13680+   This requires checking one leaf per rapid-scan twig.
13681+ 2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
13682+ to the left. This requires checking possibly all of the in-memory children of each
13683+ twig during the rapid scan.
13684+
13685+ For now we implement the first policy.
13686+*/
13687+static int
13688+scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
13689+{
13690+ int ret = 0;
13691+
13692+ scan->max_count = limit;
13693+ scan->direction = LEFT_SIDE;
13694+
13695+ ret = scan_set_current(scan, jref(node), 1, NULL);
13696+ if (ret != 0) {
13697+ return ret;
13698+ }
13699+
13700+ ret = scan_common(scan, right);
13701+ if (ret != 0) {
13702+ return ret;
13703+ }
13704+
13705+	/* Before rapid scanning, we need a lock on scan->node so that we can get its
13706+	   parent, but only if it is formatted. */
13707+ if (jnode_is_znode(scan->node)) {
13708+ ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
13709+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
13710+ }
13711+
13712+ /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
13713+ return ret;
13714+}
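/* Stripped of locking and atom management, the left scan is a bounded walk
 * over sibling pointers that stops at the first clean or foreign node. A toy
 * version over a linked list (assumes single-threaded access; toy names): */
#include <stdio.h>

struct toy_n { struct toy_n *left; int dirty; int atom; };

static unsigned toy_count_left(struct toy_n *start, unsigned limit)
{
	unsigned count = 1;	/* the starting node is counted, as in scan_left() */
	struct toy_n *cur = start;

	while (count < limit && cur->left &&
	       cur->left->dirty && cur->left->atom == cur->atom) {
		cur = cur->left;
		count++;
	}
	return count;
}

int main(void)
{
	struct toy_n n3 = { NULL, 1, 7 };
	struct toy_n n2 = { &n3, 1, 7 };
	struct toy_n n1 = { &n2, 1, 7 };

	printf("%u\n", toy_count_left(&n1, 100));	/* 3 */
	return 0;
}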
13715+
13716+/* Performs rightward scanning... Does not count the starting node. The limit parameter
13717+ is described in scan_left. If the starting node is unformatted then the
13718+ parent_coord was already set during scan_left. The rapid_after parameter is not used
13719+ during right-scanning.
13720+
13721+ scan_right is only called if the scan_left operation does not count at least
13722+ FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit parameter is set to
13723+ the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
13724+ scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */
13725+static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
13726+{
13727+ int ret;
13728+
13729+ scan->max_count = limit;
13730+ scan->direction = RIGHT_SIDE;
13731+
13732+ ret = scan_set_current(scan, jref(node), 0, NULL);
13733+ if (ret != 0) {
13734+ return ret;
13735+ }
13736+
13737+ return scan_common(scan, NULL);
13738+}
13739+
13740+/* Common code to perform left or right scanning. */
13741+static int scan_common(flush_scan * scan, flush_scan * other)
13742+{
13743+ int ret;
13744+
13745+ assert("nikita-2376", scan->node != NULL);
13746+ assert("edward-54", jnode_is_unformatted(scan->node)
13747+ || jnode_is_znode(scan->node));
13748+
13749+ /* Special case for starting at an unformatted node. Optimization: we only want
13750+ to search for the parent (which requires a tree traversal) once. Obviously, we
13751+ shouldn't have to call it once for the left scan and once for the right scan.
13752+ For this reason, if we search for the parent during scan-left we then duplicate
13753+ the coord/lock/load into the scan-right object. */
13754+ if (jnode_is_unformatted(scan->node)) {
13755+ ret = scan_unformatted(scan, other);
13756+ if (ret != 0)
13757+ return ret;
13758+ }
13759+ /* This loop expects to start at a formatted position and performs chaining of
13760+ formatted regions */
13761+ while (!reiser4_scan_finished(scan)) {
13762+
13763+ ret = scan_formatted(scan);
13764+ if (ret != 0) {
13765+ return ret;
13766+ }
13767+ }
13768+
13769+ return 0;
13770+}
13771+
13772+static int scan_unformatted(flush_scan * scan, flush_scan * other)
13773+{
13774+ int ret = 0;
13775+ int try = 0;
13776+
13777+ if (!coord_is_invalid(&scan->parent_coord))
13778+ goto scan;
13779+
13780+	/* set the parent coord from the current scan position: */
13781+ if (!jnode_is_unformatted(scan->node)) {
13782+ /* formatted position */
13783+
13784+ lock_handle lock;
13785+ assert("edward-301", jnode_is_znode(scan->node));
13786+ init_lh(&lock);
13787+
13788+		/*
13789+		 * when flush starts from an unformatted node, the first thing it
13790+		 * does is a tree traversal to find the formatted parent of the
13791+		 * starting node. This parent is then kept locked across the scans
13792+		 * to the left and to the right. This means that during the scan to
13793+		 * the left we cannot take a left-ward lock, because this is
13794+		 * deadlock prone. So, if we are scanning to the left and there is
13795+		 * already a lock held by this thread,
13796+		 * jnode_lock_parent_coord() should use a try-lock.
13797+		 */
13798+ try = reiser4_scanning_left(scan)
13799+ && !lock_stack_isclean(get_current_lock_stack());
13800+		/* We need the node locked to get the parent lock. We have to
13801+		   take a write lock since there is at least one call path
13802+		   where this znode is already write-locked by us. */
13803+ ret =
13804+ longterm_lock_znode(&lock, JZNODE(scan->node),
13805+ ZNODE_WRITE_LOCK,
13806+ reiser4_scanning_left(scan) ?
13807+ ZNODE_LOCK_LOPRI :
13808+ ZNODE_LOCK_HIPRI);
13809+ if (ret != 0)
13810+ /* EINVAL or E_DEADLOCK here mean... try again! At this point we've
13811+ scanned too far and can't back out, just start over. */
13812+ return ret;
13813+
13814+ ret = jnode_lock_parent_coord(scan->node,
13815+ &scan->parent_coord,
13816+ &scan->parent_lock,
13817+ &scan->parent_load,
13818+ ZNODE_WRITE_LOCK, try);
13819+
13820+ /* FIXME(C): check EINVAL, E_DEADLOCK */
13821+ done_lh(&lock);
13822+ if (ret == -E_REPEAT) {
13823+ scan->stop = 1;
13824+ return 0;
13825+ }
13826+ if (ret)
13827+ return ret;
13828+
13829+ } else {
13830+ /* unformatted position */
13831+
13832+ ret =
13833+ jnode_lock_parent_coord(scan->node, &scan->parent_coord,
13834+ &scan->parent_lock,
13835+ &scan->parent_load,
13836+ ZNODE_WRITE_LOCK, try);
13837+
13838+ if (IS_CBKERR(ret))
13839+ return ret;
13840+
13841+ if (ret == CBK_COORD_NOTFOUND)
13842+ /* FIXME(C): check EINVAL, E_DEADLOCK */
13843+ return ret;
13844+
13845+ /* parent was found */
13846+ assert("jmacd-8661", other != NULL);
13847+ /* Duplicate the reference into the other flush_scan. */
13848+ coord_dup(&other->parent_coord, &scan->parent_coord);
13849+ copy_lh(&other->parent_lock, &scan->parent_lock);
13850+ copy_load_count(&other->parent_load, &scan->parent_load);
13851+ }
13852+ scan:
13853+ return scan_by_coord(scan);
13854+}
13855+
13856+/* Performs left- or rightward scanning starting from a formatted node. Follow left
13857+ pointers under tree lock as long as:
13858+
13859+ - node->left/right is non-NULL
13860+ - node->left/right is connected, dirty
13861+ - node->left/right belongs to the same atom
13862+ - scan has not reached maximum count
13863+*/
13864+static int scan_formatted(flush_scan * scan)
13865+{
13866+ int ret;
13867+ znode *neighbor = NULL;
13868+
13869+ assert("jmacd-1401", !reiser4_scan_finished(scan));
13870+
13871+ do {
13872+ znode *node = JZNODE(scan->node);
13873+
13874+		/* The node should be connected; if it is not, stop the scan. */
13875+ if (!znode_is_connected(node)) {
13876+ scan->stop = 1;
13877+ break;
13878+ }
13879+
13880+ /* Lock the tree, check-for and reference the next sibling. */
13881+ read_lock_tree(znode_get_tree(node));
13882+
13883+ /* It may be that a node is inserted or removed between a node and its
13884+ left sibling while the tree lock is released, but the flush-scan count
13885+ does not need to be precise. Thus, we release the tree lock as soon as
13886+ we get the neighboring node. */
13887+ neighbor =
13888+ reiser4_scanning_left(scan) ? node->left : node->right;
13889+ if (neighbor != NULL) {
13890+ zref(neighbor);
13891+ }
13892+
13893+ read_unlock_tree(znode_get_tree(node));
13894+
13895+		/* If neighbor is NULL at the leaf level, we need to check for an unformatted
13896+		   sibling using the parent; break in any case. */
13897+ if (neighbor == NULL) {
13898+ break;
13899+ }
13900+
13901+ /* Check the condition for going left, break if it is not met. This also
13902+ releases (jputs) the neighbor if false. */
13903+ if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) {
13904+ break;
13905+ }
13906+
13907+ /* Advance the flush_scan state to the left, repeat. */
13908+ ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
13909+ if (ret != 0) {
13910+ return ret;
13911+ }
13912+
13913+ } while (!reiser4_scan_finished(scan));
13914+
13915+	/* If neighbor is NULL then either we reached the end of a formatted region or
13916+	   the sibling is out of memory; now check for an extent to the left (only at
13917+	   LEAF_LEVEL). */
13918+ if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
13919+ || reiser4_scan_finished(scan)) {
13920+ scan->stop = 1;
13921+ return 0;
13922+ }
13923+	/* Otherwise, call scan_by_coord for the right(left)most item of the
13924+	   left(right) neighbor on the parent level, then possibly continue. */
13925+
13926+ coord_init_invalid(&scan->parent_coord, NULL);
13927+ return scan_unformatted(scan, NULL);
13928+}
13929+
13930+/* NOTE-EDWARD:
13931+   This scans adjacent items of the same type and calls the scan method of the flush
13932+   plugin for each one. It performs left(right)ward scanning starting from a (possibly)
13933+   unformatted node. If we start from an unformatted node, then we continue only if the
13934+   next neighbor is also unformatted. When called from scan_formatted, we skip the first
13935+   iteration (to make sure that the right(left)most item of the left(right) neighbor on
13936+   the parent level is of the same type, and to set the appropriate coord). */
13937+static int scan_by_coord(flush_scan * scan)
13938+{
13939+ int ret = 0;
13940+ int scan_this_coord;
13941+ lock_handle next_lock;
13942+ load_count next_load;
13943+ coord_t next_coord;
13944+ jnode *child;
13945+ item_plugin *iplug;
13946+
13947+ init_lh(&next_lock);
13948+ init_load_count(&next_load);
13949+ scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
13950+
13951+ /* set initial item id */
13952+ iplug = item_plugin_by_coord(&scan->parent_coord);
13953+
13954+ for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
13955+ if (scan_this_coord) {
13956+			/* Here we expect that the unit is scannable; it might
13957+			 * not be, due to a race with extent->tail conversion. */
13958+ if (iplug->f.scan == NULL) {
13959+ scan->stop = 1;
13960+ ret = -E_REPEAT;
13961+ /* skip the check at the end. */
13962+ goto race;
13963+ }
13964+
13965+ ret = iplug->f.scan(scan);
13966+ if (ret != 0)
13967+ goto exit;
13968+
13969+ if (reiser4_scan_finished(scan)) {
13970+ checkchild(scan);
13971+ break;
13972+ }
13973+ } else {
13974+ /* the same race against truncate as above is possible
13975+ * here, it seems */
13976+
13977+ /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
13978+ the first coordinate. */
13979+ assert("jmacd-1231",
13980+ item_is_internal(&scan->parent_coord));
13981+ }
13982+
13983+ if (iplug->f.utmost_child == NULL
13984+ || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
13985+			/* stop at this coord and continue on the parent level */
13986+ ret =
13987+ scan_set_current(scan,
13988+ ZJNODE(zref
13989+ (scan->parent_coord.node)),
13990+ 1, NULL);
13991+ if (ret != 0)
13992+ goto exit;
13993+ break;
13994+ }
13995+
13996+ /* Either way, the invariant is that scan->parent_coord is set to the
13997+ parent of scan->node. Now get the next unit. */
13998+ coord_dup(&next_coord, &scan->parent_coord);
13999+ coord_sideof_unit(&next_coord, scan->direction);
14000+
14001+ /* If off-the-end of the twig, try the next twig. */
14002+ if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
14003+ /* We take the write lock because we may start flushing from this
14004+ * coordinate. */
14005+ ret = neighbor_in_slum(next_coord.node,
14006+ &next_lock,
14007+ scan->direction,
14008+ ZNODE_WRITE_LOCK,
14009+ 1 /* check dirty */,
14010+					       0 /* don't go through upper
14011+						    levels */);
14012+ if (ret == -E_NO_NEIGHBOR) {
14013+ scan->stop = 1;
14014+ ret = 0;
14015+ break;
14016+ }
14017+
14018+ if (ret != 0) {
14019+ goto exit;
14020+ }
14021+
14022+ ret = incr_load_count_znode(&next_load, next_lock.node);
14023+ if (ret != 0) {
14024+ goto exit;
14025+ }
14026+
14027+ coord_init_sideof_unit(&next_coord, next_lock.node,
14028+ sideof_reverse(scan->direction));
14029+ }
14030+
14031+ iplug = item_plugin_by_coord(&next_coord);
14032+
14033+ /* Get the next child. */
14034+ ret =
14035+ iplug->f.utmost_child(&next_coord,
14036+ sideof_reverse(scan->direction),
14037+ &child);
14038+ if (ret != 0)
14039+ goto exit;
14040+		/* If the next child is not in memory, or item_utmost_child
14041+		   failed (most probably due to a race with unlink), stop
14042+		   here. */
14043+ if (child == NULL || IS_ERR(child)) {
14044+ scan->stop = 1;
14045+ checkchild(scan);
14046+ break;
14047+ }
14048+
14049+ assert("nikita-2374", jnode_is_unformatted(child)
14050+ || jnode_is_znode(child));
14051+
14052+ /* See if it is dirty, part of the same atom. */
14053+ if (!reiser4_scan_goto(scan, child)) {
14054+ checkchild(scan);
14055+ break;
14056+ }
14057+
14058+ /* If so, make this child current. */
14059+ ret = scan_set_current(scan, child, 1, &next_coord);
14060+ if (ret != 0)
14061+ goto exit;
14062+
14063+		/* Now continue. If the child is formatted we break out of the
14064+		   loop, release the parent lock below, and proceed. */
14065+ if (jnode_is_znode(child))
14066+ break;
14067+
14068+ /* Otherwise, repeat the above loop with next_coord. */
14069+ if (next_load.node != NULL) {
14070+ done_lh(&scan->parent_lock);
14071+ move_lh(&scan->parent_lock, &next_lock);
14072+ move_load_count(&scan->parent_load, &next_load);
14073+ }
14074+ }
14075+
14076+ assert("jmacd-6233",
14077+ reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
14078+ exit:
14079+ checkchild(scan);
14080+ race: /* skip the above check */
14081+ if (jnode_is_znode(scan->node)) {
14082+ done_lh(&scan->parent_lock);
14083+ done_load_count(&scan->parent_load);
14084+ }
14085+
14086+ done_load_count(&next_load);
14087+ done_lh(&next_lock);
14088+ return ret;
14089+}
14090+
14091+/* FLUSH POS HELPERS */
14092+
14093+/* Initialize the fields of a flush_position. */
14094+static void pos_init(flush_pos_t * pos)
14095+{
14096+ memset(pos, 0, sizeof *pos);
14097+
14098+ pos->state = POS_INVALID;
14099+ coord_init_invalid(&pos->coord, NULL);
14100+ init_lh(&pos->lock);
14101+ init_load_count(&pos->load);
14102+
14103+ reiser4_blocknr_hint_init(&pos->preceder);
14104+}
14105+
14106+/* The flush loop inside squalloc periodically checks pos_valid to
14107+ determine when "enough flushing" has been performed. This will return true until one
14108+ of the following conditions is met:
14109+
14110+ 1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
14111+ parameter, meaning we have flushed as many blocks as the kernel requested. When
14112+ flushing to commit, this parameter is NULL.
14113+
14114+ 2. pos_stop() is called because squalloc discovers that the "next" node in the
14115+   flush order is either non-existent, not dirty, or not in the same atom.
14116+*/
14117+
14118+static int pos_valid(flush_pos_t * pos)
14119+{
14120+ return pos->state != POS_INVALID;
14121+}
14122+
14123+/* Release any resources of a flush_position. Called when jnode_flush finishes. */
14124+static void pos_done(flush_pos_t * pos)
14125+{
14126+ pos_stop(pos);
14127+ reiser4_blocknr_hint_done(&pos->preceder);
14128+ if (convert_data(pos))
14129+ free_convert_data(pos);
14130+}
14131+
14132+/* Reset the point and parent. Called during flush subroutines to terminate the
14133+ squalloc loop. */
14134+static int pos_stop(flush_pos_t * pos)
14135+{
14136+ pos->state = POS_INVALID;
14137+ done_lh(&pos->lock);
14138+ done_load_count(&pos->load);
14139+ coord_init_invalid(&pos->coord, NULL);
14140+
14141+ if (pos->child) {
14142+ jput(pos->child);
14143+ pos->child = NULL;
14144+ }
14145+
14146+ return 0;
14147+}
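/* pos_valid()/pos_stop() give squalloc a simple cooperative termination
 * protocol: the driver loops while the position is valid, and any step may
 * invalidate it. A minimal sketch of that loop shape (toy names only): */
#include <stdio.h>

struct toy_flushpos { int valid; int steps; };

static void toy_pos_stop(struct toy_flushpos *p) { p->valid = 0; }

static void toy_step(struct toy_flushpos *p)
{
	if (++p->steps >= 3)	/* pretend the slum edge was reached */
		toy_pos_stop(p);
}

int main(void)
{
	struct toy_flushpos p = { 1, 0 };

	while (p.valid)
		toy_step(&p);
	printf("stopped after %d steps\n", p.steps);	/* 3 */
	return 0;
}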
14148+
14149+/* Return the flush_position's block allocator hint. */
14150+reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos)
14151+{
14152+ return &pos->preceder;
14153+}
14154+
14155+flush_queue_t * reiser4_pos_fq(flush_pos_t * pos)
14156+{
14157+ return pos->fq;
14158+}
14159+
14160+/* Make Linus happy.
14161+ Local variables:
14162+ c-indentation-style: "K&R"
14163+ mode-name: "LC"
14164+ c-basic-offset: 8
14165+ tab-width: 8
14166+ fill-column: 90
14167+ LocalWords: preceder
14168+ End:
14169+*/
14170diff -urN linux-2.6.24.orig/fs/reiser4/flush.h linux-2.6.24/fs/reiser4/flush.h
14171--- linux-2.6.24.orig/fs/reiser4/flush.h 1970-01-01 03:00:00.000000000 +0300
14172+++ linux-2.6.24/fs/reiser4/flush.h 2008-01-25 11:39:06.924204598 +0300
14173@@ -0,0 +1,290 @@
14174+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14175+
14176+/* DECLARATIONS: */
14177+
14178+#if !defined(__REISER4_FLUSH_H__)
14179+#define __REISER4_FLUSH_H__
14180+
14181+#include "plugin/cluster.h"
14182+
14183+/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
14184+ single level of the tree. A flush-scan is used for counting the number of adjacent
14185+ nodes to flush, which is used to determine whether we should relocate, and it is also
14186+ used to find a starting point for flush. A flush-scan object can scan in both right
14187+ and left directions via the scan_left() and scan_right() interfaces. The
14188+ right- and left-variations are similar but perform different functions. When scanning
14189+ left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
14190+ When scanning right we are simply counting the number of adjacent, dirty nodes. */
14191+struct flush_scan {
14192+
14193+ /* The current number of nodes scanned on this level. */
14194+ unsigned count;
14195+
14196+ /* There may be a maximum number of nodes for a scan on any single level. When
14197+ going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */
14198+ unsigned max_count;
14199+
14200+ /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
14201+ sideof direction;
14202+
14203+	/* Initially @stop is set to false, then set to true once some condition stops
14204+	   the search (e.g., we found a clean node before reaching max_count, or we
14205+	   found a node belonging to another atom). */
14206+ int stop;
14207+
14208+ /* The current scan position. If @node is non-NULL then its reference count has
14209+ been incremented to reflect this reference. */
14210+ jnode *node;
14211+
14212+ /* A handle for zload/zrelse of current scan position node. */
14213+ load_count node_load;
14214+
14215+ /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
14216+ node is locked using this lock handle. The endpoint needs to be locked for
14217+ transfer to the flush_position object after scanning finishes. */
14218+ lock_handle node_lock;
14219+
14220+ /* When the position is unformatted, its parent, coordinate, and parent
14221+ zload/zrelse handle. */
14222+ lock_handle parent_lock;
14223+ coord_t parent_coord;
14224+ load_count parent_load;
14225+
14226+ /* The block allocator preceder hint. Sometimes flush_scan determines what the
14227+ preceder is and if so it sets it here, after which it is copied into the
14228+ flush_position. Otherwise, the preceder is computed later. */
14229+ reiser4_block_nr preceder_blk;
14230+};
14231+
14232+struct convert_item_info {
14233+ dc_item_stat d_cur; /* disk cluster state of the current item */
14234+ dc_item_stat d_next; /* disk cluster state of the next slum item */
14235+ int cluster_shift; /* disk cluster shift */
14236+ flow_t flow; /* disk cluster data */
14237+};
14238+
14239+struct convert_info {
14240+ int count; /* for squalloc terminating */
14241+ item_plugin *iplug; /* current item plugin */
14242+ struct convert_item_info *itm; /* current item info */
14243+ struct cluster_handle clust; /* transform cluster */
14244+};
14245+
14246+typedef enum flush_position_state {
14247+ POS_INVALID, /* Invalid or stopped pos, do not continue slum
14248+ * processing */
14249+ POS_ON_LEAF, /* pos points to already prepped, locked formatted node at
14250+ * leaf level */
14251+ POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used
14252+ * to traverse unformatted nodes */
14253+ POS_TO_LEAF, /* pos is being moved to leaf level */
14254+ POS_TO_TWIG, /* pos is being moved to twig level */
14255+ POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is after
14256+ * rightmost unit of the current twig */
14257+ POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */
14258+} flushpos_state_t;
14259+
14260+/* An encapsulation of the current flush point and all the parameters that are passed
14261+ through the entire squeeze-and-allocate stage of the flush routine. A single
14262+ flush_position object is constructed after left- and right-scanning finishes. */
14263+struct flush_position {
14264+ flushpos_state_t state;
14265+
14266+ coord_t coord; /* coord to traverse unformatted nodes */
14267+ lock_handle lock; /* current lock we hold */
14268+ load_count load; /* load status for current locked formatted node */
14269+
14270+ jnode *child; /* for passing a reference to unformatted child
14271+ * across pos state changes */
14272+
14273+ reiser4_blocknr_hint preceder; /* The flush 'hint' state. */
14274+ int leaf_relocate; /* True if enough leaf-level nodes were
14275+ * found to suggest a relocate policy. */
14276+	int alloc_cnt;		/* The number of nodes allocated during squeeze and allocate. */
14277+ int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed. */
14278+ flush_queue_t *fq;
14279+ long *nr_written; /* number of nodes submitted to disk */
14280+ int flags; /* a copy of jnode_flush flags argument */
14281+
14282+ znode *prev_twig; /* previous parent pointer value, used to catch
14283+ * processing of new twig node */
14284+ struct convert_info *sq; /* convert info */
14285+
14286+ unsigned long pos_in_unit; /* for extents only. Position
14287+ within an extent unit of first
14288+ jnode of slum */
14289+ long nr_to_write; /* number of unformatted nodes to handle on flush */
14290+};
14291+
14292+static inline int item_convert_count(flush_pos_t * pos)
14293+{
14294+ return pos->sq->count;
14295+}
14296+static inline void inc_item_convert_count(flush_pos_t * pos)
14297+{
14298+ pos->sq->count++;
14299+}
14300+static inline void set_item_convert_count(flush_pos_t * pos, int count)
14301+{
14302+ pos->sq->count = count;
14303+}
14304+static inline item_plugin *item_convert_plug(flush_pos_t * pos)
14305+{
14306+ return pos->sq->iplug;
14307+}
14308+
14309+static inline struct convert_info *convert_data(flush_pos_t * pos)
14310+{
14311+ return pos->sq;
14312+}
14313+
14314+static inline struct convert_item_info *item_convert_data(flush_pos_t * pos)
14315+{
14316+ assert("edward-955", convert_data(pos));
14317+ return pos->sq->itm;
14318+}
14319+
14320+static inline struct tfm_cluster * tfm_cluster_sq(flush_pos_t * pos)
14321+{
14322+ return &pos->sq->clust.tc;
14323+}
14324+
14325+static inline struct tfm_stream * tfm_stream_sq(flush_pos_t * pos,
14326+ tfm_stream_id id)
14327+{
14328+ assert("edward-854", pos->sq != NULL);
14329+ return get_tfm_stream(tfm_cluster_sq(pos), id);
14330+}
14331+
14332+static inline int chaining_data_present(flush_pos_t * pos)
14333+{
14334+ return convert_data(pos) && item_convert_data(pos);
14335+}
14336+
14337+/* Returns true if the next node contains the next item of the disk cluster,
14338+   so the item convert data should be moved to the right slum neighbor.
14339+*/
14340+static inline int should_chain_next_node(flush_pos_t * pos)
14341+{
14342+ int result = 0;
14343+
14344+ assert("edward-1007", chaining_data_present(pos));
14345+
14346+ switch (item_convert_data(pos)->d_next) {
14347+ case DC_CHAINED_ITEM:
14348+ result = 1;
14349+ break;
14350+ case DC_AFTER_CLUSTER:
14351+ break;
14352+ default:
14353+ impossible("edward-1009", "bad state of next slum item");
14354+ }
14355+ return result;
14356+}
14357+
14358+/* update item state in a disk cluster to assign conversion mode */
14359+static inline void
14360+move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ )
14361+{
14362+
14363+ assert("edward-1010", chaining_data_present(pos));
14364+
14365+ if (this_node == 0) {
14366+ /* next item is on the right neighbor */
14367+ assert("edward-1011",
14368+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14369+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14370+ assert("edward-1012",
14371+ item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14372+
14373+ item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14374+ item_convert_data(pos)->d_next = DC_INVALID_STATE;
14375+ } else {
14376+ /* next item is on the same node */
14377+ assert("edward-1013",
14378+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14379+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14380+ assert("edward-1227",
14381+ item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14382+ item_convert_data(pos)->d_next == DC_INVALID_STATE);
14383+
14384+ item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14385+ item_convert_data(pos)->d_next = DC_INVALID_STATE;
14386+ }
14387+}
14388+
14389+static inline int should_convert_node(flush_pos_t * pos, znode * node)
14390+{
14391+ return znode_convertible(node);
14392+}
14393+
14394+/* true if there is attached convert item info */
14395+static inline int should_convert_next_node(flush_pos_t * pos)
14396+{
14397+ return convert_data(pos) && item_convert_data(pos);
14398+}
14399+
14400+#define SQUALLOC_THRESHOLD 256
14401+
14402+static inline int should_terminate_squalloc(flush_pos_t * pos)
14403+{
14404+ return convert_data(pos) &&
14405+ !item_convert_data(pos) &&
14406+ item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14407+}
14408+
14409+#if 1
14410+#define check_convert_info(pos) \
14411+do { \
14412+ if (unlikely(should_convert_next_node(pos))){ \
14413+ warning("edward-1006", "unprocessed chained data"); \
14414+ printk("d_cur = %d, d_next = %d, flow.len = %llu\n", \
14415+ item_convert_data(pos)->d_cur, \
14416+ item_convert_data(pos)->d_next, \
14417+ item_convert_data(pos)->flow.length); \
14418+ } \
14419+} while (0)
14420+#else
14421+#define check_convert_info(pos)
14422+#endif				/* check_convert_info */
14423+
14424+void free_convert_data(flush_pos_t * pos);
14425+/* used in extent.c */
14426+int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14427+ const coord_t * parent);
14428+int reiser4_scan_finished(flush_scan * scan);
14429+int reiser4_scanning_left(flush_scan * scan);
14430+int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
14431+txn_atom *atom_locked_by_fq(flush_queue_t * fq);
14432+int reiser4_alloc_extent(flush_pos_t *flush_pos);
14433+squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14434+ reiser4_key *stop_key);
14435+extern int reiser4_init_fqs(void);
14436+extern void reiser4_done_fqs(void);
14437+
14438+#if REISER4_DEBUG
14439+
14440+extern void reiser4_check_fq(const txn_atom *atom);
14441+extern atomic_t flush_cnt;
14442+
14443+#define check_preceder(blk) \
14444+assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb()));
14445+extern void check_pos(flush_pos_t * pos);
14446+#else
14447+#define check_preceder(b) noop
14448+#define check_pos(pos) noop
14449+#endif
14450+
14451+/* __REISER4_FLUSH_H__ */
14452+#endif
14453+
14454+/* Make Linus happy.
14455+ Local variables:
14456+ c-indentation-style: "K&R"
14457+ mode-name: "LC"
14458+ c-basic-offset: 8
14459+ tab-width: 8
14460+ fill-column: 90
14461+ LocalWords: preceder
14462+ End:
14463+*/
14464diff -urN linux-2.6.24.orig/fs/reiser4/flush_queue.c linux-2.6.24/fs/reiser4/flush_queue.c
14465--- linux-2.6.24.orig/fs/reiser4/flush_queue.c 1970-01-01 03:00:00.000000000 +0300
14466+++ linux-2.6.24/fs/reiser4/flush_queue.c 2008-01-25 11:54:46.665843146 +0300
14467@@ -0,0 +1,674 @@
14468+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14469+
14470+#include "debug.h"
14471+#include "super.h"
14472+#include "txnmgr.h"
14473+#include "jnode.h"
14474+#include "znode.h"
14475+#include "page_cache.h"
14476+#include "wander.h"
14477+#include "vfs_ops.h"
14478+#include "writeout.h"
14479+#include "flush.h"
14480+
14481+#include <linux/bio.h>
14482+#include <linux/mm.h>
14483+#include <linux/pagemap.h>
14484+#include <linux/blkdev.h>
14485+#include <linux/writeback.h>
14486+
14487+/* A flush queue object is an accumulator for keeping jnodes prepared
14488+   by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14489+   kept on the flush queue until memory pressure or atom commit asks
14490+   flush queues to write some or all of their jnodes. */
14491+
14492+/*
14493+ LOCKING:
14494+
14495+   The fq->guard spin lock protects the fq->atom pointer and nothing else. The
14496+   fq->prepped list is protected by the atom spin lock and uses the following
14497+   locking:
14498+
14499+   there are two ways to protect fq->prepped for read-only list traversal:
14500+
14501+   1. spin-lock the atom.
14502+   2. fq is IN_USE and atom->nr_running_queues has been increased.
14503+
14504+   and one for list modification:
14505+
14506+   1. the atom is spin-locked and one condition is true: fq is IN_USE or
14507+   atom->nr_running_queues == 0.
14508+
14509+ The deadlock-safe order for flush queues and atoms is: first lock atom, then
14510+ lock flush queue, then lock jnode.
14511+*/
14512+
14513+#define fq_in_use(fq) ((fq)->state & FQ_IN_USE)
14514+#define fq_ready(fq) (!fq_in_use(fq))
14515+
14516+#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0)
14517+#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0)
14518+
14519+/* get lock on atom from locked flush queue object */
14520+static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq)
14521+{
14522+	/* This code is similar to jnode_get_atom(); see that function for
14523+	 * an explanation. */
14524+ txn_atom *atom;
14525+
14526+ assert_spin_locked(&(fq->guard));
14527+
14528+ while (1) {
14529+ atom = fq->atom;
14530+ if (atom == NULL)
14531+ break;
14532+
14533+ if (spin_trylock_atom(atom))
14534+ break;
14535+
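+		/* the trylock failed: we hold fq->guard, but the safe order
+		 * is atom first, then fq->guard. Pin the atom with a
+		 * reference, drop fq->guard, take both locks in the safe
+		 * order, and re-check that fq->atom did not change under
+		 * us. */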
14536+ atomic_inc(&atom->refcount);
14537+ spin_unlock(&(fq->guard));
14538+ spin_lock_atom(atom);
14539+ spin_lock(&(fq->guard));
14540+
14541+ if (fq->atom == atom) {
14542+ atomic_dec(&atom->refcount);
14543+ break;
14544+ }
14545+
14546+ spin_unlock(&(fq->guard));
14547+ atom_dec_and_unlock(atom);
14548+ spin_lock(&(fq->guard));
14549+ }
14550+
14551+ return atom;
14552+}
14553+
14554+txn_atom *atom_locked_by_fq(flush_queue_t * fq)
14555+{
14556+ txn_atom *atom;
14557+
14558+ spin_lock(&(fq->guard));
14559+ atom = atom_locked_by_fq_nolock(fq);
14560+ spin_unlock(&(fq->guard));
14561+ return atom;
14562+}
14563+
14564+static void init_fq(flush_queue_t * fq)
14565+{
14566+ memset(fq, 0, sizeof *fq);
14567+
14568+ atomic_set(&fq->nr_submitted, 0);
14569+
14570+ INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
14571+
14572+ init_waitqueue_head(&fq->wait);
14573+ spin_lock_init(&fq->guard);
14574+}
14575+
14576+/* slab for flush queues */
14577+static struct kmem_cache *fq_slab;
14578+
14579+/**
14580+ * reiser4_init_fqs - create flush queue cache
14581+ *
14582+ * Initializes slab cache of flush queues. It is part of reiser4 module
14583+ * initialization.
14584+ */
14585+int reiser4_init_fqs(void)
14586+{
14587+ fq_slab = kmem_cache_create("fq",
14588+ sizeof(flush_queue_t),
14589+ 0, SLAB_HWCACHE_ALIGN, NULL);
14590+ if (fq_slab == NULL)
14591+ return RETERR(-ENOMEM);
14592+ return 0;
14593+}
14594+
14595+/**
14596+ * reiser4_done_fqs - delete flush queue cache
14597+ *
14598+ * This is called on reiser4 module unloading or system shutdown.
14599+ */
14600+void reiser4_done_fqs(void)
14601+{
14602+ destroy_reiser4_cache(&fq_slab);
14603+}
14604+
14605+/* create new flush queue object */
14606+static flush_queue_t *create_fq(gfp_t gfp)
14607+{
14608+ flush_queue_t *fq;
14609+
14610+ fq = kmem_cache_alloc(fq_slab, gfp);
14611+ if (fq)
14612+ init_fq(fq);
14613+
14614+ return fq;
14615+}
14616+
14617+/* adjust atom's (debug-only) counter of queued nodes */
14618+static void count_enqueued_node(flush_queue_t * fq)
14619+{
14620+ ON_DEBUG(fq->atom->num_queued++);
14621+}
14622+
14623+static void count_dequeued_node(flush_queue_t * fq)
14624+{
14625+ assert("zam-993", fq->atom->num_queued > 0);
14626+ ON_DEBUG(fq->atom->num_queued--);
14627+}
14628+
14629+/* attach flush queue object to the atom */
14630+static void attach_fq(txn_atom *atom, flush_queue_t *fq)
14631+{
14632+ assert_spin_locked(&(atom->alock));
14633+ list_add(&fq->alink, &atom->flush_queues);
14634+ fq->atom = atom;
14635+ ON_DEBUG(atom->nr_flush_queues++);
14636+}
14637+
14638+static void detach_fq(flush_queue_t * fq)
14639+{
14640+ assert_spin_locked(&(fq->atom->alock));
14641+
14642+ spin_lock(&(fq->guard));
14643+ list_del_init(&fq->alink);
14644+ assert("vs-1456", fq->atom->nr_flush_queues > 0);
14645+ ON_DEBUG(fq->atom->nr_flush_queues--);
14646+ fq->atom = NULL;
14647+ spin_unlock(&(fq->guard));
14648+}
14649+
14650+/* destroy flush queue object */
14651+static void done_fq(flush_queue_t * fq)
14652+{
14653+ assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
14654+ assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
14655+
14656+ kmem_cache_free(fq_slab, fq);
14657+}
14658+
14659+/* mark jnode as queued and update the debug counter */
14660+static void mark_jnode_queued(flush_queue_t * fq, jnode * node)
14661+{
14662+ JF_SET(node, JNODE_FLUSH_QUEUED);
14663+ count_enqueued_node(fq);
14664+}
14665+
14666+/* Putting jnode into the flush queue. Both atom and jnode should be
14667+ spin-locked. */
14668+void queue_jnode(flush_queue_t * fq, jnode * node)
14669+{
14670+ assert_spin_locked(&(node->guard));
14671+ assert("zam-713", node->atom != NULL);
14672+ assert_spin_locked(&(node->atom->alock));
14673+ assert("zam-716", fq->atom != NULL);
14674+ assert("zam-717", fq->atom == node->atom);
14675+ assert("zam-907", fq_in_use(fq));
14676+
14677+ assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
14678+ assert("zam-826", JF_ISSET(node, JNODE_RELOC));
14679+ assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
14680+ assert("vs-1481", NODE_LIST(node) != FQ_LIST);
14681+
14682+ mark_jnode_queued(fq, node);
14683+ list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
14684+
14685+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
14686+ FQ_LIST, 1));
14687+}
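+
+/* For illustration (a caller-side sketch under the locking rules above,
+ * not a quote from a real call site): with @fq obtained from
+ * reiser4_fq_by_atom() and therefore IN_USE, a flusher queues a relocated
+ * dirty jnode roughly as:
+ *
+ *	spin_lock_atom(atom);
+ *	spin_lock_jnode(node);
+ *	queue_jnode(fq, node);
+ *	spin_unlock_jnode(node);
+ *	spin_unlock_atom(atom);
+ */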
14688+
14689+/* repeatable step of waiting for i/o completion on a flush queue object */
14690+static int wait_io(flush_queue_t * fq, int *nr_io_errors)
14691+{
14692+ assert("zam-738", fq->atom != NULL);
14693+ assert_spin_locked(&(fq->atom->alock));
14694+ assert("zam-736", fq_in_use(fq));
14695+ assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
14696+
14697+ if (atomic_read(&fq->nr_submitted) != 0) {
14698+ struct super_block *super;
14699+
14700+ spin_unlock_atom(fq->atom);
14701+
14702+ assert("nikita-3013", reiser4_schedulable());
14703+
14704+ super = reiser4_get_current_sb();
14705+
14706+ /* FIXME: this is instead of blk_run_queues() */
14707+ blk_run_address_space(reiser4_get_super_fake(super)->i_mapping);
14708+
14709+ if (!(super->s_flags & MS_RDONLY))
14710+ wait_event(fq->wait, atomic_read(&fq->nr_submitted) == 0);
14711+
14712+ /* Ask the caller to re-acquire the locks and call this
14713+ function again. Note: this technique is commonly used in
14714+ the txnmgr code. */
14715+ return -E_REPEAT;
14716+ }
14717+
14718+ *nr_io_errors += atomic_read(&fq->nr_errors);
14719+ return 0;
14720+}
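+
+/* Callers of wait_io() are expected to loop, re-acquiring the atom lock on
+ * each -E_REPEAT pass; current_atom_finish_all_fq() below shows the
+ * canonical form of that loop. */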
14721+
14722+/* wait on I/O completion, re-submit dirty nodes to write */
14723+static int finish_fq(flush_queue_t * fq, int *nr_io_errors)
14724+{
14725+ int ret;
14726+ txn_atom *atom = fq->atom;
14727+
14728+ assert("zam-801", atom != NULL);
14729+ assert_spin_locked(&(atom->alock));
14730+ assert("zam-762", fq_in_use(fq));
14731+
14732+ ret = wait_io(fq, nr_io_errors);
14733+ if (ret)
14734+ return ret;
14735+
14736+ detach_fq(fq);
14737+ done_fq(fq);
14738+
14739+ reiser4_atom_send_event(atom);
14740+
14741+ return 0;
14742+}
14743+
14744+/* wait for all i/o for the given atom to be completed; actually does one
14745+ iteration of that and returns -E_REPEAT if more iterations are needed */
14746+static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
14747+{
14748+ flush_queue_t *fq;
14749+
14750+ assert_spin_locked(&(atom->alock));
14751+
14752+ if (list_empty_careful(&atom->flush_queues))
14753+ return 0;
14754+
14755+ list_for_each_entry(fq, &atom->flush_queues, alink) {
14756+ if (fq_ready(fq)) {
14757+ int ret;
14758+
14759+ mark_fq_in_use(fq);
14760+ assert("vs-1247", fq->owner == NULL);
14761+ ON_DEBUG(fq->owner = current);
14762+ ret = finish_fq(fq, nr_io_errors);
14763+
14764+ if (*nr_io_errors)
14765+ reiser4_handle_error();
14766+
14767+ if (ret) {
14768+ reiser4_fq_put(fq);
14769+ return ret;
14770+ }
14771+
14772+ spin_unlock_atom(atom);
14773+
14774+ return -E_REPEAT;
14775+ }
14776+ }
14777+
14778+ /* All flush queues are in use; atom remains locked */
14779+ return -EBUSY;
14780+}
14781+
14782+/* wait for all i/o for the current atom */
14783+int current_atom_finish_all_fq(void)
14784+{
14785+ txn_atom *atom;
14786+ int nr_io_errors = 0;
14787+ int ret = 0;
14788+
14789+ do {
14790+ while (1) {
14791+ atom = get_current_atom_locked();
14792+ ret = finish_all_fq(atom, &nr_io_errors);
14793+ if (ret != -EBUSY)
14794+ break;
14795+ reiser4_atom_wait_event(atom);
14796+ }
14797+ } while (ret == -E_REPEAT);
14798+
14799+	/* we do not need the atom locked after this function finishes;
14800+	 SUCCESS and -EBUSY are the two return codes after which
14801+	 finish_all_fq leaves the atom locked */
14802+ if (!ret)
14803+ spin_unlock_atom(atom);
14804+
14805+ assert_spin_not_locked(&(atom->alock));
14806+
14807+ if (ret)
14808+ return ret;
14809+
14810+ if (nr_io_errors)
14811+ return RETERR(-EIO);
14812+
14813+ return 0;
14814+}
14815+
14816+/* change the node->atom field for all jnodes on the given list */
14817+static void
14818+scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
14819+{
14820+ jnode *cur;
14821+
14822+ list_for_each_entry(cur, list, capture_link) {
14823+ spin_lock_jnode(cur);
14824+ cur->atom = atom;
14825+ spin_unlock_jnode(cur);
14826+ }
14827+}
14828+
14829+/* support for atom fusion operation */
14830+void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
14831+{
14832+ flush_queue_t *fq;
14833+
14834+ assert_spin_locked(&(to->alock));
14835+ assert_spin_locked(&(from->alock));
14836+
14837+ list_for_each_entry(fq, &from->flush_queues, alink) {
14838+ scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
14839+ spin_lock(&(fq->guard));
14840+ fq->atom = to;
14841+ spin_unlock(&(fq->guard));
14842+ }
14843+
14844+ list_splice_init(&from->flush_queues, to->flush_queues.prev);
14845+
14846+#if REISER4_DEBUG
14847+ to->num_queued += from->num_queued;
14848+ to->nr_flush_queues += from->nr_flush_queues;
14849+ from->nr_flush_queues = 0;
14850+#endif
14851+}
14852+
14853+#if REISER4_DEBUG
14854+int atom_fq_parts_are_clean(txn_atom * atom)
14855+{
14856+ assert("zam-915", atom != NULL);
14857+ return list_empty_careful(&atom->flush_queues);
14858+}
14859+#endif
14860+/* Bio i/o completion routine for reiser4 write operations. */
14861+static void
14862+end_io_handler(struct bio *bio, int err)
14863+{
14864+ int i;
14865+ int nr_errors = 0;
14866+ flush_queue_t *fq;
14867+
14868+ assert("zam-958", bio->bi_rw & WRITE);
14869+
14870+ if (err == -EOPNOTSUPP)
14871+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
14872+
14873+	/* we expect that bio->bi_private is set either to NULL or to an fq
14874+	 * object, which is used for synchronization and error counting. */
14875+ fq = bio->bi_private;
14876+ /* Check all elements of io_vec for correct write completion. */
14877+ for (i = 0; i < bio->bi_vcnt; i += 1) {
14878+ struct page *pg = bio->bi_io_vec[i].bv_page;
14879+
14880+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
14881+ SetPageError(pg);
14882+ nr_errors++;
14883+ }
14884+
14885+ {
14886+ /* jnode WRITEBACK ("write is in progress bit") is
14887+ * atomically cleared here. */
14888+ jnode *node;
14889+
14890+ assert("zam-736", pg != NULL);
14891+ assert("zam-736", PagePrivate(pg));
14892+ node = jprivate(pg);
14893+
14894+ JF_CLR(node, JNODE_WRITEBACK);
14895+ }
14896+
14897+ end_page_writeback(pg);
14898+ page_cache_release(pg);
14899+ }
14900+
14901+ if (fq) {
14902+ /* count i/o error in fq object */
14903+ atomic_add(nr_errors, &fq->nr_errors);
14904+
14905+		/* If all write requests registered in this "fq" are done, wake
14906+		 * up the waiter. */
14907+ if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
14908+ wake_up(&fq->wait);
14909+ }
14910+
14911+ bio_put(bio);
14912+}
14913+
14914+/* Count the i/o requests which will be submitted by @bio in the given flush
14915+ queue @fq */
14916+void add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
14917+{
14918+ bio->bi_private = fq;
14919+ bio->bi_end_io = end_io_handler;
14920+
14921+ if (fq)
14922+ atomic_add(bio->bi_vcnt, &fq->nr_submitted);
14923+}
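+
+/* The intended pairing, sketched (in this patch the bios are actually built
+ * and submitted by the write-out code that calls add_fq_to_bio()):
+ *
+ *	add_fq_to_bio(fq, bio);		adds bio->bi_vcnt to fq->nr_submitted
+ *					and sets bi_end_io = end_io_handler
+ *	submit_bio(WRITE, bio);		end_io_handler() then subtracts
+ *					bi_vcnt again when the i/o completes
+ */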
14924+
14925+/* Move all queued nodes out from @fq->prepped list. */
14926+static void release_prepped_list(flush_queue_t * fq)
14927+{
14928+ txn_atom *atom;
14929+
14930+ assert("zam-904", fq_in_use(fq));
14931+ atom = atom_locked_by_fq(fq);
14932+
14933+ while (!list_empty(ATOM_FQ_LIST(fq))) {
14934+ jnode *cur;
14935+
14936+ cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
14937+ list_del_init(&cur->capture_link);
14938+
14939+ count_dequeued_node(fq);
14940+ spin_lock_jnode(cur);
14941+ assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
14942+ assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
14943+ assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
14944+ JF_CLR(cur, JNODE_FLUSH_QUEUED);
14945+
14946+ if (JF_ISSET(cur, JNODE_DIRTY)) {
14947+ list_add_tail(&cur->capture_link,
14948+ ATOM_DIRTY_LIST(atom, jnode_get_level(cur)));
14949+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14950+ DIRTY_LIST, 1));
14951+ } else {
14952+ list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom));
14953+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14954+ CLEAN_LIST, 1));
14955+ }
14956+
14957+ spin_unlock_jnode(cur);
14958+ }
14959+
14960+ if (--atom->nr_running_queues == 0)
14961+ reiser4_atom_send_event(atom);
14962+
14963+ spin_unlock_atom(atom);
14964+}
14965+
14966+/* Submit write requests for nodes on the already filled flush queue @fq.
14967+
14968+ @fq: flush queue object which contains jnodes we can (and will) write.
14969+ @nr_submitted: the number of submitted blocks is returned here.
14970+ @return: 0 on success, otherwise an error code (<0). */
14971+int reiser4_write_fq(flush_queue_t * fq, long *nr_submitted, int flags)
14972+{
14973+ int ret;
14974+ txn_atom *atom;
14975+
14976+ while (1) {
14977+ atom = atom_locked_by_fq(fq);
14978+ assert("zam-924", atom);
14979+ /* do not write fq in parallel. */
14980+ if (atom->nr_running_queues == 0
14981+ || !(flags & WRITEOUT_SINGLE_STREAM))
14982+ break;
14983+ reiser4_atom_wait_event(atom);
14984+ }
14985+
14986+ atom->nr_running_queues++;
14987+ spin_unlock_atom(atom);
14988+
14989+ ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
14990+ release_prepped_list(fq);
14991+
14992+ return ret;
14993+}
14994+
14995+/* Get a flush queue object for exclusive use by one thread. May require
14996+ several iterations, which is indicated by the -E_REPEAT return code.
14997+
14998+ This function does not contain code for obtaining an atom lock because the
14999+ atom lock is obtained in different ways in different parts of reiser4;
15000+ usually it is the current atom, but we also need the ability to get an fq
15001+ for the atom of a given jnode. */
15002+static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
15003+{
15004+ flush_queue_t *fq;
15005+
15006+ assert_spin_locked(&(atom->alock));
15007+
15008+ fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
15009+ while (&atom->flush_queues != &fq->alink) {
15010+ spin_lock(&(fq->guard));
15011+
15012+ if (fq_ready(fq)) {
15013+ mark_fq_in_use(fq);
15014+ assert("vs-1246", fq->owner == NULL);
15015+ ON_DEBUG(fq->owner = current);
15016+ spin_unlock(&(fq->guard));
15017+
15018+ if (*new_fq)
15019+ done_fq(*new_fq);
15020+
15021+ *new_fq = fq;
15022+
15023+ return 0;
15024+ }
15025+
15026+ spin_unlock(&(fq->guard));
15027+
15028+ fq = list_entry(fq->alink.next, flush_queue_t, alink);
15029+ }
15030+
15031+ /* Use previously allocated fq object */
15032+ if (*new_fq) {
15033+ mark_fq_in_use(*new_fq);
15034+		assert("vs-1248", (*new_fq)->owner == NULL);
15035+ ON_DEBUG((*new_fq)->owner = current);
15036+ attach_fq(atom, *new_fq);
15037+
15038+ return 0;
15039+ }
15040+
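+	/* no ready fq was found and no preallocated one was passed in: drop
+	 * the atom lock (allocation may sleep), allocate a fresh fq, and ask
+	 * the caller to retry via -E_REPEAT */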
15041+ spin_unlock_atom(atom);
15042+
15043+ *new_fq = create_fq(gfp);
15044+
15045+ if (*new_fq == NULL)
15046+ return RETERR(-ENOMEM);
15047+
15048+ return RETERR(-E_REPEAT);
15049+}
15050+
15051+int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
15052+{
15053+ return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
15054+}
15055+
15056+/* A wrapper around reiser4_fq_by_atom for getting a flush queue
15057+ object for the current atom; on success fq->atom remains locked. */
15058+flush_queue_t *get_fq_for_current_atom(void)
15059+{
15060+ flush_queue_t *fq = NULL;
15061+ txn_atom *atom;
15062+ int ret;
15063+
15064+ do {
15065+ atom = get_current_atom_locked();
15066+ ret = reiser4_fq_by_atom(atom, &fq);
15067+ } while (ret == -E_REPEAT);
15068+
15069+ if (ret)
15070+ return ERR_PTR(ret);
15071+ return fq;
15072+}
15073+
15074+/* Releasing flush queue object after exclusive use */
15075+void reiser4_fq_put_nolock(flush_queue_t *fq)
15076+{
15077+ assert("zam-747", fq->atom != NULL);
15078+ assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
15079+ mark_fq_ready(fq);
15080+ assert("vs-1245", fq->owner == current);
15081+ ON_DEBUG(fq->owner = NULL);
15082+}
15083+
15084+void reiser4_fq_put(flush_queue_t * fq)
15085+{
15086+ txn_atom *atom;
15087+
15088+ spin_lock(&(fq->guard));
15089+ atom = atom_locked_by_fq_nolock(fq);
15090+
15091+ assert("zam-746", atom != NULL);
15092+
15093+ reiser4_fq_put_nolock(fq);
15094+ reiser4_atom_send_event(atom);
15095+
15096+ spin_unlock(&(fq->guard));
15097+ spin_unlock_atom(atom);
15098+}
15099+
15100+/* A part of atom object initialization related to the embedded flush queue
15101+ list head */
15102+
15103+void init_atom_fq_parts(txn_atom *atom)
15104+{
15105+ INIT_LIST_HEAD(&atom->flush_queues);
15106+}
15107+
15108+#if REISER4_DEBUG
15109+
15110+void reiser4_check_fq(const txn_atom *atom)
15111+{
15112+ /* check number of nodes on all atom's flush queues */
15113+ flush_queue_t *fq;
15114+ int count;
15115+ struct list_head *pos;
15116+
15117+ count = 0;
15118+ list_for_each_entry(fq, &atom->flush_queues, alink) {
15119+ spin_lock(&(fq->guard));
15120+		/* calculate the number of jnodes on fq's list of prepped jnodes */
15121+ list_for_each(pos, ATOM_FQ_LIST(fq))
15122+ count++;
15123+ spin_unlock(&(fq->guard));
15124+ }
15125+ if (count != atom->fq)
15126+ warning("", "fq counter %d, real %d\n", atom->fq, count);
15127+
15128+}
15129+
15130+#endif
15131+
15132+/*
15133+ * Local variables:
15134+ * c-indentation-style: "K&R"
15135+ * mode-name: "LC"
15136+ * c-basic-offset: 8
15137+ * tab-width: 8
15138+ * fill-column: 79
15139+ * scroll-step: 1
15140+ * End:
15141+ */
15142diff -urN linux-2.6.24.orig/fs/reiser4/forward.h linux-2.6.24/fs/reiser4/forward.h
15143--- linux-2.6.24.orig/fs/reiser4/forward.h 1970-01-01 03:00:00.000000000 +0300
15144+++ linux-2.6.24/fs/reiser4/forward.h 2008-01-25 11:39:06.928205628 +0300
15145@@ -0,0 +1,252 @@
15146+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15147+
15148+/* Forward declarations. Thank you Kernighan. */
15149+
15150+#if !defined( __REISER4_FORWARD_H__ )
15151+#define __REISER4_FORWARD_H__
15152+
15153+#include <asm/errno.h>
15154+#include <linux/types.h>
15155+
15156+typedef struct zlock zlock;
15157+typedef struct lock_stack lock_stack;
15158+typedef struct lock_handle lock_handle;
15159+typedef struct znode znode;
15160+typedef struct flow flow_t;
15161+typedef struct coord coord_t;
15162+typedef struct tree_access_pointer tap_t;
15163+typedef struct reiser4_object_create_data reiser4_object_create_data;
15164+typedef union reiser4_plugin reiser4_plugin;
15165+typedef __u16 reiser4_plugin_id;
15166+typedef __u64 reiser4_plugin_groups;
15167+typedef struct item_plugin item_plugin;
15168+typedef struct jnode_plugin jnode_plugin;
15169+typedef struct reiser4_item_data reiser4_item_data;
15170+typedef union reiser4_key reiser4_key;
15171+typedef struct reiser4_tree reiser4_tree;
15172+typedef struct carry_cut_data carry_cut_data;
15173+typedef struct carry_kill_data carry_kill_data;
15174+typedef struct carry_tree_op carry_tree_op;
15175+typedef struct carry_tree_node carry_tree_node;
15176+typedef struct carry_plugin_info carry_plugin_info;
15177+typedef struct reiser4_journal reiser4_journal;
15178+typedef struct txn_atom txn_atom;
15179+typedef struct txn_handle txn_handle;
15180+typedef struct txn_mgr txn_mgr;
15181+typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15182+typedef struct reiser4_context reiser4_context;
15183+typedef struct carry_level carry_level;
15184+typedef struct blocknr_set_entry blocknr_set_entry;
15185+/* super_block->s_fs_info points to this */
15186+typedef struct reiser4_super_info_data reiser4_super_info_data;
15187+/* next two objects are fields of reiser4_super_info_data */
15188+typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15189+typedef struct reiser4_space_allocator reiser4_space_allocator;
15190+
15191+typedef struct flush_scan flush_scan;
15192+typedef struct flush_position flush_pos_t;
15193+
15194+typedef unsigned short pos_in_node_t;
15195+#define MAX_POS_IN_NODE 65535
15196+
15197+typedef struct jnode jnode;
15198+typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15199+
15200+typedef struct uf_coord uf_coord_t;
15201+typedef struct hint hint_t;
15202+
15203+typedef struct ktxnmgrd_context ktxnmgrd_context;
15204+
15205+struct inode;
15206+struct page;
15207+struct file;
15208+struct dentry;
15209+struct super_block;
15210+
15211+/* return values of coord_by_key(). cbk == coord_by_key */
15212+typedef enum {
15213+ CBK_COORD_FOUND = 0,
15214+ CBK_COORD_NOTFOUND = -ENOENT,
15215+} lookup_result;
15216+
15217+/* results of lookup with directory file */
15218+typedef enum {
15219+ FILE_NAME_FOUND = 0,
15220+ FILE_NAME_NOTFOUND = -ENOENT,
15221+ FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15222+ FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15223+} file_lookup_result;
15224+
15225+/* behaviors of lookup. If the coord we are looking for is actually in the
15226+ tree, both biases coincide. */
15227+typedef enum {
15228+ /* search exactly for the coord with key given */
15229+ FIND_EXACT,
15230+ /* search for coord with the maximal key not greater than one
15231+ given */
15232+ FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */
15233+} lookup_bias;
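+
+/* For example (an illustration): looking up key 10 in a tree that contains
+ * only keys 5 and 15, FIND_EXACT yields CBK_COORD_NOTFOUND, while
+ * FIND_MAX_NOT_MORE_THAN positions the coord at the item with key 5. */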
15234+
15235+typedef enum {
15236+	/* number of the leaf level of the tree.
15237+	   The fake root has tree_level == 0. */
15238+	LEAF_LEVEL = 1,
15239+
15240+	/* number of the level one above the leaf level of the tree.
15241+
15242+	   It is supposed that the internal tree used by reiser4 to store
15243+	   file system data and metadata will have height 2 initially (when
15244+	   created by mkfs).
15245+ */
15246+ TWIG_LEVEL = 2,
15247+} tree_level;
15248+
15249+/* The "real" maximum ztree height is the 0-origin size of any per-level
15250+ array, since the zero'th level is not used. */
15251+#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15252+
15253+/* enumeration of possible mutual positions of item and coord. This enum is
15254+ the return type of the ->is_in_item() item plugin method. */
15255+typedef enum {
15256+ /* coord is on the left of an item */
15257+ IP_ON_THE_LEFT,
15258+ /* coord is inside item */
15259+ IP_INSIDE,
15260+ /* coord is inside item, but to the right of the rightmost unit of
15261+ this item */
15262+ IP_RIGHT_EDGE,
15263+ /* coord is on the right of an item */
15264+ IP_ON_THE_RIGHT
15265+} interposition;
15266+
15267+/* type of lock to acquire on znode before returning it to caller */
15268+typedef enum {
15269+ ZNODE_NO_LOCK = 0,
15270+ ZNODE_READ_LOCK = 1,
15271+ ZNODE_WRITE_LOCK = 2,
15272+} znode_lock_mode;
15273+
15274+/* type of lock request */
15275+typedef enum {
15276+ ZNODE_LOCK_LOPRI = 0,
15277+ ZNODE_LOCK_HIPRI = (1 << 0),
15278+
15279+ /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
15280+ waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately
15281+ return the value -E_REPEAT. */
15282+ ZNODE_LOCK_NONBLOCK = (1 << 1),
15283+ /* An option for longterm_lock_znode which prevents atom fusion */
15284+ ZNODE_LOCK_DONT_FUSE = (1 << 2)
15285+} znode_lock_request;
15286+
15287+typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15288+
15289+/* used to specify the direction of a shift. These must be -1 and 1 */
15290+typedef enum {
15291+ SHIFT_LEFT = 1,
15292+ SHIFT_RIGHT = -1
15293+} shift_direction;
15294+
15295+typedef enum {
15296+ LEFT_SIDE,
15297+ RIGHT_SIDE
15298+} sideof;
15299+
15300+#define round_up( value, order ) \
15301+ ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \
15302+ ~( ( order ) - 1 ) ) )
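+
+/* Worked example (assuming @order is a power of two, which the bit mask
+ * requires): round_up(1000, 512) == 1024 and round_up(1024, 512) == 1024. */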
15303+
15304+/* values returned by squalloc_right_neighbor and its auxiliary functions */
15305+typedef enum {
15306+ /* unit of internal item is moved */
15307+ SUBTREE_MOVED = 0,
15308+ /* nothing else can be squeezed into left neighbor */
15309+ SQUEEZE_TARGET_FULL = 1,
15310+ /* all content of node is squeezed into its left neighbor */
15311+ SQUEEZE_SOURCE_EMPTY = 2,
15312+	/* one more item is copied (this is only returned by
15313+	   allocate_and_copy_extent to squalloc_twig) */
15314+ SQUEEZE_CONTINUE = 3
15315+} squeeze_result;
15316+
15317+/* Do not change item ids. If you do, there will be a format change. */
15318+typedef enum {
15319+ STATIC_STAT_DATA_ID = 0x0,
15320+ SIMPLE_DIR_ENTRY_ID = 0x1,
15321+ COMPOUND_DIR_ID = 0x2,
15322+ NODE_POINTER_ID = 0x3,
15323+ EXTENT_POINTER_ID = 0x5,
15324+ FORMATTING_ID = 0x6,
15325+ CTAIL_ID = 0x7,
15326+ BLACK_BOX_ID = 0x8,
15327+ LAST_ITEM_ID = 0x9
15328+} item_id;
15329+
15330+/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
15331+ whether commit() was called or VM memory pressure was applied. */
15332+typedef enum {
15333+ /* submit flush queue to disk at jnode_flush completion */
15334+ JNODE_FLUSH_WRITE_BLOCKS = 1,
15335+
15336+ /* flush is called for commit */
15337+ JNODE_FLUSH_COMMIT = 2,
15338+ /* not implemented */
15339+ JNODE_FLUSH_MEMORY_FORMATTED = 4,
15340+
15341+ /* not implemented */
15342+ JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15343+} jnode_flush_flags;
15344+
15345+/* Flags to insert/paste carry operations. Currently they are only used in
15346+ the flushing code, but in the future they can be used to optimize for
15347+ repetitive accesses. */
15348+typedef enum {
15349+ /* carry is not allowed to shift data to the left when trying to find
15350+ free space */
15351+ COPI_DONT_SHIFT_LEFT = (1 << 0),
15352+ /* carry is not allowed to shift data to the right when trying to find
15353+ free space */
15354+ COPI_DONT_SHIFT_RIGHT = (1 << 1),
15355+ /* carry is not allowed to allocate new node(s) when trying to find
15356+ free space */
15357+ COPI_DONT_ALLOCATE = (1 << 2),
15358+	/* try to load the left neighbor if it is not in the cache */
15359+	COPI_LOAD_LEFT = (1 << 3),
15360+	/* try to load the right neighbor if it is not in the cache */
15361+ COPI_LOAD_RIGHT = (1 << 4),
15362+ /* shift insertion point to the left neighbor */
15363+ COPI_GO_LEFT = (1 << 5),
15364+ /* shift insertion point to the right neighbor */
15365+ COPI_GO_RIGHT = (1 << 6),
15366+ /* try to step back into original node if insertion into new node
15367+ fails after shifting data there. */
15368+ COPI_STEP_BACK = (1 << 7)
15369+} cop_insert_flag;
15370+
15371+typedef enum {
15372+ SAFE_UNLINK, /* safe-link for unlink */
15373+ SAFE_TRUNCATE /* safe-link for truncate */
15374+} reiser4_safe_link_t;
15375+
15376+/* this shows on which of the atom's lists a jnode is */
15377+typedef enum {
15378+ NOT_CAPTURED,
15379+ DIRTY_LIST,
15380+ CLEAN_LIST,
15381+ FQ_LIST,
15382+ WB_LIST,
15383+ OVRWR_LIST
15384+} atom_list;
15385+
15386+/* __REISER4_FORWARD_H__ */
15387+#endif
15388+
15389+/* Make Linus happy.
15390+ Local variables:
15391+ c-indentation-style: "K&R"
15392+ mode-name: "LC"
15393+ c-basic-offset: 8
15394+ tab-width: 8
15395+ fill-column: 120
15396+ End:
15397+*/
15398diff -urN linux-2.6.24.orig/fs/reiser4/fsdata.c linux-2.6.24/fs/reiser4/fsdata.c
15399--- linux-2.6.24.orig/fs/reiser4/fsdata.c 1970-01-01 03:00:00.000000000 +0300
15400+++ linux-2.6.24/fs/reiser4/fsdata.c 2008-01-25 11:39:06.928205628 +0300
15401@@ -0,0 +1,804 @@
15402+/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15403+ * reiser4/README */
15404+
15405+#include "fsdata.h"
15406+#include "inode.h"
15407+
15408+
15409+/* cache of dir_cursors */
15410+static struct kmem_cache *d_cursor_cache;
15411+
15412+/* list of unused cursors */
15413+static LIST_HEAD(cursor_cache);
15414+
15415+/* number of cursors on the list of unused cursors */
15416+static unsigned long d_cursor_unused = 0;
15417+
15418+/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15419+DEFINE_SPINLOCK(d_lock);
15420+
15421+static reiser4_file_fsdata *create_fsdata(struct file *file);
15422+static int file_is_stateless(struct file *file);
15423+static void free_fsdata(reiser4_file_fsdata *fsdata);
15424+static void kill_cursor(dir_cursor *);
15425+
15426+/**
15427+ * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15428+ * @nr: number of objects to free
15429+ * @mask: GFP mask
15430+ *
15431+ * Shrinks d_cursor_cache: scans the LRU list of unused cursors, freeing the
15432+ * requested number. Returns the number of still freeable cursors.
15433+ */
15434+static int d_cursor_shrink(int nr, gfp_t mask)
15435+{
15436+ if (nr != 0) {
15437+ dir_cursor *scan;
15438+ int killed;
15439+
15440+ killed = 0;
15441+ spin_lock(&d_lock);
15442+ while (!list_empty(&cursor_cache)) {
15443+ scan = list_entry(cursor_cache.next, dir_cursor, alist);
15444+ assert("nikita-3567", scan->ref == 0);
15445+ kill_cursor(scan);
15446+ ++killed;
15447+ --nr;
15448+ if (nr == 0)
15449+ break;
15450+ }
15451+ spin_unlock(&d_lock);
15452+ }
15453+ return d_cursor_unused;
15454+}
15455+
15456+/*
15457+ * actually, d_cursors are "priceless", because there is no way to
15458+ * recover information stored in them. On the other hand, we don't
15459+ * want to consume all kernel memory by them. As a compromise, just
15460+ * assign higher "seeks" value to d_cursor cache, so that it will be
15461+ * shrunk only if system is really tight on memory.
15462+ */
15463+static struct shrinker d_cursor_shrinker = {
15464+ .shrink = d_cursor_shrink,
15465+ .seeks = DEFAULT_SEEKS << 3,
15466+};
15467+
15468+/**
15469+ * reiser4_init_d_cursor - create d_cursor cache
15470+ *
15471+ * Initializes slab cache of d_cursors. It is part of reiser4 module
15472+ * initialization.
15473+ */
15474+int reiser4_init_d_cursor(void)
15475+{
15476+ d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15477+ SLAB_HWCACHE_ALIGN, NULL);
15478+ if (d_cursor_cache == NULL)
15479+ return RETERR(-ENOMEM);
15480+
15481+ register_shrinker(&d_cursor_shrinker);
15482+ return 0;
15483+}
15484+
15485+/**
15486+ * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
15487+ *
15488+ * This is called on reiser4 module unloading or system shutdown.
15489+ */
15490+void reiser4_done_d_cursor(void)
15491+{
15492+ unregister_shrinker(&d_cursor_shrinker);
15493+
15494+ destroy_reiser4_cache(&d_cursor_cache);
15495+}
15496+
15497+#define D_CURSOR_TABLE_SIZE (256)
15498+
15499+static inline unsigned long
15500+d_cursor_hash(d_cursor_hash_table *table, const struct d_cursor_key *key)
15501+{
15502+ assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15503+ return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15504+}
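+
+/* For instance (numbers purely illustrative): oid == 0x123 and cid == 5
+ * hash to bucket (0x123 + 5) & 0xff == 0x28. */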
15505+
15506+static inline int d_cursor_eq(const struct d_cursor_key *k1,
15507+ const struct d_cursor_key *k2)
15508+{
15509+ return k1->cid == k2->cid && k1->oid == k2->oid;
15510+}
15511+
15512+/*
15513+ * define functions to manipulate reiser4 super block's hash table of
15514+ * dir_cursors
15515+ */
15516+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
15517+#define KFREE(ptr, size) kfree(ptr)
15518+TYPE_SAFE_HASH_DEFINE(d_cursor,
15519+ dir_cursor,
15520+ struct d_cursor_key,
15521+ key, hash, d_cursor_hash, d_cursor_eq);
15522+#undef KFREE
15523+#undef KMALLOC
15524+
15525+/**
15526+ * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
15527+ * @super: super block to initialize
15528+ *
15529+ * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15530+ * of mount.
15531+ */
15532+int reiser4_init_super_d_info(struct super_block *super)
15533+{
15534+ struct d_cursor_info *p;
15535+
15536+ p = &get_super_private(super)->d_info;
15537+
15538+ INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
15539+ return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
15540+}
15541+
15542+/**
15543+ * reiser4_done_super_d_info - release per-super-block d_cursor resources
15544+ * @super: super block being umounted
15545+ *
15546+ * It is called on umount. Kills all directory cursors attached to super block.
15547+ */
15548+void reiser4_done_super_d_info(struct super_block *super)
15549+{
15550+ struct d_cursor_info *d_info;
15551+ dir_cursor *cursor, *next;
15552+
15553+ d_info = &get_super_private(super)->d_info;
15554+ for_all_in_htable(&d_info->table, d_cursor, cursor, next)
15555+ kill_cursor(cursor);
15556+
15557+ BUG_ON(d_info->tree.rnode != NULL);
15558+ d_cursor_hash_done(&d_info->table);
15559+}
15560+
15561+/**
15562+ * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
15563+ * @cursor: cursor to free
15564+ *
15565+ * Removes the reiser4_file_fsdata attached to @cursor from the readdir list
15566+ * of reiser4_inode and frees that reiser4_file_fsdata. Removes @cursor from
15567+ * the indices, the hash table and the list of unused cursors, and frees it.
15568+ */
15569+static void kill_cursor(dir_cursor *cursor)
15570+{
15571+ unsigned long index;
15572+
15573+ assert("nikita-3566", cursor->ref == 0);
15574+ assert("nikita-3572", cursor->fsdata != NULL);
15575+
15576+ index = (unsigned long)cursor->key.oid;
15577+ list_del_init(&cursor->fsdata->dir.linkage);
15578+ free_fsdata(cursor->fsdata);
15579+ cursor->fsdata = NULL;
15580+
15581+ if (list_empty_careful(&cursor->list))
15582+		/* this is the last cursor for the file. Kill the radix-tree entry */
15583+ radix_tree_delete(&cursor->info->tree, index);
15584+ else {
15585+ void **slot;
15586+
15587+ /*
15588+ * there are other cursors for the same oid.
15589+ */
15590+
15591+		/*
15592+		 * if the radix tree points to the cursor being removed,
15593+		 * re-target the radix tree slot to the next cursor in the
15594+		 * (non-empty, as was checked above) circular list of all
15595+		 * cursors for this oid.
15596+		 */
15597+ slot = radix_tree_lookup_slot(&cursor->info->tree, index);
15598+ assert("nikita-3571", *slot != NULL);
15599+ if (*slot == cursor)
15600+ *slot = list_entry(cursor->list.next, dir_cursor, list);
15601+ /* remove cursor from circular list */
15602+ list_del_init(&cursor->list);
15603+ }
15604+ /* remove cursor from the list of unused cursors */
15605+ list_del_init(&cursor->alist);
15606+ /* remove cursor from the hash table */
15607+ d_cursor_hash_remove(&cursor->info->table, cursor);
15608+ /* and free it */
15609+ kmem_cache_free(d_cursor_cache, cursor);
15610+ --d_cursor_unused;
15611+}
15612+
15613+/* possible actions that can be performed on all cursors for the given file */
15614+enum cursor_action {
15615+ /*
15616+ * load all detached state: this is called when stat-data is loaded
15617+ * from the disk to recover information about all pending readdirs
15618+ */
15619+ CURSOR_LOAD,
15620+ /*
15621+	 * detach all state from the inode, leaving it in the cache. This is
15622+	 * called when the inode is removed from memory by memory pressure
15623+ */
15624+ CURSOR_DISPOSE,
15625+ /*
15626+ * detach cursors from the inode, and free them. This is called when
15627+ * inode is destroyed
15628+ */
15629+ CURSOR_KILL
15630+};
15631+
15632+/*
15633+ * return d_cursor data for the file system @inode is in.
15634+ */
15635+static inline struct d_cursor_info *d_info(struct inode *inode)
15636+{
15637+ return &get_super_private(inode->i_sb)->d_info;
15638+}
15639+
15640+/*
15641+ * lookup d_cursor in the per-super-block radix tree.
15642+ */
15643+static inline dir_cursor *lookup(struct d_cursor_info * info,
15644+ unsigned long index)
15645+{
15646+ return (dir_cursor *) radix_tree_lookup(&info->tree, index);
15647+}
15648+
15649+/*
15650+ * attach @cursor to the radix tree. There may be multiple cursors for the
15651+ * same oid, they are chained into circular list.
15652+ */
15653+static void bind_cursor(dir_cursor * cursor, unsigned long index)
15654+{
15655+ dir_cursor *head;
15656+
15657+ head = lookup(cursor->info, index);
15658+ if (head == NULL) {
15659+ /* this is the first cursor for this index */
15660+ INIT_LIST_HEAD(&cursor->list);
15661+ radix_tree_insert(&cursor->info->tree, index, cursor);
15662+ } else {
15663+ /* some cursor already exists. Chain ours */
15664+ list_add(&cursor->list, &head->list);
15665+ }
15666+}
15667+
15668+/*
15669+ * detach fsdata (if detachable) from the file descriptor, and put the cursor
15670+ * on the "unused" list. Called when the file descriptor is no longer in use.
15671+ */
15672+static void clean_fsdata(struct file *file)
15673+{
15674+ dir_cursor *cursor;
15675+ reiser4_file_fsdata *fsdata;
15676+
15677+ assert("nikita-3570", file_is_stateless(file));
15678+
15679+ fsdata = (reiser4_file_fsdata *) file->private_data;
15680+ if (fsdata != NULL) {
15681+ cursor = fsdata->cursor;
15682+ if (cursor != NULL) {
15683+ spin_lock(&d_lock);
15684+ --cursor->ref;
15685+ if (cursor->ref == 0) {
15686+ list_add_tail(&cursor->alist, &cursor_cache);
15687+ ++d_cursor_unused;
15688+ }
15689+ spin_unlock(&d_lock);
15690+ file->private_data = NULL;
15691+ }
15692+ }
15693+}
15694+
15695+/*
15696+ * global counter used to generate "client ids". These ids are encoded into
15697+ * high bits of fpos.
15698+ */
15699+static __u32 cid_counter = 0;
15700+#define CID_SHIFT (20)
15701+#define CID_MASK (0xfffffull)
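+
+/* Encoding example: for cid == 3, insert_cursor() below hands out the
+ * initial cookie 3 << CID_SHIFT == 0x300000, and reiser4_get_dir_fpos()
+ * recovers the directory offset as f_pos & CID_MASK. */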
15702+
15703+static void free_file_fsdata_nolock(struct file *);
15704+
15705+/**
15706+ * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
15707+ * @cursor: cursor to insert
15708+ * @file: file to attach readdir state to
15709+ * @inode: inode of the directory being read
15710+ *
15711+ * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts the cursor
15712+ * into the reiser4 super block's hash table and radix tree, and adds the
15713+ * detachable readdir state carried by that fsdata to @file, so that it can
15714+ * be found again on a later stateless readdir call.
15715+ */
15716+static int insert_cursor(dir_cursor *cursor, struct file *file,
15717+ struct inode *inode)
15718+{
15719+ int result;
15720+ reiser4_file_fsdata *fsdata;
15721+
15722+ memset(cursor, 0, sizeof *cursor);
15723+
15724+ /* this is either first call to readdir, or rewind. Anyway, create new
15725+ * cursor. */
15726+ fsdata = create_fsdata(NULL);
15727+ if (fsdata != NULL) {
15728+ result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
15729+ if (result == 0) {
15730+ struct d_cursor_info *info;
15731+ oid_t oid;
15732+
15733+ info = d_info(inode);
15734+ oid = get_inode_oid(inode);
15735+ /* cid occupies higher 12 bits of f->f_pos. Don't
15736+ * allow it to become negative: this confuses
15737+ * nfsd_readdir() */
15738+ cursor->key.cid = (++cid_counter) & 0x7ff;
15739+ cursor->key.oid = oid;
15740+ cursor->fsdata = fsdata;
15741+ cursor->info = info;
15742+ cursor->ref = 1;
15743+
15744+ spin_lock_inode(inode);
15745+ /* install cursor as @f's private_data, discarding old
15746+ * one if necessary */
15747+#if REISER4_DEBUG
15748+ if (file->private_data)
15749+ warning("", "file has fsdata already");
15750+#endif
15751+ clean_fsdata(file);
15752+ free_file_fsdata_nolock(file);
15753+ file->private_data = fsdata;
15754+ fsdata->cursor = cursor;
15755+ spin_unlock_inode(inode);
15756+ spin_lock(&d_lock);
15757+ /* insert cursor into hash table */
15758+ d_cursor_hash_insert(&info->table, cursor);
15759+ /* and chain it into radix-tree */
15760+ bind_cursor(cursor, (unsigned long)oid);
15761+ spin_unlock(&d_lock);
15762+ radix_tree_preload_end();
15763+ file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
15764+ }
15765+ } else
15766+ result = RETERR(-ENOMEM);
15767+ return result;
15768+}
15769+
15770+/**
15771+ * process_cursors - do action on each cursor attached to inode
15772+ * @inode:
15773+ * @act: action to do
15774+ *
15775+ * Finds all cursors of @inode in reiser4's super block radix tree of cursors
15776+ * and performs action specified by @act on each of cursors.
15777+ */
15778+static void process_cursors(struct inode *inode, enum cursor_action act)
15779+{
15780+ oid_t oid;
15781+ dir_cursor *start;
15782+ struct list_head *head;
15783+ reiser4_context *ctx;
15784+ struct d_cursor_info *info;
15785+
15786+ /* this can be called by
15787+ *
15788+ * kswapd->...->prune_icache->..reiser4_destroy_inode
15789+ *
15790+ * without reiser4_context
15791+ */
15792+ ctx = reiser4_init_context(inode->i_sb);
15793+ if (IS_ERR(ctx)) {
15794+ warning("vs-23", "failed to init context");
15795+ return;
15796+ }
15797+
15798+ assert("nikita-3558", inode != NULL);
15799+
15800+ info = d_info(inode);
15801+ oid = get_inode_oid(inode);
15802+ spin_lock_inode(inode);
15803+ head = get_readdir_list(inode);
15804+ spin_lock(&d_lock);
15805+	/* find any cursor for this oid: a reference to it hangs off the radix
15806+	 * tree */
15807+ start = lookup(info, (unsigned long)oid);
15808+ if (start != NULL) {
15809+ dir_cursor *scan;
15810+ reiser4_file_fsdata *fsdata;
15811+
15812+ /* process circular list of cursors for this oid */
15813+ scan = start;
15814+ do {
15815+ dir_cursor *next;
15816+
15817+ next = list_entry(scan->list.next, dir_cursor, list);
15818+ fsdata = scan->fsdata;
15819+ assert("nikita-3557", fsdata != NULL);
15820+ if (scan->key.oid == oid) {
15821+ switch (act) {
15822+ case CURSOR_DISPOSE:
15823+ list_del_init(&fsdata->dir.linkage);
15824+ break;
15825+ case CURSOR_LOAD:
15826+ list_add(&fsdata->dir.linkage, head);
15827+ break;
15828+ case CURSOR_KILL:
15829+ kill_cursor(scan);
15830+ break;
15831+ }
15832+ }
15833+ if (scan == next)
15834+ /* last cursor was just killed */
15835+ break;
15836+ scan = next;
15837+ } while (scan != start);
15838+ }
15839+ spin_unlock(&d_lock);
15840+ /* check that we killed 'em all */
15841+ assert("nikita-3568",
15842+ ergo(act == CURSOR_KILL,
15843+ list_empty_careful(get_readdir_list(inode))));
15844+ assert("nikita-3569",
15845+ ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
15846+ spin_unlock_inode(inode);
15847+ reiser4_exit_context(ctx);
15848+}
15849+
15850+/**
15851+ * reiser4_dispose_cursors - removes cursors from inode's list
15852+ * @inode: inode to dispose cursors of
15853+ *
15854+ * For each cursor corresponding to @inode, removes the reiser4_file_fsdata
15855+ * attached to the cursor from the inode's readdir list. This is called when
15856+ * the inode is removed from memory by memory pressure.
15857+ */
15858+void reiser4_dispose_cursors(struct inode *inode)
15859+{
15860+ process_cursors(inode, CURSOR_DISPOSE);
15861+}
15862+
15863+/**
15864+ * reiser4_load_cursors - attach cursors to inode
15865+ * @inode: inode to load cursors to
15866+ *
15867+ * For each cursor corresponding to @inode, attaches the reiser4_file_fsdata
15868+ * of the cursor to the inode's readdir list. This is done when the inode is
15869+ * loaded into memory.
15870+ */
15871+void reiser4_load_cursors(struct inode *inode)
15872+{
15873+ process_cursors(inode, CURSOR_LOAD);
15874+}
15875+
15876+/**
15877+ * reiser4_kill_cursors - kill all inode cursors
15878+ * @inode: inode to kill cursors of
15879+ *
15880+ * Frees all cursors for this inode. This is called when inode is destroyed.
15881+ */
15882+void reiser4_kill_cursors(struct inode *inode)
15883+{
15884+ process_cursors(inode, CURSOR_KILL);
15885+}
15886+
15887+/**
15888+ * file_is_stateless - check whether a file descriptor is NFS-created
15889+ * @file: file to check
15890+ *
15891+ * Returns true if the file descriptor @file was created by the NFS server on
15892+ * demand to serve one file system operation. This means that there may be
15893+ * "detached state" for the underlying inode.
15894+ */
15895+static int file_is_stateless(struct file *file)
15896+{
15897+ return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
15898+}
15899+
15900+/**
15901+ * reiser4_get_dir_fpos - calculate directory position from ->f_pos
15902+ * @dir: directory file descriptor
15903+ *
15904+ * Calculates ->fpos from the user-supplied cookie. Normally it is dir->f_pos,
15905+ * but in the case of stateless directory operation (readdir-over-nfs), the
15906+ * client id was encoded in the high bits of the cookie and should be masked off.
15907+ */
15908+loff_t reiser4_get_dir_fpos(struct file *dir)
15909+{
15910+ if (file_is_stateless(dir))
15911+ return dir->f_pos & CID_MASK;
15912+ else
15913+ return dir->f_pos;
15914+}
15915+
15916+/**
15917+ * reiser4_attach_fsdata - try to attach fsdata
15918+ * @file:
15919+ * @inode:
15920+ *
15921+ * Finds or creates cursor for readdir-over-nfs.
15922+ */
15923+int reiser4_attach_fsdata(struct file *file, struct inode *inode)
15924+{
15925+ loff_t pos;
15926+ int result;
15927+ dir_cursor *cursor;
15928+
15929+ /*
15930+ * we are serialized by inode->i_mutex
15931+ */
15932+ if (!file_is_stateless(file))
15933+ return 0;
15934+
15935+ pos = file->f_pos;
15936+ result = 0;
15937+ if (pos == 0) {
15938+ /*
15939+ * first call to readdir (or rewind to the beginning of
15940+ * directory)
15941+ */
15942+ cursor = kmem_cache_alloc(d_cursor_cache,
15943+ reiser4_ctx_gfp_mask_get());
15944+ if (cursor != NULL)
15945+ result = insert_cursor(cursor, file, inode);
15946+ else
15947+ result = RETERR(-ENOMEM);
15948+ } else {
15949+ /* try to find existing cursor */
15950+ struct d_cursor_key key;
15951+
15952+ key.cid = pos >> CID_SHIFT;
15953+ key.oid = get_inode_oid(inode);
15954+ spin_lock(&d_lock);
15955+ cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
15956+ if (cursor != NULL) {
15957+ /* cursor was found */
15958+ if (cursor->ref == 0) {
15959+ /* move it from unused list */
15960+ list_del_init(&cursor->alist);
15961+ --d_cursor_unused;
15962+ }
15963+ ++cursor->ref;
15964+ }
15965+ spin_unlock(&d_lock);
15966+ if (cursor != NULL) {
15967+ spin_lock_inode(inode);
15968+ assert("nikita-3556", cursor->fsdata->back == NULL);
15969+ clean_fsdata(file);
15970+ free_file_fsdata_nolock(file);
15971+ file->private_data = cursor->fsdata;
15972+ spin_unlock_inode(inode);
15973+ }
15974+ }
15975+ return result;
15976+}
15977+
15978+/**
15979+ * reiser4_detach_fsdata - detach fsdata from a stateless file
15980+ * @file: file to detach fsdata from
15981+ *
15982+ * Detaches fsdata, if necessary.
15983+ */
15984+void reiser4_detach_fsdata(struct file *file)
15985+{
15986+ struct inode *inode;
15987+
15988+ if (!file_is_stateless(file))
15989+ return;
15990+
15991+ inode = file->f_dentry->d_inode;
15992+ spin_lock_inode(inode);
15993+ clean_fsdata(file);
15994+ spin_unlock_inode(inode);
15995+}
15996+
15997+/* slab for reiser4_dentry_fsdata */
15998+static struct kmem_cache *dentry_fsdata_cache;
15999+
16000+/**
16001+ * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
16002+ *
16003+ * Initializes slab cache of structures attached to dentry->d_fsdata. It is
16004+ * part of reiser4 module initialization.
16005+ */
16006+int reiser4_init_dentry_fsdata(void)
16007+{
16008+ dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
16009+ sizeof(struct reiser4_dentry_fsdata),
16010+ 0,
16011+ SLAB_HWCACHE_ALIGN |
16012+ SLAB_RECLAIM_ACCOUNT,
16013+ NULL);
16014+ if (dentry_fsdata_cache == NULL)
16015+ return RETERR(-ENOMEM);
16016+ return 0;
16017+}
16018+
16019+/**
16020+ * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
16021+ *
16022+ * This is called on reiser4 module unloading or system shutdown.
16023+ */
16024+void reiser4_done_dentry_fsdata(void)
16025+{
16026+ destroy_reiser4_cache(&dentry_fsdata_cache);
16027+}
16028+
16029+/**
16030+ * reiser4_get_dentry_fsdata - get fs-specific dentry data
16031+ * @dentry: queried dentry
16032+ *
16033+ * Allocates if necessary and returns per-dentry data that we attach to each
16034+ * dentry.
16035+ */
16036+struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
16037+{
16038+ assert("nikita-1365", dentry != NULL);
16039+
16040+ if (dentry->d_fsdata == NULL) {
16041+ dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
16042+ reiser4_ctx_gfp_mask_get());
16043+ if (dentry->d_fsdata == NULL)
16044+ return ERR_PTR(RETERR(-ENOMEM));
16045+ memset(dentry->d_fsdata, 0,
16046+ sizeof(struct reiser4_dentry_fsdata));
16047+ }
16048+ return dentry->d_fsdata;
16049+}
16050+
16051+/**
16052+ * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
16053+ * @dentry: dentry to free fsdata of
16054+ *
16055+ * Detaches and frees fs-specific dentry data
16056+ */
16057+void reiser4_free_dentry_fsdata(struct dentry *dentry)
16058+{
16059+ if (dentry->d_fsdata != NULL) {
16060+ kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
16061+ dentry->d_fsdata = NULL;
16062+ }
16063+}
16064+
16065+/* slab for reiser4_file_fsdata */
16066+static struct kmem_cache *file_fsdata_cache;
16067+
16068+/**
16069+ * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
16070+ *
16071+ * Initializes slab cache of structures attached to file->private_data. It is
16072+ * part of reiser4 module initialization.
16073+ */
16074+int reiser4_init_file_fsdata(void)
16075+{
16076+ file_fsdata_cache = kmem_cache_create("file_fsdata",
16077+ sizeof(reiser4_file_fsdata),
16078+ 0,
16079+ SLAB_HWCACHE_ALIGN |
16080+ SLAB_RECLAIM_ACCOUNT, NULL);
16081+ if (file_fsdata_cache == NULL)
16082+ return RETERR(-ENOMEM);
16083+ return 0;
16084+}
16085+
16086+/**
16087+ * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
16088+ *
16089+ * This is called on reiser4 module unloading or system shutdown.
16090+ */
16091+void reiser4_done_file_fsdata(void)
16092+{
16093+ destroy_reiser4_cache(&file_fsdata_cache);
16094+}
16095+
16096+/**
16097+ * create_fsdata - allocate and initialize reiser4_file_fsdata
16098+ * @file: what to create file_fsdata for, may be NULL
16099+ *
16100+ * Allocates and initializes reiser4_file_fsdata structure.
16101+ */
16102+static reiser4_file_fsdata *create_fsdata(struct file *file)
16103+{
16104+ reiser4_file_fsdata *fsdata;
16105+
16106+ fsdata = kmem_cache_alloc(file_fsdata_cache,
16107+ reiser4_ctx_gfp_mask_get());
16108+ if (fsdata != NULL) {
16109+ memset(fsdata, 0, sizeof *fsdata);
16110+ fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16111+ fsdata->back = file;
16112+ INIT_LIST_HEAD(&fsdata->dir.linkage);
16113+ }
16114+ return fsdata;
16115+}
16116+
16117+/**
16118+ * free_fsdata - free reiser4_file_fsdata
16119+ * @fsdata: object to free
16120+ *
16121+ * Dual to create_fsdata(). Free reiser4_file_fsdata.
16122+ */
16123+static void free_fsdata(reiser4_file_fsdata *fsdata)
16124+{
16125+ BUG_ON(fsdata == NULL);
16126+ kmem_cache_free(file_fsdata_cache, fsdata);
16127+}
16128+
16129+/**
16130+ * reiser4_get_file_fsdata - get fs-specific file data
16131+ * @file: queried file
16132+ *
16133+ * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
16134+ * it to @file.
16135+ */
16136+reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16137+{
16138+ assert("nikita-1603", file != NULL);
16139+
16140+ if (file->private_data == NULL) {
16141+ reiser4_file_fsdata *fsdata;
16142+ struct inode *inode;
16143+
16144+ fsdata = create_fsdata(file);
16145+ if (fsdata == NULL)
16146+ return ERR_PTR(RETERR(-ENOMEM));
16147+
16148+ inode = file->f_dentry->d_inode;
16149+ spin_lock_inode(inode);
16150+ if (file->private_data == NULL) {
16151+ file->private_data = fsdata;
16152+ fsdata = NULL;
16153+ }
16154+ spin_unlock_inode(inode);
16155+ if (fsdata != NULL)
16156+ /* other thread initialized ->fsdata */
16157+ kmem_cache_free(file_fsdata_cache, fsdata);
16158+ }
16159+ assert("nikita-2665", file->private_data != NULL);
16160+ return file->private_data;
16161+}
16162+
16163+/**
16164+ * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
16165+ * @file:
16166+ *
16167+ * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
16168+ * readdir list, and frees it if it is not linked to a d_cursor object.
16169+ */
16170+static void free_file_fsdata_nolock(struct file *file)
16171+{
16172+ reiser4_file_fsdata *fsdata;
16173+
16174+ assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16175+ fsdata = file->private_data;
16176+ if (fsdata != NULL) {
16177+ list_del_init(&fsdata->dir.linkage);
16178+ if (fsdata->cursor == NULL)
16179+ free_fsdata(fsdata);
16180+ }
16181+ file->private_data = NULL;
16182+}
16183+
16184+/**
16185+ * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
16186+ * @file:
16187+ *
16188+ * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
16189+ */
16190+void reiser4_free_file_fsdata(struct file *file)
16191+{
16192+ spin_lock_inode(file->f_dentry->d_inode);
16193+ free_file_fsdata_nolock(file);
16194+ spin_unlock_inode(file->f_dentry->d_inode);
16195+}
16196+
16197+/*
16198+ * Local variables:
16199+ * c-indentation-style: "K&R"
16200+ * mode-name: "LC"
16201+ * c-basic-offset: 8
16202+ * tab-width: 8
16203+ * fill-column: 79
16204+ * End:
16205+ */
16206diff -urN linux-2.6.24.orig/fs/reiser4/fsdata.h linux-2.6.24/fs/reiser4/fsdata.h
16207--- linux-2.6.24.orig/fs/reiser4/fsdata.h 1970-01-01 03:00:00.000000000 +0300
16208+++ linux-2.6.24/fs/reiser4/fsdata.h 2008-01-25 11:39:06.928205628 +0300
16209@@ -0,0 +1,205 @@
16210+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16211+ * reiser4/README */
16212+
16213+#if !defined( __REISER4_FSDATA_H__ )
16214+#define __REISER4_FSDATA_H__
16215+
16216+#include "debug.h"
16217+#include "kassign.h"
16218+#include "seal.h"
16219+#include "type_safe_hash.h"
16220+#include "plugin/file/file.h"
16221+#include "readahead.h"
16222+
16223+/*
16224+ * For a description of reiser4_dentry_fsdata see the comment on the
16225+ * structure definition below; it caches the location of a directory entry
16226+ * found by ->lookup() to speed up subsequent operations on that entry.
16227+ */
16228+
16229+/*
16230+ * locking: the per-file-descriptor readdir_pos fields and ->f_pos are
16231+ * protected by ->i_mutex on the inode. Under this lock the following
16232+ * invariant holds:
16233+ *
16234+ * the file descriptor is "looking" at the entry_no-th directory entry from
16235+ * the beginning of the directory. This entry has key dir_entry_key and is
16236+ * the pos-th entry within its duplicate-key sequence.
16237+ *
16238+ */
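+
+/* An illustrative scenario: if three names in a directory share the same
+ * de_id, the descriptor positioned at the second of them has pos == 1
+ * (pos counts from 0 within the duplicate-key run), while entry_no still
+ * counts from the beginning of the whole directory. */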
16239+
16240+/* logical position within directory */
16241+struct dir_pos {
16242+ /* key of directory entry (actually, part of a key sufficient to
16243+ identify directory entry) */
16244+ de_id dir_entry_key;
16245+ /* ordinal number of directory entry among all entries with the same
16246+ key. (Starting from 0.) */
16247+ unsigned pos;
16248+};
16249+
16250+struct readdir_pos {
16251+ /* f_pos corresponding to this readdir position */
16252+ __u64 fpos;
16253+ /* logical position within directory */
16254+ struct dir_pos position;
16255+ /* logical number of directory entry within
16256+ directory */
16257+ __u64 entry_no;
16258+};
16259+
16260+/*
16261+ * this is used to speed up lookups for directory entry: on initial call to
16262+ * ->lookup() seal and coord of directory entry (if found, that is) are stored
16263+ * in struct dentry and reused later to avoid tree traversals.
16264+ */
16265+struct de_location {
16266+ /* seal covering directory entry */
16267+ seal_t entry_seal;
16268+ /* coord of directory entry */
16269+ coord_t entry_coord;
16270+ /* ordinal number of directory entry among all entries with the same
16271+ key. (Starting from 0.) */
16272+ int pos;
16273+};
16274+
16275+/**
16276+ * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16277+ *
16278+ * This is allocated dynamically and released in d_op->d_release()
16279+ *
16280+ * Currently it only contains cached location (hint) of directory entry, but
16281+ * it is expected that other information will be accumulated here.
16282+ */
16283+struct reiser4_dentry_fsdata {
16284+ /*
16285+ * here will go fields filled by ->lookup() to speedup next
16286+ * create/unlink, like blocknr of znode with stat-data, or key of
16287+ * stat-data.
16288+ */
16289+ struct de_location dec;
16290+ int stateless; /* created through reiser4_decode_fh, needs special
16291+ * treatment in readdir. */
16292+};
16293+
16294+extern int reiser4_init_dentry_fsdata(void);
16295+extern void reiser4_done_dentry_fsdata(void);
16296+extern struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16297+extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16298+
16299+/**
16300+ * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16301+ *
16302+ * This is allocated dynamically and released in inode->i_fop->release
16303+ */
16304+typedef struct reiser4_file_fsdata {
16305+ /*
16306+ * pointer back to the struct file which this reiser4_file_fsdata is
16307+ * part of
16308+ */
16309+ struct file *back;
16310+ /* detached cursor for stateless readdir. */
16311+ struct dir_cursor *cursor;
16312+ /*
16313+ * We need both directory and regular file parts here, because there
16314+ * are file system objects that are files and directories.
16315+ */
16316+ struct {
16317+ /*
16318+ * position in directory. It is updated each time directory is
16319+ * modified
16320+ */
16321+ struct readdir_pos readdir;
16322+ /* head of this list is reiser4_inode->lists.readdir_list */
16323+ struct list_head linkage;
16324+ } dir;
16325+ /* hints to speed up operations with regular files: read and write. */
16326+ struct {
16327+ hint_t hint;
16328+ } reg;
16329+ struct reiser4_file_ra_state ra1;
16330+
16331+} reiser4_file_fsdata;
16332+
16333+extern int reiser4_init_file_fsdata(void);
16334+extern void reiser4_done_file_fsdata(void);
16335+extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16336+extern void reiser4_free_file_fsdata(struct file *);
16337+
16338+/*
16339+ * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
16340+ * used to address the problem reiser4 has with readdir accesses via NFS. See
16341+ * plugin/file_ops_readdir.c for more details.
16342+ */
16343+struct d_cursor_key {
16344+ __u16 cid;
16345+ __u64 oid;
16346+};
16347+
16348+/*
16349+ * define the structures d_cursor_hash_table and d_cursor_hash_link, which are
16350+ * used to maintain the hash table of dir_cursor-s in reiser4's super block
16351+ */
16352+typedef struct dir_cursor dir_cursor;
16353+TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16354+
16355+struct dir_cursor {
16356+ int ref;
16357+ reiser4_file_fsdata *fsdata;
16358+
16359+ /* link to reiser4 super block hash table of cursors */
16360+ d_cursor_hash_link hash;
16361+
16362+ /*
16363+ * this is to link cursors to reiser4 super block's radix tree of
16364+ * cursors when there is more than one cursor with the same objectid
16365+ */
16366+ struct list_head list;
16367+ struct d_cursor_key key;
16368+ struct d_cursor_info *info;
16369+ /* list of unused cursors */
16370+ struct list_head alist;
16371+};
16372+
16373+extern int reiser4_init_d_cursor(void);
16374+extern void reiser4_done_d_cursor(void);
16375+
16376+extern int reiser4_init_super_d_info(struct super_block *);
16377+extern void reiser4_done_super_d_info(struct super_block *);
16378+
16379+extern loff_t reiser4_get_dir_fpos(struct file *);
16380+extern int reiser4_attach_fsdata(struct file *, struct inode *);
16381+extern void reiser4_detach_fsdata(struct file *);
16382+
16383+/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16384+ more details */
16385+void reiser4_dispose_cursors(struct inode *inode);
16386+void reiser4_load_cursors(struct inode *inode);
16387+void reiser4_kill_cursors(struct inode *inode);
16388+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
16389+ int offset, int adj);
16390+
16391+/*
16392+ * this structure is embedded into reiser4_super_info_data. It maintains d_cursors
16393+ * (detached readdir state). See plugin/file_ops_readdir.c for more details.
16394+ */
16395+struct d_cursor_info {
16396+ d_cursor_hash_table table;
16397+ struct radix_tree_root tree;
16398+};
16399+
16400+/* spinlock protecting readdir cursors */
16401+extern spinlock_t d_lock;
16402+
16403+/* __REISER4_FSDATA_H__ */
16404+#endif
16405+
16406+/*
16407+ * Local variables:
16408+ * c-indentation-style: "K&R"
16409+ * mode-name: "LC"
16410+ * c-basic-offset: 8
16411+ * tab-width: 8
16412+ * fill-column: 120
16413+ * End:
16414+ */
16415diff -urN linux-2.6.24.orig/fs/reiser4/init_super.c linux-2.6.24/fs/reiser4/init_super.c
16416--- linux-2.6.24.orig/fs/reiser4/init_super.c 1970-01-01 03:00:00.000000000 +0300
16417+++ linux-2.6.24/fs/reiser4/init_super.c 2008-01-25 11:39:06.932206658 +0300
16418@@ -0,0 +1,751 @@
16419+/* Copyright by Hans Reiser, 2003 */
16420+
16421+#include "super.h"
16422+#include "inode.h"
16423+#include "plugin/plugin_set.h"
16424+
16425+#include <linux/swap.h>
16426+
16427+/**
16428+ * reiser4_init_fs_info - allocate reiser4 specific super block
16429+ * @super: super block of filesystem
16430+ *
16431+ * Allocates and initializes reiser4_super_info_data, attaches it to
16432+ * super->s_fs_info, initializes structures maintaining d_cursor-s.
16433+ */
16434+int reiser4_init_fs_info(struct super_block *super)
16435+{
16436+ reiser4_super_info_data *sbinfo;
16437+
16438+ sbinfo = kzalloc(sizeof(reiser4_super_info_data),
16439+ reiser4_ctx_gfp_mask_get());
16440+ if (!sbinfo)
16441+ return RETERR(-ENOMEM);
16442+
16443+ super->s_fs_info = sbinfo;
16444+ super->s_op = NULL;
16445+
16446+ ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16447+ ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16448+
16449+ mutex_init(&sbinfo->delete_mutex);
16450+ spin_lock_init(&(sbinfo->guard));
16451+
16452+ /* initialize per-super-block d_cursor resources */
16453+ reiser4_init_super_d_info(super);
16454+
16455+ return 0;
16456+}
16457+
16458+/**
16459+ * reiser4_done_fs_info - free reiser4 specific super block
16460+ * @super: super block of filesystem
16461+ *
16462+ * Performs some sanity checks, releases structures maintaining d_cursor-s,
16463+ * frees reiser4_super_info_data.
16464+ */
16465+void reiser4_done_fs_info(struct super_block *super)
16466+{
16467+ assert("zam-990", super->s_fs_info != NULL);
16468+
16469+ /* release per-super-block d_cursor resources */
16470+ reiser4_done_super_d_info(super);
16471+
16472+ /* make sure that there are no jnodes left */
16473+ assert("", list_empty(&get_super_private(super)->all_jnodes));
16474+ assert("", get_current_context()->trans->atom == NULL);
16475+ reiser4_check_block_counters(super);
16476+ kfree(super->s_fs_info);
16477+ super->s_fs_info = NULL;
16478+}
16479+
16480+/* type of option parseable by parse_option() */
16481+typedef enum {
16482+ /* value of option is arbitrary string */
16483+ OPT_STRING,
16484+
16485+ /*
16486+ * option specifies a bit in a bitmask. When the option is given, the bit in
16487+ * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
16488+ * dont_load_bitmap, atomic_write.
16489+ */
16490+ OPT_BIT,
16491+
16492+ /*
16493+ * value of option should conform to an sscanf() format. Examples are
16494+ * tmgr.atom_max_size=N, tmgr.atom_max_age=N
16495+ */
16496+ OPT_FORMAT,
16497+
16498+ /*
16499+ * option can take one of predefined values. Example is onerror=panic or
16500+ * onerror=remount-ro
16501+ */
16502+ OPT_ONEOF,
16503+} opt_type_t;
16504+
16505+#if 0
16506+struct opt_bitmask_bit {
16507+ const char *bit_name;
16508+ int bit_nr;
16509+};
16510+#endif
16511+
16512+/* description of option parseable by parse_option() */
16513+struct opt_desc {
16514+ /* option name.
16515+
16516+ parsed portion of the string has the form "name=value".
16517+ */
16518+ const char *name;
16519+ /* type of option */
16520+ opt_type_t type;
16521+ union {
16522+ /* where to store value of string option (type == OPT_STRING) */
16523+ char **string;
16524+ /* description of bits for bit option (type == OPT_BIT) */
16525+ struct {
16526+ int nr;
16527+ void *addr;
16528+ } bit;
16529+ /* description of format and targets for format option (type
16530+ == OPT_FORMAT) */
16531+ struct {
16532+ const char *format;
16533+ int nr_args;
16534+ void *arg1;
16535+ void *arg2;
16536+ void *arg3;
16537+ void *arg4;
16538+ } f;
16539+ struct {
16540+ int *result;
16541+ const char *list[10];
16542+ } oneof;
16543+ struct {
16544+ void *addr;
16545+ int nr_bits;
16546+ //struct opt_bitmask_bit *bits;
16547+ } bitmask;
16548+ } u;
16549+};
16550+
16551+/**
16552+ * parse_option - parse one option
16553+ * @opt_string: starting point of parsing
16554+ * @opt: option description
16555+ *
16556+ * foo=bar,
16557+ * ^ ^ ^
16558+ * | | +-- replaced with '\0'
16559+ * | +-- val_start
16560+ * +-- opt_string
16561+ * Figures out the option type and handles the option accordingly.
16562+ */
16563+static int parse_option(char *opt_string, struct opt_desc *opt)
16564+{
16565+ char *val_start;
16566+ int result;
16567+ const char *err_msg;
16568+
16569+ /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
16570+
16571+ val_start = strchr(opt_string, '=');
16572+ if (val_start != NULL) {
16573+ *val_start = '\0';
16574+ ++val_start;
16575+ }
16576+
16577+ err_msg = NULL;
16578+ result = 0;
16579+ switch (opt->type) {
16580+ case OPT_STRING:
16581+ if (val_start == NULL) {
16582+ err_msg = "String arg missing";
16583+ result = RETERR(-EINVAL);
16584+ } else
16585+ *opt->u.string = val_start;
16586+ break;
16587+ case OPT_BIT:
16588+ if (val_start != NULL)
16589+ err_msg = "Value ignored";
16590+ else
16591+ set_bit(opt->u.bit.nr, opt->u.bit.addr);
16592+ break;
16593+ case OPT_FORMAT:
16594+ if (val_start == NULL) {
16595+ err_msg = "Formatted arg missing";
16596+ result = RETERR(-EINVAL);
16597+ break;
16598+ }
16599+ if (sscanf(val_start, opt->u.f.format,
16600+ opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
16601+ opt->u.f.arg4) != opt->u.f.nr_args) {
16602+ err_msg = "Wrong conversion";
16603+ result = RETERR(-EINVAL);
16604+ }
16605+ break;
16606+ case OPT_ONEOF:
16607+ {
16608+ int i = 0;
16609+
16610+ if (val_start == NULL) {
16611+ err_msg = "Value is missing";
16612+ result = RETERR(-EINVAL);
16613+ break;
16614+ }
16615+ err_msg = "Wrong option value";
16616+ result = RETERR(-EINVAL);
16617+ while (opt->u.oneof.list[i]) {
16618+ if (!strcmp(opt->u.oneof.list[i], val_start)) {
16619+ result = 0;
16620+ err_msg = NULL;
16621+ *opt->u.oneof.result = i;
16622+ break;
16623+ }
16624+ i++;
16625+ }
16626+ break;
16627+ }
16628+ default:
16629+ wrong_return_value("nikita-2100", "opt -> type");
16630+ break;
16631+ }
16632+ if (err_msg != NULL) {
16633+ warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
16634+ err_msg, opt->name, val_start ? "=" : "",
16635+ val_start ? : "");
16636+ }
16637+ return result;
16638+}
16639+
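/*
 * Editorial sketch (not part of the patch): the in-place "name=value"
 * split that parse_option() performs, shown as a runnable user-space
 * snippet. The '=' is overwritten with '\0' so the option name and the
 * value become two separate C strings inside the same buffer.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char opt[] = "tmgr.atom_max_age=600";
	char *val_start = strchr(opt, '=');

	if (val_start != NULL) {
		*val_start = '\0';	/* terminate the name */
		++val_start;		/* value begins right after */
	}
	printf("name=\"%s\" value=\"%s\"\n", opt,
	       val_start ? val_start : "(none)");
	return 0;
}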
16640+/**
16641+ * parse_options - parse reiser4 mount options
16642+ * @opt_string: starting point
16643+ * @opts: array of option description
16644+ * @nr_opts: number of elements in @opts
16645+ *
16646+ * Parses comma separated list of reiser4 mount options.
16647+ */
16648+static int parse_options(char *opt_string, struct opt_desc *opts, int nr_opts)
16649+{
16650+ int result;
16651+
16652+ result = 0;
16653+ while ((result == 0) && opt_string && *opt_string) {
16654+ int j;
16655+ char *next;
16656+
16657+ next = strchr(opt_string, ',');
16658+ if (next != NULL) {
16659+ *next = '\0';
16660+ ++next;
16661+ }
16662+ for (j = 0; j < nr_opts; ++j) {
16663+ if (!strncmp(opt_string, opts[j].name,
16664+ strlen(opts[j].name))) {
16665+ result = parse_option(opt_string, &opts[j]);
16666+ break;
16667+ }
16668+ }
16669+ if (j == nr_opts) {
16670+ warning("nikita-2307", "Unrecognized option: \"%s\"",
16671+ opt_string);
16672+ /* traditionally, -EINVAL is returned on wrong mount
16673+ option */
16674+ result = RETERR(-EINVAL);
16675+ }
16676+ opt_string = next;
16677+ }
16678+ return result;
16679+}
16680+
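/*
 * Editorial sketch (not part of the patch): the comma-splitting loop of
 * parse_options(), reduced to a standalone user-space program. Each ','
 * is replaced with '\0' in place and the cursor advances to the next
 * token, mirroring the strchr()/advance pattern above.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char opts[] = "bsdgroups,tmgr.atom_max_age=600,onerror=panic";
	char *cur = opts;

	while (cur && *cur) {
		char *next = strchr(cur, ',');

		if (next != NULL) {
			*next = '\0';
			++next;
		}
		printf("option token: %s\n", cur);
		cur = next;
	}
	return 0;
}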
16681+#define NUM_OPT( label, fmt, addr ) \
16682+ { \
16683+ .name = ( label ), \
16684+ .type = OPT_FORMAT, \
16685+ .u = { \
16686+ .f = { \
16687+ .format = ( fmt ), \
16688+ .nr_args = 1, \
16689+ .arg1 = ( addr ), \
16690+ .arg2 = NULL, \
16691+ .arg3 = NULL, \
16692+ .arg4 = NULL \
16693+ } \
16694+ } \
16695+ }
16696+
16697+#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
16698+
16699+#define BIT_OPT(label, bitnr) \
16700+ { \
16701+ .name = label, \
16702+ .type = OPT_BIT, \
16703+ .u = { \
16704+ .bit = { \
16705+ .nr = bitnr, \
16706+ .addr = &sbinfo->fs_flags \
16707+ } \
16708+ } \
16709+ }
16710+
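/*
 * Editorial sketch (not part of the patch): what a BIT_OPT()-style
 * descriptor boils down to -- a designated initializer filling one arm
 * of a union. The miniature types below are illustrative stand-ins for
 * struct opt_desc, not the real definitions.
 */
#include <stdio.h>

struct toy_opt {
	const char *name;
	union {
		struct { int nr; unsigned long *addr; } bit;
	} u;
};

#define TOY_BIT_OPT(label, bitnr, flags_addr) \
	{ .name = (label), .u = { .bit = { .nr = (bitnr), .addr = (flags_addr) } } }

int main(void)
{
	unsigned long fs_flags = 0;
	struct toy_opt o = TOY_BIT_OPT("bsdgroups", 0, &fs_flags);

	fs_flags |= 1UL << o.u.bit.nr;	/* what set_bit() does in the kernel */
	printf("%s -> fs_flags=%#lx\n", o.name, fs_flags);
	return 0;
}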
16711+#define MAX_NR_OPTIONS (30)
16712+
16713+/**
16714+ * reiser4_init_super_data - initialize reiser4 private super block
16715+ * @super: super block to initialize
16716+ * @opt_string: list of reiser4 mount options
16717+ *
16718+ * Sets various reiser4 parameters to default values. Parses mount options and
16719+ * overwrites default settings.
16720+ */
16721+int reiser4_init_super_data(struct super_block *super, char *opt_string)
16722+{
16723+ int result;
16724+ struct opt_desc *opts, *p;
16725+ reiser4_super_info_data *sbinfo = get_super_private(super);
16726+
16727+ /* initialize super, export, dentry operations */
16728+ sbinfo->ops.super = reiser4_super_operations;
16729+ sbinfo->ops.export = reiser4_export_operations;
16730+ sbinfo->ops.dentry = reiser4_dentry_operations;
16731+ super->s_op = &sbinfo->ops.super;
16732+ super->s_export_op = &sbinfo->ops.export;
16733+
16734+ /* initialize transaction manager parameters to default values */
16735+ sbinfo->tmgr.atom_max_size = totalram_pages / 4;
16736+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
16737+ sbinfo->tmgr.atom_min_size = 256;
16738+ sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
16739+
16740+ /* initialize cbk cache parameter */
16741+ sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
16742+
16743+ /* initialize flush parameters */
16744+ sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
16745+ sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
16746+ sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
16747+ sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
16748+
16749+ sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
16750+
16751+ /* preliminary tree initializations */
16752+ sbinfo->tree.super = super;
16753+ sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
16754+ sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
16755+ sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
16756+ sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
16757+ rwlock_init(&(sbinfo->tree.tree_lock));
16758+ spin_lock_init(&(sbinfo->tree.epoch_lock));
16759+
16760+ /* initialize default readahead params */
16761+ sbinfo->ra_params.max = num_physpages / 4;
16762+ sbinfo->ra_params.flags = 0;
16763+
16764+ /* allocate memory for structure describing reiser4 mount options */
16765+ opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS,
16766+ reiser4_ctx_gfp_mask_get());
16767+ if (opts == NULL)
16768+ return RETERR(-ENOMEM);
16769+
16770+ /* initialize structure describing reiser4 mount options */
16771+ p = opts;
16772+
16773+#if REISER4_DEBUG
16774+# define OPT_ARRAY_CHECK if ((p) >= (opts) + MAX_NR_OPTIONS) { \
16775+ warning ("zam-1046", "opt array is overloaded"); break; \
16776+ }
16777+#else
16778+# define OPT_ARRAY_CHECK noop
16779+#endif
16780+
16781+#define PUSH_OPT(...) \
16782+do { \
16783+ struct opt_desc o = __VA_ARGS__; \
16784+ OPT_ARRAY_CHECK; \
16785+ *p ++ = o; \
16786+} while (0)
16787+
16788+#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
16789+#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
16790+
16791+ /*
16792+ * tmgr.atom_max_size=N
16793+ * Atoms containing more than N blocks will be forced to commit. N is
16794+ * decimal.
16795+ */
16796+ PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
16797+ /*
16798+ * tmgr.atom_max_age=N
16799+ * Atoms older than N seconds will be forced to commit. N is decimal.
16800+ */
16801+ PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
16802+ /*
16803+ * tmgr.atom_min_size=N
16804+ * When committing an atom to free dirty pages, force an atom smaller
16805+ * than N blocks to fuse with another one.
16806+ */
16807+ PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
16808+ /*
16809+ * tmgr.atom_max_flushers=N
16810+ * limit of concurrent flushers for one atom. 0 means no limit.
16811+ */
16812+ PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
16813+ /*
16814+ * tree.cbk_cache.nr_slots=N
16815+ * Number of slots in the cbk cache.
16816+ */
16817+ PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
16818+ /*
16819+ * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
16820+ * leaf-level blocks it will force them to be relocated.
16821+ */
16822+ PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
16823+ /*
16824+ * If flush can find a block allocation closer than
16825+ * FLUSH_RELOCATE_DISTANCE to the preceder, it will relocate to that
16826+ * position.
16827+ */
16828+ PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
16829+ /*
16830+ * If we have written this many blocks or more before encountering a
16831+ * busy jnode in the flush list, abort flushing in the hope that by the
16832+ * next call the jnode will already be clean, and we will save some
16833+ * seeks.
16834+ */
16835+ PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
16836+ /* The maximum number of nodes to scan left on a level during flush. */
16837+ PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
16838+ /* preferred IO size */
16839+ PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
16840+ /* carry flags used for insertion of new nodes */
16841+ PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
16842+ /* carry flags used for insertion of new extents */
16843+ PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
16844+ /* carry flags used for paste operations */
16845+ PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
16846+ /* carry flags used for insert operations */
16847+ PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
16848+
16849+#ifdef CONFIG_REISER4_BADBLOCKS
16850+ /*
16851+ * Alternative master superblock location, in case its original
16852+ * location is not writable/accessible. This is an offset in BYTES.
16853+ */
16854+ PUSH_SB_FIELD_OPT(altsuper, "%lu");
16855+#endif
16856+
16857+ /* turn on BSD-style gid assignment */
16858+ PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
16859+ /* turn on 32 bit times */
16860+ PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
16861+ /*
16862+ * Don't load all bitmap blocks at mount time; this is useful for
16863+ * machines with tiny RAM and large disks.
16864+ */
16865+ PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
16866+ /* disable transaction commits during write() */
16867+ PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
16868+ /* disable use of write barriers in the reiser4 log writer. */
16869+ PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
16870+
16871+ PUSH_OPT(
16872+ {
16873+ /*
16874+ * tree traversal readahead parameters:
16875+ * -o readahead:MAXNUM:FLAGS
16876+ * MAXNUM - max number of nodes to request readahead for: -1UL
16877+ * will set it to max_sane_readahead()
16878+ * FLAGS - combination of bits: RA_ADJACENT_ONLY, RA_ALL_LEVELS,
16879+ * CONTINUE_ON_PRESENT
16880+ */
16881+ .name = "readahead",
16882+ .type = OPT_FORMAT,
16883+ .u = {
16884+ .f = {
16885+ .format = "%u:%u",
16886+ .nr_args = 2,
16887+ .arg1 = &sbinfo->ra_params.max,
16888+ .arg2 = &sbinfo->ra_params.flags,
16889+ .arg3 = NULL,
16890+ .arg4 = NULL
16891+ }
16892+ }
16893+ }
16894+ );
16895+
16896+ /* What to do in case of fs error */
16897+ PUSH_OPT(
16898+ {
16899+ .name = "onerror",
16900+ .type = OPT_ONEOF,
16901+ .u = {
16902+ .oneof = {
16903+ .result = &sbinfo->onerror,
16904+ .list = {
16905+ "panic", "remount-ro", NULL
16906+ },
16907+ }
16908+ }
16909+ }
16910+ );
16911+
16912+ /* modify default settings to values set by mount options */
16913+ result = parse_options(opt_string, opts, p - opts);
16914+ kfree(opts);
16915+ if (result != 0)
16916+ return result;
16917+
16918+ /* adjust settings to sane values */
16919+ sbinfo->tmgr.atom_max_age *= HZ;
16920+ if (sbinfo->tmgr.atom_max_age <= 0)
16921+ /* overflow */
16922+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
16923+
16924+ /* round optimal io size down to a multiple of 512 bytes */
16925+ sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
16926+ sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
16927+ if (sbinfo->optimal_io_size == 0) {
16928+ warning("nikita-2497", "optimal_io_size is too small");
16929+ return RETERR(-EINVAL);
16930+ }
16931+ return result;
16932+}
16933+
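/*
 * Editorial sketch (not part of the patch): the shift pair applied to
 * optimal_io_size above truncates the low bits, i.e. it rounds DOWN to
 * a 512-byte multiple (assuming VFS_BLKSIZE_BITS is 9, which is not
 * shown in this hunk). A value below 512 therefore becomes 0 and trips
 * the "too small" check.
 */
#include <stdio.h>

int main(void)
{
	unsigned int sizes[] = { 300, 1000, 65536 };

	for (int i = 0; i < 3; i++) {
		unsigned int v = sizes[i];

		v >>= 9;	/* drop the sub-512 remainder */
		v <<= 9;
		printf("%u -> %u\n", sizes[i], v); /* 300->0, 1000->512, 65536->65536 */
	}
	return 0;
}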
16934+/**
16935+ * reiser4_init_read_super - read reiser4 master super block
16936+ * @super: super block to fill
16937+ * @silent: if 0 - print warnings
16938+ *
16939+ * Reads the reiser4 master super block either from the predefined location or
16940+ * from the location given by the altsuper mount option; initializes the disk format plugin.
16941+ */
16942+int reiser4_init_read_super(struct super_block *super, int silent)
16943+{
16944+ struct buffer_head *super_bh;
16945+ struct reiser4_master_sb *master_sb;
16946+ reiser4_super_info_data *sbinfo = get_super_private(super);
16947+ unsigned long blocksize;
16948+
16949+ read_super_block:
16950+#ifdef CONFIG_REISER4_BADBLOCKS
16951+ if (sbinfo->altsuper)
16952+ /*
16953+ * read reiser4 master super block at position specified by
16954+ * mount option
16955+ */
16956+ super_bh = sb_bread(super,
16957+ (sector_t)(sbinfo->altsuper / super->s_blocksize));
16958+ else
16959+#endif
16960+ /* read reiser4 master super block at the 16th 4096-byte block */
16961+ super_bh = sb_bread(super,
16962+ (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
16963+ if (!super_bh)
16964+ return RETERR(-EIO);
16965+
16966+ master_sb = (struct reiser4_master_sb *)super_bh->b_data;
16967+ /* check reiser4 magic string */
16968+ if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
16969+ sizeof(REISER4_SUPER_MAGIC_STRING))) {
16970+ /* reiser4 master super block contains filesystem blocksize */
16971+ blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
16972+
16973+ if (blocksize != PAGE_CACHE_SIZE) {
16974+ /*
16975+ * currently reiser4's blocksize must be equal to
16976+ * the page size
16977+ */
16978+ if (!silent)
16979+ warning("nikita-2609",
16980+ "%s: wrong block size %ld\n", super->s_id,
16981+ blocksize);
16982+ brelse(super_bh);
16983+ return RETERR(-EINVAL);
16984+ }
16985+ if (blocksize != super->s_blocksize) {
16986+ * the filesystem uses a different blocksize; reread the
16987+ * master super block with the correct blocksize
16988+ * super block with correct blocksize
16989+ */
16990+ brelse(super_bh);
16991+ if (!sb_set_blocksize(super, (int)blocksize))
16992+ return RETERR(-EINVAL);
16993+ goto read_super_block;
16994+ }
16995+
16996+ sbinfo->df_plug =
16997+ disk_format_plugin_by_id(
16998+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16999+ if (sbinfo->df_plug == NULL) {
17000+ if (!silent)
17001+ warning("nikita-26091",
17002+ "%s: unknown disk format plugin %d\n",
17003+ super->s_id,
17004+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17005+ brelse(super_bh);
17006+ return RETERR(-EINVAL);
17007+ }
17008+ sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
17009+ brelse(super_bh);
17010+ return 0;
17011+ }
17012+
17013+ /* there is no reiser4 on the device */
17014+ if (!silent)
17015+ warning("nikita-2608",
17016+ "%s: wrong master super block magic", super->s_id);
17017+ brelse(super_bh);
17018+ return RETERR(-EINVAL);
17019+}
17020+
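/*
 * Editorial sketch (not part of the patch): where sb_bread() above looks
 * for the master super block. Per the comment in the function, the magic
 * offset is the 16th 4096-byte block, i.e. byte 65536; the block number
 * passed to sb_bread() is that byte offset divided by the current device
 * blocksize.
 */
#include <stdio.h>

int main(void)
{
	unsigned long magic_offset = 16UL * 4096;	/* byte 65536 */
	unsigned long blocksizes[] = { 512, 1024, 4096 };

	for (int i = 0; i < 3; i++)
		printf("blocksize %4lu -> block #%lu\n",
		       blocksizes[i], magic_offset / blocksizes[i]);
	return 0;
}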
17021+static struct {
17022+ reiser4_plugin_type type;
17023+ reiser4_plugin_id id;
17024+} default_plugins[PSET_LAST] = {
17025+ [PSET_FILE] = {
17026+ .type = REISER4_FILE_PLUGIN_TYPE,
17027+ .id = UNIX_FILE_PLUGIN_ID
17028+ },
17029+ [PSET_DIR] = {
17030+ .type = REISER4_DIR_PLUGIN_TYPE,
17031+ .id = HASHED_DIR_PLUGIN_ID
17032+ },
17033+ [PSET_HASH] = {
17034+ .type = REISER4_HASH_PLUGIN_TYPE,
17035+ .id = R5_HASH_ID
17036+ },
17037+ [PSET_FIBRATION] = {
17038+ .type = REISER4_FIBRATION_PLUGIN_TYPE,
17039+ .id = FIBRATION_DOT_O
17040+ },
17041+ [PSET_PERM] = {
17042+ .type = REISER4_PERM_PLUGIN_TYPE,
17043+ .id = NULL_PERM_ID
17044+ },
17045+ [PSET_FORMATTING] = {
17046+ .type = REISER4_FORMATTING_PLUGIN_TYPE,
17047+ .id = SMALL_FILE_FORMATTING_ID
17048+ },
17049+ [PSET_SD] = {
17050+ .type = REISER4_ITEM_PLUGIN_TYPE,
17051+ .id = STATIC_STAT_DATA_ID
17052+ },
17053+ [PSET_DIR_ITEM] = {
17054+ .type = REISER4_ITEM_PLUGIN_TYPE,
17055+ .id = COMPOUND_DIR_ID
17056+ },
17057+ [PSET_CIPHER] = {
17058+ .type = REISER4_CIPHER_PLUGIN_TYPE,
17059+ .id = NONE_CIPHER_ID
17060+ },
17061+ [PSET_DIGEST] = {
17062+ .type = REISER4_DIGEST_PLUGIN_TYPE,
17063+ .id = SHA256_32_DIGEST_ID
17064+ },
17065+ [PSET_COMPRESSION] = {
17066+ .type = REISER4_COMPRESSION_PLUGIN_TYPE,
17067+ .id = LZO1_COMPRESSION_ID
17068+ },
17069+ [PSET_COMPRESSION_MODE] = {
17070+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
17071+ .id = CONVX_COMPRESSION_MODE_ID
17072+ },
17073+ [PSET_CLUSTER] = {
17074+ .type = REISER4_CLUSTER_PLUGIN_TYPE,
17075+ .id = CLUSTER_64K_ID
17076+ },
17077+ [PSET_CREATE] = {
17078+ .type = REISER4_FILE_PLUGIN_TYPE,
17079+ .id = UNIX_FILE_PLUGIN_ID
17080+ }
17081+};
17082+
17083+/* access to default plugin table */
17084+reiser4_plugin *get_default_plugin(pset_member memb)
17085+{
17086+ return plugin_by_id(default_plugins[memb].type,
17087+ default_plugins[memb].id);
17088+}
17089+
17090+/**
17091+ * reiser4_init_root_inode - obtain inode of root directory
17092+ * @super: super block of filesystem
17093+ *
17094+ * Obtains inode of root directory (reading it from disk), initializes plugin
17095+ * set if it was not initialized.
17096+ */
17097+int reiser4_init_root_inode(struct super_block *super)
17098+{
17099+ reiser4_super_info_data *sbinfo = get_super_private(super);
17100+ struct inode *inode;
17101+ int result = 0;
17102+
17103+ inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17104+ if (IS_ERR(inode))
17105+ return RETERR(PTR_ERR(inode));
17106+
17107+ super->s_root = d_alloc_root(inode);
17108+ if (!super->s_root) {
17109+ iput(inode);
17110+ return RETERR(-ENOMEM);
17111+ }
17112+
17113+ super->s_root->d_op = &sbinfo->ops.dentry;
17114+
17115+ if (!is_inode_loaded(inode)) {
17116+ pset_member memb;
17117+ plugin_set *pset;
17118+
17119+ pset = reiser4_inode_data(inode)->pset;
17120+ for (memb = 0; memb < PSET_LAST; ++memb) {
17121+
17122+ if (aset_get(pset, memb) != NULL)
17123+ continue;
17124+
17125+ result = grab_plugin_pset(inode, NULL, memb);
17126+ if (result != 0)
17127+ break;
17128+
17129+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17130+ }
17131+
17132+ if (result == 0) {
17133+ if (REISER4_DEBUG) {
17134+ for (memb = 0; memb < PSET_LAST; ++memb)
17135+ assert("nikita-3500",
17136+ aset_get(pset, memb) != NULL);
17137+ }
17138+ } else
17139+ warning("nikita-3448", "Cannot set plugins of root: %i",
17140+ result);
17141+ reiser4_iget_complete(inode);
17142+
17143+ /* As the default pset kept in the root dir may have been changed
17144+ (its length is unknown), call update_sd. */
17145+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
17146+ result = reiser4_grab_space(
17147+ inode_file_plugin(inode)->estimate.update(inode),
17148+ BA_CAN_COMMIT);
17149+
17150+ if (result == 0)
17151+ result = reiser4_update_sd(inode);
17152+
17153+ all_grabbed2free();
17154+ }
17155+ }
17156+
17157+ super->s_maxbytes = MAX_LFS_FILESIZE;
17158+ return result;
17159+}
17160+
17161+/*
17162+ * Local variables:
17163+ * c-indentation-style: "K&R"
17164+ * mode-name: "LC"
17165+ * c-basic-offset: 8
17166+ * tab-width: 8
17167+ * fill-column: 79
17168+ * End:
17169+ */
17170diff -urN linux-2.6.24.orig/fs/reiser4/inode.c linux-2.6.24/fs/reiser4/inode.c
17171--- linux-2.6.24.orig/fs/reiser4/inode.c 1970-01-01 03:00:00.000000000 +0300
17172+++ linux-2.6.24/fs/reiser4/inode.c 2008-01-25 11:39:06.932206658 +0300
17173@@ -0,0 +1,709 @@
17174+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17175+
17176+/* Inode specific operations. */
17177+
17178+#include "forward.h"
17179+#include "debug.h"
17180+#include "key.h"
17181+#include "kassign.h"
17182+#include "coord.h"
17183+#include "seal.h"
17184+#include "dscale.h"
17185+#include "plugin/item/item.h"
17186+#include "plugin/security/perm.h"
17187+#include "plugin/plugin.h"
17188+#include "plugin/object.h"
17189+#include "znode.h"
17190+#include "vfs_ops.h"
17191+#include "inode.h"
17192+#include "super.h"
17193+#include "reiser4.h"
17194+
17195+#include <linux/fs.h> /* for struct super_block, address_space */
17196+
17197+/* return reiser4 internal tree which inode belongs to */
17198+/* Audited by: green(2002.06.17) */
17199+reiser4_tree *reiser4_tree_by_inode(const struct inode *inode /* inode queried */ )
17200+{
17201+ assert("nikita-256", inode != NULL);
17202+ assert("nikita-257", inode->i_sb != NULL);
17203+ return reiser4_get_tree(inode->i_sb);
17204+}
17205+
17206+/* return reiser4-specific inode flags */
17207+static inline unsigned long *inode_flags(const struct inode *const inode)
17208+{
17209+ assert("nikita-2842", inode != NULL);
17210+ return &reiser4_inode_data(inode)->flags;
17211+}
17212+
17213+/* set reiser4-specific flag @f in @inode */
17214+void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17215+{
17216+ assert("nikita-2248", inode != NULL);
17217+ set_bit((int)f, inode_flags(inode));
17218+}
17219+
17220+/* clear reiser4-specific flag @f in @inode */
17221+void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17222+{
17223+ assert("nikita-2250", inode != NULL);
17224+ clear_bit((int)f, inode_flags(inode));
17225+}
17226+
17227+/* true if reiser4-specific flag @f is set in @inode */
17228+int reiser4_inode_get_flag(const struct inode *inode,
17229+ reiser4_file_plugin_flags f)
17230+{
17231+ assert("nikita-2251", inode != NULL);
17232+ return test_bit((int)f, inode_flags(inode));
17233+}
17234+
17235+/* convert oid to inode number */
17236+ino_t oid_to_ino(oid_t oid)
17237+{
17238+ return (ino_t) oid;
17239+}
17240+
17241+/* convert oid to user visible inode number */
17242+ino_t oid_to_uino(oid_t oid)
17243+{
17244+ /* reiser4 object is uniquely identified by oid which is 64 bit
17245+ quantity. Kernel in-memory inode is indexed (in the hash table) by
17246+ 32 bit i_ino field, but this is not a problem, because there is a
17247+ way to further distinguish inodes with identical inode numbers
17248+ (find_actor supplied to iget()).
17249+
17250+ But user space expects unique 32 bit inode number. Obviously this
17251+ is impossible. Work-around is to somehow hash oid into user visible
17252+ inode number.
17253+ */
17254+ oid_t max_ino = (ino_t) ~ 0;
17255+
17256+ if (REISER4_INO_IS_OID || (oid <= max_ino))
17257+ return oid;
17258+ else
17259+ /* this is remotely similar to algorithm used to find next pid
17260+ to use for process: after wrap-around start from some
17261+ offset rather than from 0. Idea is that there are some long
17262+ living objects with which we don't want to collide.
17263+ */
17264+ return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17265+}
17266+
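/*
 * Editorial sketch (not part of the patch): the wrap-around mapping used
 * by oid_to_uino() when a 64-bit oid does not fit a 32-bit ino_t.
 * UINO_SHIFT below is a hypothetical stand-in; the real value of
 * REISER4_UINO_SHIFT is not visible in this hunk.
 */
#include <stdio.h>

#define UINO_SHIFT 0x100000ULL	/* illustrative only */

static unsigned long long toy_oid_to_uino(unsigned long long oid)
{
	unsigned long long max_ino = 0xffffffffULL;	/* 32-bit ino_t case */

	if (oid <= max_ino)
		return oid;
	/* wrap into the range above UINO_SHIFT, as the comment above
	 * explains, to avoid colliding with long-living low oids */
	return UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
}

int main(void)
{
	printf("%llu\n", toy_oid_to_uino(42));			/* fits: 42 */
	printf("%llu\n", toy_oid_to_uino(0x100000000ULL + 7));	/* wrapped */
	return 0;
}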
17267+/* check that "inode" is on reiser4 file-system */
17268+int is_reiser4_inode(const struct inode *inode /* inode queried */ )
17269+{
17270+ return inode != NULL && is_reiser4_super(inode->i_sb);
17271+}
17272+
17273+/* Maximal length of a name that can be stored in directory @inode.
17274+
17275+ This is used in check during file creation and lookup. */
17276+int reiser4_max_filename_len(const struct inode *inode /* inode queried */ )
17277+{
17278+ assert("nikita-287", is_reiser4_inode(inode));
17279+ assert("nikita-1710", inode_dir_item_plugin(inode));
17280+ if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17281+ return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17282+ else
17283+ return 255;
17284+}
17285+
17286+#if REISER4_USE_COLLISION_LIMIT
17287+/* Maximal number of hash collisions for this directory. */
17288+int max_hash_collisions(const struct inode *dir /* inode queried */ )
17289+{
17290+ assert("nikita-1711", dir != NULL);
17291+ return reiser4_inode_data(dir)->plugin.max_collisions;
17292+}
17293+#endif /* REISER4_USE_COLLISION_LIMIT */
17294+
17295+/* Install file, inode, and address_space operation on @inode, depending on
17296+ its mode. */
17297+int setup_inode_ops(struct inode *inode /* inode to initialize */ ,
17298+ reiser4_object_create_data * data /* parameters to create
17299+ * object */ )
17300+{
17301+ reiser4_super_info_data *sinfo;
17302+ file_plugin *fplug;
17303+ dir_plugin *dplug;
17304+
17305+ fplug = inode_file_plugin(inode);
17306+ dplug = inode_dir_plugin(inode);
17307+
17308+ sinfo = get_super_private(inode->i_sb);
17309+
17310+ switch (inode->i_mode & S_IFMT) {
17311+ case S_IFSOCK:
17312+ case S_IFBLK:
17313+ case S_IFCHR:
17314+ case S_IFIFO:
17315+ {
17316+ dev_t rdev; /* to keep gcc happy */
17317+
17318+ assert("vs-46", fplug != NULL);
17319+ /* ugly hack with rdev */
17320+ if (data == NULL) {
17321+ rdev = inode->i_rdev;
17322+ inode->i_rdev = 0;
17323+ } else
17324+ rdev = data->rdev;
17325+ inode->i_blocks = 0;
17326+ assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17327+ inode->i_op = file_plugins[fplug->h.id].inode_ops;
17328+ /* initialize inode->i_fop and inode->i_rdev for block and char
17329+ devices */
17330+ init_special_inode(inode, inode->i_mode, rdev);
17331+ /* all address space operations are null */
17332+ inode->i_mapping->a_ops =
17333+ file_plugins[fplug->h.id].as_ops;
17334+ break;
17335+ }
17336+ case S_IFLNK:
17337+ assert("vs-46", fplug != NULL);
17338+ assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17339+ inode->i_op = file_plugins[fplug->h.id].inode_ops;
17340+ inode->i_fop = NULL;
17341+ /* all address space operations are null */
17342+ inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
17343+ break;
17344+ case S_IFDIR:
17345+ assert("vs-46", dplug != NULL);
17346+ assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17347+ dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17348+ inode->i_op = dir_plugins[dplug->h.id].inode_ops;
17349+ inode->i_fop = dir_plugins[dplug->h.id].file_ops;
17350+ inode->i_mapping->a_ops = dir_plugins[dplug->h.id].as_ops;
17351+ break;
17352+ case S_IFREG:
17353+ assert("vs-46", fplug != NULL);
17354+ assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17355+ fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID));
17356+ inode->i_op = file_plugins[fplug->h.id].inode_ops;
17357+ inode->i_fop = file_plugins[fplug->h.id].file_ops;
17358+ inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
17359+ break;
17360+ default:
17361+ warning("nikita-291", "wrong file mode: %o for %llu",
17362+ inode->i_mode,
17363+ (unsigned long long)get_inode_oid(inode));
17364+ reiser4_make_bad_inode(inode);
17365+ return RETERR(-EINVAL);
17366+ }
17367+ return 0;
17368+}
17369+
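/*
 * Editorial sketch (not part of the patch): the S_IFMT dispatch that
 * setup_inode_ops() performs, shown with the standard <sys/stat.h>
 * constants in user space. Each file type selects its own operations
 * table, just as the switch above picks inode/file/address-space ops.
 */
#include <stdio.h>
#include <sys/stat.h>

static const char *classify(mode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFSOCK:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFIFO:	return "special file";
	case S_IFLNK:	return "symlink";
	case S_IFDIR:	return "directory";
	case S_IFREG:	return "regular file";
	default:	return "unknown (would be a bad inode)";
	}
}

int main(void)
{
	printf("%s\n", classify(S_IFREG | 0644));
	printf("%s\n", classify(S_IFDIR | 0755));
	return 0;
}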
17370+/* Initialize inode from disk data. Called with inode locked.
17371+ Return inode locked. */
17372+static int init_inode(struct inode *inode /* inode to initialise */ ,
17373+ coord_t * coord /* coord of stat data */ )
17374+{
17375+ int result;
17376+ item_plugin *iplug;
17377+ void *body;
17378+ int length;
17379+ reiser4_inode *state;
17380+
17381+ assert("nikita-292", coord != NULL);
17382+ assert("nikita-293", inode != NULL);
17383+
17384+ coord_clear_iplug(coord);
17385+ result = zload(coord->node);
17386+ if (result)
17387+ return result;
17388+ iplug = item_plugin_by_coord(coord);
17389+ body = item_body_by_coord(coord);
17390+ length = item_length_by_coord(coord);
17391+
17392+ assert("nikita-295", iplug != NULL);
17393+ assert("nikita-296", body != NULL);
17394+ assert("nikita-297", length > 0);
17395+
17396+ /* inode is under I_LOCK now */
17397+
17398+ state = reiser4_inode_data(inode);
17399+ /* call stat-data plugin method to load sd content into inode */
17400+ result = iplug->s.sd.init_inode(inode, body, length);
17401+ set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug));
17402+ if (result == 0) {
17403+ result = setup_inode_ops(inode, NULL);
17404+ if (result == 0 && inode->i_sb->s_root &&
17405+ inode->i_sb->s_root->d_inode)
17406+ result = finish_pset(inode);
17407+ }
17408+ zrelse(coord->node);
17409+ return result;
17410+}
17411+
17412+/* read `inode' from the disk. This is what was previously in
17413+ reiserfs_read_inode2().
17414+
17415+ Must be called with inode locked. Return inode still locked.
17416+*/
17417+static int read_inode(struct inode *inode /* inode to read from disk */ ,
17418+ const reiser4_key * key /* key of stat data */ ,
17419+ int silent)
17420+{
17421+ int result;
17422+ lock_handle lh;
17423+ reiser4_inode *info;
17424+ coord_t coord;
17425+
17426+ assert("nikita-298", inode != NULL);
17427+ assert("nikita-1945", !is_inode_loaded(inode));
17428+
17429+ info = reiser4_inode_data(inode);
17430+ assert("nikita-300", info->locality_id != 0);
17431+
17432+ coord_init_zero(&coord);
17433+ init_lh(&lh);
17434+ /* locate stat-data in a tree and return znode locked */
17435+ result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17436+ assert("nikita-301", !is_inode_loaded(inode));
17437+ if (result == 0) {
17438+ /* use stat-data plugin to load sd into inode. */
17439+ result = init_inode(inode, &coord);
17440+ if (result == 0) {
17441+ /* initialize stat-data seal */
17442+ spin_lock_inode(inode);
17443+ reiser4_seal_init(&info->sd_seal, &coord, key);
17444+ info->sd_coord = coord;
17445+ spin_unlock_inode(inode);
17446+
17447+ /* call file plugin's method to initialize plugin
17448+ * specific part of inode */
17449+ if (inode_file_plugin(inode)->init_inode_data)
17450+ inode_file_plugin(inode)->init_inode_data(inode,
17451+ NULL,
17452+ 0);
17453+ /* load detached directory cursors for stateless
17454+ * directory readers (NFS). */
17455+ reiser4_load_cursors(inode);
17456+
17457+ /* Check the opened inode for consistency. */
17458+ result =
17459+ get_super_private(inode->i_sb)->df_plug->
17460+ check_open(inode);
17461+ }
17462+ }
17463+ /* lookup_sd() doesn't release coord because we want znode
17464+ stay read-locked while stat-data fields are accessed in
17465+ init_inode() */
17466+ done_lh(&lh);
17467+
17468+ if (result != 0)
17469+ reiser4_make_bad_inode(inode);
17470+ return result;
17471+}
17472+
17473+/* initialise new reiser4 inode being inserted into hash table. */
17474+static int init_locked_inode(struct inode *inode /* new inode */ ,
17475+ void *opaque /* key of stat data passed to the
17476+ * iget5_locked as cookie */ )
17477+{
17478+ reiser4_key *key;
17479+
17480+ assert("nikita-1995", inode != NULL);
17481+ assert("nikita-1996", opaque != NULL);
17482+ key = opaque;
17483+ set_inode_oid(inode, get_key_objectid(key));
17484+ reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17485+ return 0;
17486+}
17487+
17488+/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked().
17489+
17490+ This function is called by iget5_locked() to distinguish reiser4 inodes
17491+ having the same inode numbers. Such inodes can only exist due to some error
17492+ condition. One of them should be bad. Inodes with identical inode numbers
17493+ (objectids) are distinguished by their packing locality.
17494+
17495+*/
17496+static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table to
17497+ * check */ ,
17498+ void *opaque /* "cookie" passed to
17499+ * iget5_locked(). This is stat data
17500+ * key */ )
17501+{
17502+ reiser4_key *key;
17503+
17504+ key = opaque;
17505+ return
17506+ /* oid is unique, so first term is enough, actually. */
17507+ get_inode_oid(inode) == get_key_objectid(key) &&
17508+ /*
17509+ * also, locality should be checked, but locality is stored in
17510+ * the reiser4-specific part of the inode, and actor can be
17511+ * called against arbitrary inode that happened to be in this
17512+ * hash chain. Hence we first have to check that this is
17513+ * reiser4 inode at least. is_reiser4_inode() is probably too
17514+ * early to call, as inode may have ->i_op not yet
17515+ * initialised.
17516+ */
17517+ is_reiser4_super(inode->i_sb) &&
17518+ /*
17519+ * usually objectid is unique, but pseudo files use counter to
17520+ * generate objectid. All pseudo files are placed into special
17521+ * (otherwise unused) locality.
17522+ */
17523+ reiser4_inode_data(inode)->locality_id == get_key_locality(key);
17524+}
17525+
17526+/* hook for kmem_cache_create */
17527+void loading_init_once(reiser4_inode * info)
17528+{
17529+ mutex_init(&info->loading);
17530+}
17531+
17532+/* for reiser4_alloc_inode */
17533+void loading_alloc(reiser4_inode * info)
17534+{
17535+ assert("vs-1717", !mutex_is_locked(&info->loading));
17536+}
17537+
17538+/* for reiser4_destroy */
17539+void loading_destroy(reiser4_inode * info)
17540+{
17541+ assert("vs-1717a", !mutex_is_locked(&info->loading));
17542+}
17543+
17544+static void loading_begin(reiser4_inode * info)
17545+{
17546+ mutex_lock(&info->loading);
17547+}
17548+
17549+static void loading_end(reiser4_inode * info)
17550+{
17551+ mutex_unlock(&info->loading);
17552+}
17553+
17554+/**
17555+ * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
17556+ * @super: super block of filesystem
17557+ * @key: key of inode's stat-data
17558+ * @silent:
17559+ *
17560+ * This is our helper function a la iget(). It is called by
17561+ * lookup_common() and reiser4_read_super(). Returns the inode locked, or the
17562+ * error encountered.
17563+ */
17564+struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
17565+ int silent)
17566+{
17567+ struct inode *inode;
17568+ int result;
17569+ reiser4_inode *info;
17570+
17571+ assert("nikita-302", super != NULL);
17572+ assert("nikita-303", key != NULL);
17573+
17574+ result = 0;
17575+
17576+ /* call iget(). Our ->read_inode() is dummy, so this will either
17577+ find inode in cache or return uninitialised inode */
17578+ inode = iget5_locked(super,
17579+ (unsigned long)get_key_objectid(key),
17580+ reiser4_inode_find_actor,
17581+ init_locked_inode, (reiser4_key *) key);
17582+ if (inode == NULL)
17583+ return ERR_PTR(RETERR(-ENOMEM));
17584+ if (is_bad_inode(inode)) {
17585+ warning("nikita-304", "Bad inode found");
17586+ reiser4_print_key("key", key);
17587+ iput(inode);
17588+ return ERR_PTR(RETERR(-EIO));
17589+ }
17590+
17591+ info = reiser4_inode_data(inode);
17592+
17593+ /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully
17594+ loaded and initialized inode from just allocated inode. If
17595+ REISER4_LOADED bit is not set, reiser4_iget() completes loading under
17596+ info->loading. The place in reiser4 that uses a not yet initialized inode
17597+ is the reiser4 repacker, see repacker-related functions in
17598+ plugin/item/extent.c */
17599+ if (!is_inode_loaded(inode)) {
17600+ loading_begin(info);
17601+ if (!is_inode_loaded(inode)) {
17602+ /* locking: iget5_locked returns locked inode */
17603+ assert("nikita-1941", !is_inode_loaded(inode));
17604+ assert("nikita-1949",
17605+ reiser4_inode_find_actor(inode,
17606+ (reiser4_key *) key));
17607+ /* now, inode has objectid as ->i_ino and locality in
17608+ reiser4-specific part. This is enough for
17609+ read_inode() to read stat data from the disk */
17610+ result = read_inode(inode, key, silent);
17611+ } else
17612+ loading_end(info);
17613+ }
17614+
17615+ if (inode->i_state & I_NEW)
17616+ unlock_new_inode(inode);
17617+
17618+ if (is_bad_inode(inode)) {
17619+ assert("vs-1717", result != 0);
17620+ loading_end(info);
17621+ iput(inode);
17622+ inode = ERR_PTR(result);
17623+ } else if (REISER4_DEBUG) {
17624+ reiser4_key found_key;
17625+
17626+ assert("vs-1717", result == 0);
17627+ build_sd_key(inode, &found_key);
17628+ if (!keyeq(&found_key, key)) {
17629+ warning("nikita-305", "Wrong key in sd");
17630+ reiser4_print_key("sought for", key);
17631+ reiser4_print_key("found", &found_key);
17632+ }
17633+ if (inode->i_nlink == 0) {
17634+ warning("nikita-3559", "Unlinked inode found: %llu\n",
17635+ (unsigned long long)get_inode_oid(inode));
17636+ }
17637+ }
17638+ return inode;
17639+}
17640+
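/*
 * Editorial sketch (not part of the patch): the check / lock / re-check
 * pattern reiser4_iget() uses around the LOADED bit, reduced to
 * pthreads. Simplified: reiser4 releases the mutex later, in
 * reiser4_iget_complete(), and relies on kernel primitives rather than
 * a plain flag for memory ordering.
 */
#include <pthread.h>
#include <stdio.h>

struct toy_inode {
	int loaded;			/* plays the role of REISER4_LOADED */
	pthread_mutex_t loading;
	int payload;
};

static void toy_load(struct toy_inode *ino)
{
	if (!ino->loaded) {		/* cheap unlocked check */
		pthread_mutex_lock(&ino->loading);
		if (!ino->loaded) {	/* re-check under the mutex */
			ino->payload = 42;	/* "read stat data from disk" */
			ino->loaded = 1;
		}
		pthread_mutex_unlock(&ino->loading);
	}
}

int main(void)
{
	struct toy_inode ino = { 0, PTHREAD_MUTEX_INITIALIZER, 0 };

	toy_load(&ino);
	toy_load(&ino);		/* second call takes the fast path */
	printf("payload=%d\n", ino.payload);
	return 0;
}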
17641+/* reiser4_iget() may return a not fully initialized inode; this function
17642+ * should be called once reiser4 inode initialization is complete. */
17643+void reiser4_iget_complete(struct inode *inode)
17644+{
17645+ assert("zam-988", is_reiser4_inode(inode));
17646+
17647+ if (!is_inode_loaded(inode)) {
17648+ reiser4_inode_set_flag(inode, REISER4_LOADED);
17649+ loading_end(reiser4_inode_data(inode));
17650+ }
17651+}
17652+
17653+void reiser4_make_bad_inode(struct inode *inode)
17654+{
17655+ assert("nikita-1934", inode != NULL);
17656+
17657+ /* clear LOADED bit */
17658+ reiser4_inode_clr_flag(inode, REISER4_LOADED);
17659+ make_bad_inode(inode);
17660+ return;
17661+}
17662+
17663+file_plugin *inode_file_plugin(const struct inode * inode)
17664+{
17665+ assert("nikita-1997", inode != NULL);
17666+ return reiser4_inode_data(inode)->pset->file;
17667+}
17668+
17669+dir_plugin *inode_dir_plugin(const struct inode * inode)
17670+{
17671+ assert("nikita-1998", inode != NULL);
17672+ return reiser4_inode_data(inode)->pset->dir;
17673+}
17674+
17675+formatting_plugin *inode_formatting_plugin(const struct inode * inode)
17676+{
17677+ assert("nikita-2000", inode != NULL);
17678+ return reiser4_inode_data(inode)->pset->formatting;
17679+}
17680+
17681+hash_plugin *inode_hash_plugin(const struct inode * inode)
17682+{
17683+ assert("nikita-2001", inode != NULL);
17684+ return reiser4_inode_data(inode)->pset->hash;
17685+}
17686+
17687+fibration_plugin *inode_fibration_plugin(const struct inode * inode)
17688+{
17689+ assert("nikita-2001", inode != NULL);
17690+ return reiser4_inode_data(inode)->pset->fibration;
17691+}
17692+
17693+cipher_plugin *inode_cipher_plugin(const struct inode * inode)
17694+{
17695+ assert("edward-36", inode != NULL);
17696+ return reiser4_inode_data(inode)->pset->cipher;
17697+}
17698+
17699+compression_plugin *inode_compression_plugin(const struct inode * inode)
17700+{
17701+ assert("edward-37", inode != NULL);
17702+ return reiser4_inode_data(inode)->pset->compression;
17703+}
17704+
17705+compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
17706+ inode)
17707+{
17708+ assert("edward-1330", inode != NULL);
17709+ return reiser4_inode_data(inode)->pset->compression_mode;
17710+}
17711+
17712+cluster_plugin *inode_cluster_plugin(const struct inode * inode)
17713+{
17714+ assert("edward-1328", inode != NULL);
17715+ return reiser4_inode_data(inode)->pset->cluster;
17716+}
17717+
17718+file_plugin *inode_create_plugin(const struct inode * inode)
17719+{
17720+ assert("edward-1329", inode != NULL);
17721+ return reiser4_inode_data(inode)->pset->create;
17722+}
17723+
17724+digest_plugin *inode_digest_plugin(const struct inode * inode)
17725+{
17726+ assert("edward-86", inode != NULL);
17727+ return reiser4_inode_data(inode)->pset->digest;
17728+}
17729+
17730+item_plugin *inode_sd_plugin(const struct inode * inode)
17731+{
17732+ assert("vs-534", inode != NULL);
17733+ return reiser4_inode_data(inode)->pset->sd;
17734+}
17735+
17736+item_plugin *inode_dir_item_plugin(const struct inode * inode)
17737+{
17738+ assert("vs-534", inode != NULL);
17739+ return reiser4_inode_data(inode)->pset->dir_item;
17740+}
17741+
17742+file_plugin *child_create_plugin(const struct inode * inode)
17743+{
17744+ assert("edward-1329", inode != NULL);
17745+ return reiser4_inode_data(inode)->hset->create;
17746+}
17747+
17748+void inode_set_extension(struct inode *inode, sd_ext_bits ext)
17749+{
17750+ reiser4_inode *state;
17751+
17752+ assert("nikita-2716", inode != NULL);
17753+ assert("nikita-2717", ext < LAST_SD_EXTENSION);
17754+ assert("nikita-3491", spin_inode_is_locked(inode));
17755+
17756+ state = reiser4_inode_data(inode);
17757+ state->extmask |= 1 << ext;
17758+ /* force re-calculation of stat-data length on next call to
17759+ update_sd(). */
17760+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17761+}
17762+
17763+void inode_clr_extension(struct inode *inode, sd_ext_bits ext)
17764+{
17765+ reiser4_inode *state;
17766+
17767+ assert("vpf-1926", inode != NULL);
17768+ assert("vpf-1927", ext < LAST_SD_EXTENSION);
17769+ assert("vpf-1928", spin_inode_is_locked(inode));
17770+
17771+ state = reiser4_inode_data(inode);
17772+ state->extmask &= ~(1 << ext);
17773+ /* force re-calculation of stat-data length on next call to
17774+ update_sd(). */
17775+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17776+}
17777+
17778+void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
17779+{
17780+ assert("edward-1287", inode != NULL);
17781+ if (!dscale_fit(old, new))
17782+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17783+ return;
17784+}
17785+
17786+void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
17787+{
17788+ assert("nikita-2875", inode != NULL);
17789+ spin_lock_inode(inode);
17790+ inode_check_scale_nolock(inode, old, new);
17791+ spin_unlock_inode(inode);
17792+}
17793+
17794+/*
17795+ * initialize ->ordering field of inode. This field defines how file stat-data
17796+ * and body are ordered within a tree with respect to other objects within the
17797+ * same parent directory.
17798+ */
17799+void
17800+init_inode_ordering(struct inode *inode,
17801+ reiser4_object_create_data * crd, int create)
17802+{
17803+ reiser4_key key;
17804+
17805+ if (create) {
17806+ struct inode *parent;
17807+
17808+ parent = crd->parent;
17809+ assert("nikita-3224", inode_dir_plugin(parent) != NULL);
17810+ inode_dir_plugin(parent)->build_entry_key(parent,
17811+ &crd->dentry->d_name,
17812+ &key);
17813+ } else {
17814+ coord_t *coord;
17815+
17816+ coord = &reiser4_inode_data(inode)->sd_coord;
17817+ coord_clear_iplug(coord);
17818+ /* safe to use ->sd_coord, because node is under long term
17819+ * lock */
17820+ WITH_DATA(coord->node, item_key_by_coord(coord, &key));
17821+ }
17822+
17823+ set_inode_ordering(inode, get_key_ordering(&key));
17824+}
17825+
17826+znode *inode_get_vroot(struct inode *inode)
17827+{
17828+ reiser4_block_nr blk;
17829+ znode *result;
17830+
17831+ spin_lock_inode(inode);
17832+ blk = reiser4_inode_data(inode)->vroot;
17833+ spin_unlock_inode(inode);
17834+ if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
17835+ result = zlook(reiser4_tree_by_inode(inode), &blk);
17836+ else
17837+ result = NULL;
17838+ return result;
17839+}
17840+
17841+void inode_set_vroot(struct inode *inode, znode *vroot)
17842+{
17843+ spin_lock_inode(inode);
17844+ reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
17845+ spin_unlock_inode(inode);
17846+}
17847+
17848+#if REISER4_DEBUG
17849+
17850+void reiser4_inode_invariant(const struct inode *inode)
17851+{
17852+ assert("nikita-3077", spin_inode_is_locked(inode));
17853+}
17854+
17855+int inode_has_no_jnodes(reiser4_inode * r4_inode)
17856+{
17857+ return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
17858+ r4_inode->nr_jnodes == 0;
17859+}
17860+
17861+#endif
17862+
17863+/* return 0 if the directory is empty (only dot and dotdot), -ENOTEMPTY otherwise */
17864+/* FIXME: shouldn't it be dir plugin method? */
17865+int is_dir_empty(const struct inode *dir)
17866+{
17867+ assert("nikita-1976", dir != NULL);
17868+
17869+ /* rely on our method to maintain directory i_size being equal to the
17870+ number of entries. */
17871+ return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
17872+}
17873+
17874+/* Make Linus happy.
17875+ Local variables:
17876+ c-indentation-style: "K&R"
17877+ mode-name: "LC"
17878+ c-basic-offset: 8
17879+ tab-width: 8
17880+ fill-column: 120
17881+ End:
17882+*/
17883diff -urN linux-2.6.24.orig/fs/reiser4/inode.h linux-2.6.24/fs/reiser4/inode.h
17884--- linux-2.6.24.orig/fs/reiser4/inode.h 1970-01-01 03:00:00.000000000 +0300
17885+++ linux-2.6.24/fs/reiser4/inode.h 2008-01-25 11:39:06.936207689 +0300
17886@@ -0,0 +1,449 @@
17887+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17888+
17889+/* Inode functions. */
17890+
17891+#if !defined( __REISER4_INODE_H__ )
17892+#define __REISER4_INODE_H__
17893+
17894+#include "forward.h"
17895+#include "debug.h"
17896+#include "key.h"
17897+#include "seal.h"
17898+#include "plugin/plugin.h"
17899+#include "plugin/file/cryptcompress.h"
17900+#include "plugin/file/file.h"
17901+#include "plugin/dir/dir.h"
17902+#include "plugin/plugin_set.h"
17903+#include "plugin/security/perm.h"
17904+#include "vfs_ops.h"
17905+#include "jnode.h"
17906+#include "fsdata.h"
17907+
17908+#include <linux/types.h> /* for __u?? , ino_t */
17909+#include <linux/fs.h> /* for struct super_block, struct
17910+ * rw_semaphore, etc */
17911+#include <linux/spinlock.h>
17912+#include <asm/types.h>
17913+
17914+/* reiser4-specific inode flags. They are "transient" and are not
17915+ supposed to be stored on disk. Used to track the "state" of the
17916+ inode.
17917+*/
17918+typedef enum {
17919+ /* this is light-weight inode, inheriting some state from its
17920+ parent */
17921+ REISER4_LIGHT_WEIGHT = 0,
17922+ /* stat data wasn't yet created */
17923+ REISER4_NO_SD = 1,
17924+ /* internal immutable flag. It is currently only used
17925+ to avoid a race condition during file creation.
17926+ See comment in create_object(). */
17927+ REISER4_IMMUTABLE = 2,
17928+ /* inode was read from storage */
17929+ REISER4_LOADED = 3,
17930+ /* this bit is set for symlinks. inode->i_private points to target
17931+ name of symlink. */
17932+ REISER4_GENERIC_PTR_USED = 4,
17933+ /* set if size of stat-data item for this inode is known. If this is
17934+ * set we can avoid recalculating size of stat-data on each update. */
17935+ REISER4_SDLEN_KNOWN = 5,
17936+ /* reiser4_inode->crypt points to the crypto stat */
17937+ REISER4_CRYPTO_STAT_LOADED = 6,
17938+ /* cryptcompress_inode_data points to the secret key */
17939+ REISER4_SECRET_KEY_INSTALLED = 7,
17940+ /* File (possibly) has pages corresponding to the tail items, that
17941+ * were created by ->readpage. It is set by mmap_unix_file() and
17942+ * sendfile_unix_file(). This bit is inspected by write_unix_file and
17943+ * kill-hook of tail items. It is never cleared once set. This bit is
17944+ * modified and inspected under i_mutex. */
17945+ REISER4_HAS_MMAP = 8,
17946+ REISER4_PART_MIXED = 9,
17947+ REISER4_PART_IN_CONV = 10,
17948+ /* This flag indicates that file plugin conversion is in progress */
17949+ REISER4_FILE_CONV_IN_PROGRESS = 11
17950+} reiser4_file_plugin_flags;
17951+
17952+/* state associated with each inode.
17953+ reiser4 inode.
17954+
17955+ NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
17956+ be of the same size. File-system allocates inodes by itself through
17957+ s_op->allocate_inode() method. So, it is possible to adjust size of inode
17958+ at the time of its creation.
17959+
17960+ Invariants involving parts of this data-type:
17961+
17962+ [inode->eflushed]
17963+
17964+*/
17965+
17966+typedef struct reiser4_inode reiser4_inode;
17967+/* return pointer to reiser4-specific part of inode */
17968+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
17969+ /* inode queried */ );
17970+
17971+#if BITS_PER_LONG == 64
17972+
17973+#define REISER4_INO_IS_OID (1)
17974+typedef struct {
17975+} oid_hi_t;
17976+
17977+/* BITS_PER_LONG == 64 */
17978+#else
17979+
17980+#define REISER4_INO_IS_OID (0)
17981+typedef __u32 oid_hi_t;
17982+
17983+/* BITS_PER_LONG == 64 */
17984+#endif
17985+
17986+struct reiser4_inode {
17987+ /* spin lock protecting fields of this structure. */
17988+ spinlock_t guard;
17989+ /* main plugin set that control the file
17990+ (see comments in plugin/plugin_set.c) */
17991+ plugin_set *pset;
17992+ /* plugin set for inheritance
17993+ (see comments in plugin/plugin_set.c) */
17994+ plugin_set *hset;
17995+ /* high 32 bits of object id */
17996+ oid_hi_t oid_hi;
17997+ /* seal for stat-data */
17998+ seal_t sd_seal;
17999+ /* locality id for this file */
18000+ oid_t locality_id;
18001+#if REISER4_LARGE_KEY
18002+ __u64 ordering;
18003+#endif
18004+ /* coord of stat-data in sealed node */
18005+ coord_t sd_coord;
18006+ /* bit-mask of stat-data extensions used by this file */
18007+ __u64 extmask;
18008+ /* bitmask of non-default plugins for this inode */
18009+ __u16 plugin_mask;
18010+ /* bitmask of set heir plugins for this inode. */
18011+ __u16 heir_mask;
18012+ union {
18013+ struct list_head readdir_list;
18014+ struct list_head not_used;
18015+ } lists;
18016+ /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
18017+ unsigned long flags;
18018+ union {
18019+ /* fields specific to unix_file plugin */
18020+ struct unix_file_info unix_file_info;
18021+ /* fields specific to cryptcompress file plugin */
18022+ struct cryptcompress_info cryptcompress_info;
18023+ } file_plugin_data;
18024+
18025+ /* this semaphore is to serialize readers and writers of @pset->file
18026+ * when file plugin conversion is enabled
18027+ */
18028+ struct rw_semaphore conv_sem;
18029+
18030+ /* tree of jnodes. Phantom jnodes (ones not attached to any atom) are
18031+ tagged in that tree by EFLUSH_TAG_ANONYMOUS */
18032+ struct radix_tree_root jnodes_tree;
18033+#if REISER4_DEBUG
18034+ /* number of unformatted node jnodes of this file in jnode hash table */
18035+ unsigned long nr_jnodes;
18036+#endif
18037+
18038+ /* block number of virtual root for this object. See comment above
18039+ * fs/reiser4/search.c:handle_vroot() */
18040+ reiser4_block_nr vroot;
18041+ struct mutex loading;
18042+};
18043+
18044+void loading_init_once(reiser4_inode *);
18045+void loading_alloc(reiser4_inode *);
18046+void loading_destroy(reiser4_inode *);
18047+
18048+struct reiser4_inode_object {
18049+ /* private part */
18050+ reiser4_inode p;
18051+ /* generic fields not specific to reiser4, but used by VFS */
18052+ struct inode vfs_inode;
18053+};
18054+
18055+/* return pointer to the reiser4 specific portion of @inode */
18056+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18057+ /* inode queried */ )
18058+{
18059+ assert("nikita-254", inode != NULL);
18060+ return &container_of(inode, struct reiser4_inode_object, vfs_inode)->p;
18061+}
18062+
18063+static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
18064+ r4_inode /* inode queried */
18065+ )
18066+{
18067+ return &container_of(r4_inode, struct reiser4_inode_object, p)->vfs_inode;
18068+}
18069+
18070+/*
18071+ * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
18072+ * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
18073+ * bits.
18074+ *
18075+ * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
18076+ * of inode, otherwise whole oid is stored in i_ino.
18077+ *
18078+ * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
18079+ */
18080+
18081+#define OID_HI_SHIFT (sizeof(ino_t) * 8)
18082+
18083+#if REISER4_INO_IS_OID
18084+
18085+static inline oid_t get_inode_oid(const struct inode *inode)
18086+{
18087+ return inode->i_ino;
18088+}
18089+
18090+static inline void set_inode_oid(struct inode *inode, oid_t oid)
18091+{
18092+ inode->i_ino = oid;
18093+}
18094+
18095+/* REISER4_INO_IS_OID */
18096+#else
18097+
18098+static inline oid_t get_inode_oid(const struct inode *inode)
18099+{
18100+ return
18101+ ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18102+ inode->i_ino;
18103+}
18104+
18105+static inline void set_inode_oid(struct inode *inode, oid_t oid)
18106+{
18107+ assert("nikita-2519", inode != NULL);
18108+ inode->i_ino = (ino_t) (oid);
18109+ reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
18110+ assert("nikita-2521", get_inode_oid(inode) == (oid));
18111+}
18112+
18113+/* REISER4_INO_IS_OID */
18114+#endif
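+
+/*
+ * A minimal illustration (not part of the original code) of the split the
+ * wrappers above perform when ino_t is 32 bits wide, i.e. when
+ * OID_HI_SHIFT == 32:
+ *
+ *	oid_t oid = 0x0000000500000007ULL;
+ *
+ *	set_inode_oid(inode, oid);
+ *	// inode->i_ino                      == 0x00000007
+ *	// reiser4_inode_data(inode)->oid_hi == 0x00000005
+ *	assert("example", get_inode_oid(inode) == oid);
+ *
+ * On 64-bit hosts REISER4_INO_IS_OID is 1, the whole oid fits into ->i_ino,
+ * and oid_hi_t degenerates to an empty type.
+ */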
18115+
18116+static inline oid_t get_inode_locality(const struct inode *inode)
18117+{
18118+ return reiser4_inode_data(inode)->locality_id;
18119+}
18120+
18121+#if REISER4_LARGE_KEY
18122+static inline __u64 get_inode_ordering(const struct inode *inode)
18123+{
18124+ return reiser4_inode_data(inode)->ordering;
18125+}
18126+
18127+static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18128+{
18129+ reiser4_inode_data(inode)->ordering = ordering;
18130+}
18131+
18132+#else
18133+
18134+#define get_inode_ordering(inode) (0)
18135+#define set_inode_ordering(inode, val) noop
18136+
18137+#endif
18138+
18139+/* return inode in which @uf_info is embedded */
18140+static inline struct inode *
18141+unix_file_info_to_inode(const struct unix_file_info * uf_info)
18142+{
18143+ return &container_of(uf_info, struct reiser4_inode_object,
18144+ p.file_plugin_data.unix_file_info)->vfs_inode;
18145+}
18146+
18147+extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18148+extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18149+
18150+extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode);
18151+
18152+#if REISER4_DEBUG
18153+extern void reiser4_inode_invariant(const struct inode *inode);
18154+extern int inode_has_no_jnodes(reiser4_inode *);
18155+#else
18156+#define reiser4_inode_invariant(inode) noop
18157+#endif
18158+
18159+static inline int spin_inode_is_locked(const struct inode *inode)
18160+{
18161+ assert_spin_locked(&reiser4_inode_data(inode)->guard);
18162+ return 1;
18163+}
18164+
18165+/**
18166+ * spin_lock_inode - lock reiser4_inode's embedded spinlock
18167+ * @inode: inode to lock
18168+ *
18169+ * In debug mode it checks that lower priority locks are not held and
18170+ * increments reiser4_context's lock counters on which lock ordering checking
18171+ * is based.
18172+ */
18173+static inline void spin_lock_inode(struct inode *inode)
18174+{
18175+ assert("", LOCK_CNT_NIL(spin_locked));
18176+ /* check lock ordering */
18177+ assert_spin_not_locked(&d_lock);
18178+
18179+ spin_lock(&reiser4_inode_data(inode)->guard);
18180+
18181+ LOCK_CNT_INC(spin_locked_inode);
18182+ LOCK_CNT_INC(spin_locked);
18183+
18184+ reiser4_inode_invariant(inode);
18185+}
18186+
18187+/**
18188+ * spin_unlock_inode - unlock reiser4_inode's embedded spinlock
18189+ * @inode: inode to unlock
18190+ *
18191+ * In debug mode it checks that spinlock is held and decrements
18192+ * reiser4_context's lock counters on which lock ordering checking is based.
18193+ */
18194+static inline void spin_unlock_inode(struct inode *inode)
18195+{
18196+ assert_spin_locked(&reiser4_inode_data(inode)->guard);
18197+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18198+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18199+
18200+ reiser4_inode_invariant(inode);
18201+
18202+ LOCK_CNT_DEC(spin_locked_inode);
18203+ LOCK_CNT_DEC(spin_locked);
18204+
18205+ spin_unlock(&reiser4_inode_data(inode)->guard);
18206+}
18207+
18208+extern znode *inode_get_vroot(struct inode *inode);
18209+extern void inode_set_vroot(struct inode *inode, znode * vroot);
18210+
18211+extern int reiser4_max_filename_len(const struct inode *inode);
18212+extern int max_hash_collisions(const struct inode *dir);
18213+extern void reiser4_unlock_inode(struct inode *inode);
18214+extern int is_reiser4_inode(const struct inode *inode);
18215+extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18216+extern struct inode *reiser4_iget(struct super_block *super,
18217+ const reiser4_key * key, int silent);
18218+extern void reiser4_iget_complete(struct inode *inode);
18219+extern void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
18220+extern void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
18221+extern int reiser4_inode_get_flag(const struct inode *inode,
18222+ reiser4_file_plugin_flags f);
18223+
18224+/* has inode been initialized? */
18225+static inline int
18226+is_inode_loaded(const struct inode *inode /* inode queried */ )
18227+{
18228+ assert("nikita-1120", inode != NULL);
18229+ return reiser4_inode_get_flag(inode, REISER4_LOADED);
18230+}
18231+
18232+extern file_plugin *inode_file_plugin(const struct inode *inode);
18233+extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18234+extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18235+extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18236+extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18237+extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18238+extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18239+extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18240+extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18241+ *inode);
18242+extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18243+extern file_plugin *inode_create_plugin(const struct inode *inode);
18244+extern item_plugin *inode_sd_plugin(const struct inode *inode);
18245+extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18246+extern file_plugin *child_create_plugin(const struct inode *inode);
18247+
18248+extern void reiser4_make_bad_inode(struct inode *inode);
18249+
18250+extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18251+extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext);
18252+extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18253+extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new);
18254+
18255+#define INODE_SET_SIZE(i, value) \
18256+({ \
18257+ struct inode *__i; \
18258+ typeof(value) __v; \
18259+ \
18260+ __i = (i); \
18261+ __v = (value); \
18262+ inode_check_scale(__i, __i->i_size, __v); \
18263+ i_size_write(__i, __v); \
18264+})
18265+
18266+/*
18267+ * update field @field in inode @i to contain value @value.
18268+ */
18269+#define INODE_SET_FIELD(i, field, value) \
18270+({ \
18271+ struct inode *__i; \
18272+ typeof(value) __v; \
18273+ \
18274+ __i = (i); \
18275+ __v = (value); \
18276+ inode_check_scale(__i, __i->field, __v); \
18277+ __i->field = __v; \
18278+})
18279+
18280+#define INODE_INC_FIELD(i, field) \
18281+({ \
18282+ struct inode *__i; \
18283+ \
18284+ __i = (i); \
18285+ inode_check_scale(__i, __i->field, __i->field + 1); \
18286+ ++ __i->field; \
18287+})
18288+
18289+#define INODE_DEC_FIELD(i, field) \
18290+({ \
18291+ struct inode *__i; \
18292+ \
18293+ __i = (i); \
18294+ inode_check_scale(__i, __i->field, __i->field - 1); \
18295+ -- __i->field; \
18296+})
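+
+/*
+ * Usage sketch for the helpers above (illustrative only, not from the
+ * original code). Each macro evaluates its inode and value arguments once
+ * and routes the old and new values through inode_check_scale() before
+ * updating the field:
+ *
+ *	INODE_SET_SIZE(inode, new_size);	// uses i_size_write()
+ *	INODE_SET_FIELD(inode, i_blocks, blocks);
+ *	INODE_INC_FIELD(inode, i_nlink);
+ *	INODE_DEC_FIELD(inode, i_nlink);
+ */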
18297+
18298+/* See comment before reiser4_readdir_common() for description. */
18299+static inline struct list_head *get_readdir_list(const struct inode *inode)
18300+{
18301+ return &reiser4_inode_data(inode)->lists.readdir_list;
18302+}
18303+
18304+extern void init_inode_ordering(struct inode *inode,
18305+ reiser4_object_create_data * crd, int create);
18306+
18307+static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18308+{
18309+ return &reiser4_inode_data(inode)->jnodes_tree;
18310+}
18311+
18312+static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
18313+ * r4_inode)
18314+{
18315+ return &r4_inode->jnodes_tree;
18316+}
18317+
18318+#if REISER4_DEBUG
18319+extern void print_inode(const char *prefix, const struct inode *i);
18320+#endif
18321+
18322+int is_dir_empty(const struct inode *);
18323+
18324+/* __REISER4_INODE_H__ */
18325+#endif
18326+
18327+/* Make Linus happy.
18328+ Local variables:
18329+ c-indentation-style: "K&R"
18330+ mode-name: "LC"
18331+ c-basic-offset: 8
18332+ tab-width: 8
18333+ fill-column: 120
18334+ End:
18335+*/
18336diff -urN linux-2.6.24.orig/fs/reiser4/ioctl.h linux-2.6.24/fs/reiser4/ioctl.h
18337--- linux-2.6.24.orig/fs/reiser4/ioctl.h 1970-01-01 03:00:00.000000000 +0300
18338+++ linux-2.6.24/fs/reiser4/ioctl.h 2008-01-25 11:39:06.936207689 +0300
18339@@ -0,0 +1,41 @@
18340+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18341+ * reiser4/README */
18342+
18343+#if !defined( __REISER4_IOCTL_H__ )
18344+#define __REISER4_IOCTL_H__
18345+
18346+#include <linux/fs.h>
18347+
18348+/*
18349+ * ioctl(2) command used to "unpack" a reiser4 file, that is, convert it into
18350+ * extents and fix it in this state. This is used by applications that rely on
18351+ *
18352+ * . files being block aligned, and
18353+ *
18354+ * . files never migrating on disk
18355+ *
18356+ * for example, boot loaders (LILO) need this.
18357+ *
18358+ * This ioctl should be used as
18359+ *
18360+ * result = ioctl(fd, REISER4_IOC_UNPACK);
18361+ *
18362+ * The file behind the fd descriptor will be converted to extents (if necessary),
18363+ * and its stat-data will be updated so that it will never be converted back
18364+ * into tails again.
18365+ */
18366+#define REISER4_IOC_UNPACK _IOW(0xCD,1,long)
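+
+/*
+ * User-space sketch (illustrative; the file name and error handling are
+ * examples, not part of the original code):
+ *
+ *	#include <sys/ioctl.h>
+ *	#include <fcntl.h>
+ *	#include <stdio.h>
+ *
+ *	int fd = open("/boot/vmlinuz", O_RDONLY);
+ *	if (fd < 0 || ioctl(fd, REISER4_IOC_UNPACK) != 0)
+ *		perror("REISER4_IOC_UNPACK");
+ */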
18367+
18368+/* __REISER4_IOCTL_H__ */
18369+#endif
18370+
18371+/* Make Linus happy.
18372+ Local variables:
18373+ c-indentation-style: "K&R"
18374+ mode-name: "LC"
18375+ c-basic-offset: 8
18376+ tab-width: 8
18377+ fill-column: 120
18378+ scroll-step: 1
18379+ End:
18380+*/
18381diff -urN linux-2.6.24.orig/fs/reiser4/jnode.c linux-2.6.24/fs/reiser4/jnode.c
18382--- linux-2.6.24.orig/fs/reiser4/jnode.c 1970-01-01 03:00:00.000000000 +0300
18383+++ linux-2.6.24/fs/reiser4/jnode.c 2008-01-25 11:39:06.940208719 +0300
18384@@ -0,0 +1,1924 @@
18385+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18386+ * reiser4/README */
18387+/* Jnode manipulation functions. */
18388+/* Jnode is an entity used to track blocks with data and meta-data in reiser4.
18389+
18390+   In particular, jnodes are used to track transactional information
18391+   associated with each block. Each znode contains a jnode as its ->zjnode field.
18392+
18393+ Jnode stands for either Josh or Journal node.
18394+*/
18395+
18396+/*
18397+ * Taxonomy.
18398+ *
18399+ * Jnode represents block containing data or meta-data. There are jnodes
18400+ * for:
18401+ *
18402+ *     unformatted blocks (jnodes proper). There are plans, however, to
18403+ *     have a handle per extent unit rather than one per unformatted
18404+ *     block, because there are so many of them.
18405+ *
18406+ * For bitmaps. Each bitmap is actually represented by two jnodes--one
18407+ * for working and another for "commit" data, together forming bnode.
18408+ *
18409+ * For io-heads. These are used by log writer.
18410+ *
18411+ * For formatted nodes (znode). See comment at the top of znode.c for
18412+ * details specific to the formatted nodes (znodes).
18413+ *
18414+ * Node data.
18415+ *
18416+ * Jnode provides access to the data of node it represents. Data are
18417+ * stored in a page. Page is kept in a page cache. This means, that jnodes
18418+ * are highly interconnected with page cache and VM internals.
18419+ *
18420+ * jnode has a pointer to page (->pg) containing its data. Pointer to data
18421+ * themselves is cached in ->data field to avoid frequent calls to
18422+ * page_address().
18423+ *
18424+ * jnode and page are attached to each other by jnode_attach_page(). This
18425+ * function places pointer to jnode in set_page_private(), sets PG_private
18426+ * flag and increments page counter.
18427+ *
18428+ * Opposite operation is performed by page_clear_jnode().
18429+ *
18430+ * jnode->pg is protected by jnode spin lock, and page->private is
18431+ * protected by page lock. See comment at the top of page_cache.c for
18432+ * more.
18433+ *
18434+ * page can be detached from jnode for two reasons:
18435+ *
18436+ *     . jnode is removed from a tree (file is truncated, or formatted
18437+ * node is removed by balancing).
18438+ *
18439+ * . during memory pressure, VM calls ->releasepage() method
18440+ * (reiser4_releasepage()) to evict page from memory.
18441+ *
18442+ *     (there, of course, is also umount, but this is a special case we are not
18443+ * concerned with here).
18444+ *
18445+ * To protect jnode page from eviction, one calls jload() function that
18446+ * "pins" page in memory (loading it if necessary), increments
18447+ * jnode->d_count, and kmap()s page. Page is unpinned through call to
18448+ * jrelse().
18449+ *
18450+ * Jnode life cycle.
18451+ *
18452+ * jnode is created, placed in hash table, and, optionally, in per-inode
18453+ * radix tree. Page can be attached to jnode, pinned, released, etc.
18454+ *
18455+ * When jnode is captured into atom its reference counter is
18456+ * increased. While being part of an atom, jnode can be "early
18457+ * flushed". This means that as part of flush procedure, jnode is placed
18458+ * into "relocate set", and its page is submitted to the disk. After io
18459+ * completes, page can be detached, then loaded again, re-dirtied, etc.
18460+ *
18461+ *     A thread acquires a reference to a jnode by calling jref() and releases it by
18462+ * jput(). When last reference is removed, jnode is still retained in
18463+ * memory (cached) if it has page attached, _unless_ it is scheduled for
18464+ * destruction (has JNODE_HEARD_BANSHEE bit set).
18465+ *
18466+ *     The tree read-write lock was used as an "existential" lock for jnodes:
18467+ *     jnode->x_count could be changed from 0 to 1 only under the tree write
18468+ *     lock; that is, the tree lock protected unreferenced jnodes stored in
18469+ *     the hash table from recycling.
18470+ *
18471+ * This resulted in high contention on tree lock, because jref()/jput() is
18472+ * frequent operation. To ameliorate this problem, RCU is used: when jput()
18473+ *     is just about to release the last reference on a jnode it sets the JNODE_RIP
18474+ *     bit on it, and then proceeds with jnode destruction (removing jnode from hash
18475+ * table, cbk_cache, detaching page, etc.). All places that change jnode
18476+ * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
18477+ * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
18478+ * jnode_rip_check() function), and pretend that nothing was found in hash
18479+ * table if bit is set.
18480+ *
18481+ *     jput defers the actual return of a jnode to the slab cache to some later
18482+ *     time (via call_rcu()); this guarantees that other threads can safely
18483+ *     continue working with a JNODE_RIP-ped jnode.
18484+ *
18485+ */
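+
+/*
+ * Putting the pieces above together, the typical pinning protocol looks like
+ * this (a sketch, not part of the original code, assuming the jload() and
+ * jdata() helpers declared in jnode.h):
+ *
+ *	jnode *node = jlookup(tree, oid, index);	// takes x-reference
+ *	if (node != NULL) {
+ *		if (jload(node) == 0) {		// pin page, kmap() data
+ *			// ... access jdata(node) ...
+ *			jrelse(node);		// unpin
+ *		}
+ *		jput(node);			// drop x-reference
+ *	}
+ */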
18486+
18487+#include "reiser4.h"
18488+#include "debug.h"
18489+#include "dformat.h"
18490+#include "jnode.h"
18491+#include "plugin/plugin_header.h"
18492+#include "plugin/plugin.h"
18493+#include "txnmgr.h"
18494+/*#include "jnode.h"*/
18495+#include "znode.h"
18496+#include "tree.h"
18497+#include "tree_walk.h"
18498+#include "super.h"
18499+#include "inode.h"
18500+#include "page_cache.h"
18501+
18502+#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */
18503+#include <linux/types.h>
18504+#include <linux/slab.h>
18505+#include <linux/pagemap.h>
18506+#include <linux/swap.h>
18507+#include <linux/fs.h> /* for struct address_space */
18508+#include <linux/writeback.h> /* for inode_lock */
18509+
18510+static struct kmem_cache *_jnode_slab = NULL;
18511+
18512+static void jnode_set_type(jnode * node, jnode_type type);
18513+static int jdelete(jnode * node);
18514+static int jnode_try_drop(jnode * node);
18515+
18516+#if REISER4_DEBUG
18517+static int jnode_invariant(const jnode * node, int tlocked, int jlocked);
18518+#endif
18519+
18520+/* true if valid page is attached to jnode */
18521+static inline int jnode_is_parsed(jnode * node)
18522+{
18523+ return JF_ISSET(node, JNODE_PARSED);
18524+}
18525+
18526+/* hash table support */
18527+
18528+/* compare two jnode keys for equality. Used by hash-table macros */
18529+static inline int jnode_key_eq(const struct jnode_key * k1,
18530+ const struct jnode_key * k2)
18531+{
18532+ assert("nikita-2350", k1 != NULL);
18533+ assert("nikita-2351", k2 != NULL);
18534+
18535+ return (k1->index == k2->index && k1->objectid == k2->objectid);
18536+}
18537+
18538+/* Hash jnode by its key (inode plus offset). Used by hash-table macros */
18539+static inline __u32 jnode_key_hashfn(j_hash_table * table,
18540+ const struct jnode_key * key)
18541+{
18542+ assert("nikita-2352", key != NULL);
18543+ assert("nikita-3346", IS_POW(table->_buckets));
18544+
18545+	/* yes, this is a remarkably simple (if not stupid) hash function. */
18546+ return (key->objectid + key->index) & (table->_buckets - 1);
18547+}
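+
+/*
+ * Worked example (illustrative): with the default 16384 buckets set up in
+ * jnodes_tree_init() below, a jnode with objectid 0x2a and index 3 lands in
+ * bucket (0x2a + 3) & (16384 - 1) == 45.
+ */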
18548+
18549+/* The hash table definition */
18550+#define KMALLOC(size) reiser4_vmalloc(size)
18551+#define KFREE(ptr, size) vfree(ptr)
18552+TYPE_SAFE_HASH_DEFINE(j, jnode, struct jnode_key, key.j, link.j,
18553+ jnode_key_hashfn, jnode_key_eq);
18554+#undef KFREE
18555+#undef KMALLOC
18556+
18557+/* call this to initialise jnode hash table */
18558+int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
18559+{
18560+ assert("nikita-2359", tree != NULL);
18561+ return j_hash_init(&tree->jhash_table, 16384);
18562+}
18563+
18564+/* call this to destroy jnode hash table. This is called during umount. */
18565+int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ )
18566+{
18567+ j_hash_table *jtable;
18568+ jnode *node;
18569+ jnode *next;
18570+
18571+ assert("nikita-2360", tree != NULL);
18572+
18573+ /*
18574+ * Scan hash table and free all jnodes.
18575+ */
18576+ jtable = &tree->jhash_table;
18577+ if (jtable->_table) {
18578+ for_all_in_htable(jtable, j, node, next) {
18579+ assert("nikita-2361", !atomic_read(&node->x_count));
18580+ jdrop(node);
18581+ }
18582+
18583+ j_hash_done(&tree->jhash_table);
18584+ }
18585+ return 0;
18586+}
18587+
18588+/**
18589+ * init_jnodes - create jnode cache
18590+ *
18591+ * Initializes the jnode slab cache. It is part of reiser4 module initialization.
18592+ */
18593+int init_jnodes(void)
18594+{
18595+ assert("umka-168", _jnode_slab == NULL);
18596+
18597+ _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
18598+ SLAB_HWCACHE_ALIGN |
18599+ SLAB_RECLAIM_ACCOUNT, NULL);
18600+ if (_jnode_slab == NULL)
18601+ return RETERR(-ENOMEM);
18602+
18603+ return 0;
18604+}
18605+
18606+/**
18607+ * done_jnodes - delete jnode cache
18608+ *
18609+ * This is called on reiser4 module unloading or system shutdown.
18610+ */
18611+void done_jnodes(void)
18612+{
18613+ destroy_reiser4_cache(&_jnode_slab);
18614+}
18615+
18616+/* Initialize a jnode. */
18617+void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
18618+{
18619+ assert("umka-175", node != NULL);
18620+
18621+ memset(node, 0, sizeof(jnode));
18622+ ON_DEBUG(node->magic = JMAGIC);
18623+ jnode_set_type(node, type);
18624+ atomic_set(&node->d_count, 0);
18625+ atomic_set(&node->x_count, 0);
18626+ spin_lock_init(&node->guard);
18627+ spin_lock_init(&node->load);
18628+ node->atom = NULL;
18629+ node->tree = tree;
18630+ INIT_LIST_HEAD(&node->capture_link);
18631+
18632+ ASSIGN_NODE_LIST(node, NOT_CAPTURED);
18633+
18634+ INIT_RCU_HEAD(&node->rcu);
18635+
18636+#if REISER4_DEBUG
18637+ {
18638+ reiser4_super_info_data *sbinfo;
18639+
18640+ sbinfo = get_super_private(tree->super);
18641+ spin_lock_irq(&sbinfo->all_guard);
18642+ list_add(&node->jnodes, &sbinfo->all_jnodes);
18643+ spin_unlock_irq(&sbinfo->all_guard);
18644+ }
18645+#endif
18646+}
18647+
18648+#if REISER4_DEBUG
18649+/*
18650+ * Remove jnode from ->all_jnodes list.
18651+ */
18652+static void jnode_done(jnode * node, reiser4_tree * tree)
18653+{
18654+ reiser4_super_info_data *sbinfo;
18655+
18656+ sbinfo = get_super_private(tree->super);
18657+
18658+ spin_lock_irq(&sbinfo->all_guard);
18659+ assert("nikita-2422", !list_empty(&node->jnodes));
18660+ list_del_init(&node->jnodes);
18661+ spin_unlock_irq(&sbinfo->all_guard);
18662+}
18663+#endif
18664+
18665+/* return already existing jnode of page */
18666+jnode *jnode_by_page(struct page *pg)
18667+{
18668+ assert("nikita-2066", pg != NULL);
18669+ assert("nikita-2400", PageLocked(pg));
18670+ assert("nikita-2068", PagePrivate(pg));
18671+ assert("nikita-2067", jprivate(pg) != NULL);
18672+ return jprivate(pg);
18673+}
18674+
18675+/* exported functions to allocate/free jnode objects outside this file */
18676+jnode *jalloc(void)
18677+{
18678+ jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get());
18679+ return jal;
18680+}
18681+
18682+/* return jnode back to the slab allocator */
18683+inline void jfree(jnode * node)
18684+{
18685+ assert("zam-449", node != NULL);
18686+
18687+ assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
18688+ NODE_LIST(node) == NOT_CAPTURED));
18689+ assert("nikita-3222", list_empty(&node->jnodes));
18690+ assert("nikita-3221", jnode_page(node) == NULL);
18691+
18692+ /* not yet phash_jnode_destroy(node); */
18693+
18694+ kmem_cache_free(_jnode_slab, node);
18695+}
18696+
18697+/*
18698+ * This function is supplied as RCU callback. It actually frees jnode when
18699+ * last reference to it is gone.
18700+ */
18701+static void jnode_free_actor(struct rcu_head *head)
18702+{
18703+ jnode *node;
18704+ jnode_type jtype;
18705+
18706+ node = container_of(head, jnode, rcu);
18707+ jtype = jnode_get_type(node);
18708+
18709+ ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
18710+
18711+ switch (jtype) {
18712+ case JNODE_IO_HEAD:
18713+ case JNODE_BITMAP:
18714+ case JNODE_UNFORMATTED_BLOCK:
18715+ jfree(node);
18716+ break;
18717+ case JNODE_FORMATTED_BLOCK:
18718+ zfree(JZNODE(node));
18719+ break;
18720+ case JNODE_INODE:
18721+ default:
18722+ wrong_return_value("nikita-3197", "Wrong jnode type");
18723+ }
18724+}
18725+
18726+/*
18727+ * Free a jnode. Post a callback to be executed later through RCU when all
18728+ * references to @node are released.
18729+ */
18730+static inline void jnode_free(jnode * node, jnode_type jtype)
18731+{
18732+ if (jtype != JNODE_INODE) {
18733+ /*assert("nikita-3219", list_empty(&node->rcu.list)); */
18734+ call_rcu(&node->rcu, jnode_free_actor);
18735+ } else
18736+ jnode_list_remove(node);
18737+}
18738+
18739+/* allocate new unformatted jnode */
18740+static jnode *jnew_unformatted(void)
18741+{
18742+ jnode *jal;
18743+
18744+ jal = jalloc();
18745+ if (jal == NULL)
18746+ return NULL;
18747+
18748+ jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
18749+ jal->key.j.mapping = NULL;
18750+ jal->key.j.index = (unsigned long)-1;
18751+ jal->key.j.objectid = 0;
18752+ return jal;
18753+}
18754+
18755+/* look for jnode with given mapping and offset within hash table */
18756+jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
18757+{
18758+ struct jnode_key jkey;
18759+ jnode *node;
18760+
18761+ assert("nikita-2353", tree != NULL);
18762+
18763+ jkey.objectid = objectid;
18764+ jkey.index = index;
18765+
18766+ /*
18767+ * hash table is _not_ protected by any lock during lookups. All we
18768+ * have to do is to disable preemption to keep RCU happy.
18769+ */
18770+
18771+ rcu_read_lock();
18772+ node = j_hash_find(&tree->jhash_table, &jkey);
18773+ if (node != NULL) {
18774+ /* protect @node from recycling */
18775+ jref(node);
18776+ assert("nikita-2955", jnode_invariant(node, 0, 0));
18777+ node = jnode_rip_check(tree, node);
18778+ }
18779+ rcu_read_unlock();
18780+ return node;
18781+}
18782+
18783+/* per inode radix tree of jnodes is protected by tree's read write spin lock */
18784+static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
18785+{
18786+ assert("vs-1694", mapping->host != NULL);
18787+
18788+ return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
18789+}
18790+
18791+jnode *jfind(struct address_space * mapping, unsigned long index)
18792+{
18793+ reiser4_tree *tree;
18794+ jnode *node;
18795+
18796+ assert("vs-1694", mapping->host != NULL);
18797+ tree = reiser4_tree_by_inode(mapping->host);
18798+
18799+ read_lock_tree(tree);
18800+ node = jfind_nolock(mapping, index);
18801+ if (node != NULL)
18802+ jref(node);
18803+ read_unlock_tree(tree);
18804+ return node;
18805+}
18806+
18807+static void inode_attach_jnode(jnode * node)
18808+{
18809+ struct inode *inode;
18810+ reiser4_inode *info;
18811+ struct radix_tree_root *rtree;
18812+
18813+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18814+ assert("zam-1043", node->key.j.mapping != NULL);
18815+ inode = node->key.j.mapping->host;
18816+ info = reiser4_inode_data(inode);
18817+ rtree = jnode_tree_by_reiser4_inode(info);
18818+ if (rtree->rnode == NULL) {
18819+ /* prevent inode from being pruned when it has jnodes attached
18820+ to it */
18821+ write_lock_irq(&inode->i_data.tree_lock);
18822+ inode->i_data.nrpages++;
18823+ write_unlock_irq(&inode->i_data.tree_lock);
18824+ }
18825+ assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
18826+ check_me("zam-1045",
18827+ !radix_tree_insert(rtree, node->key.j.index, node));
18828+ ON_DEBUG(info->nr_jnodes++);
18829+}
18830+
18831+static void inode_detach_jnode(jnode * node)
18832+{
18833+ struct inode *inode;
18834+ reiser4_inode *info;
18835+ struct radix_tree_root *rtree;
18836+
18837+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18838+ assert("zam-1044", node->key.j.mapping != NULL);
18839+ inode = node->key.j.mapping->host;
18840+ info = reiser4_inode_data(inode);
18841+ rtree = jnode_tree_by_reiser4_inode(info);
18842+
18843+ assert("zam-1051", info->nr_jnodes != 0);
18844+ assert("zam-1052", rtree->rnode != NULL);
18845+ ON_DEBUG(info->nr_jnodes--);
18846+
18847+ /* delete jnode from inode's radix tree of jnodes */
18848+ check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
18849+ if (rtree->rnode == NULL) {
18850+ /* inode can be pruned now */
18851+ write_lock_irq(&inode->i_data.tree_lock);
18852+ inode->i_data.nrpages--;
18853+ write_unlock_irq(&inode->i_data.tree_lock);
18854+ }
18855+}
18856+
18857+/* put jnode into the hash table (where it can be found by flush, which does not
18858+   know the mapping) and into the inode's tree of jnodes (where it can be found,
18859+   hopefully faster, in places where the mapping is known). Currently it is used
18860+   by fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when a new
18861+   jnode is created */
18862+static void
18863+hash_unformatted_jnode(jnode * node, struct address_space *mapping,
18864+ unsigned long index)
18865+{
18866+ j_hash_table *jtable;
18867+
18868+ assert("vs-1446", jnode_is_unformatted(node));
18869+ assert("vs-1442", node->key.j.mapping == 0);
18870+ assert("vs-1443", node->key.j.objectid == 0);
18871+ assert("vs-1444", node->key.j.index == (unsigned long)-1);
18872+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18873+
18874+ node->key.j.mapping = mapping;
18875+ node->key.j.objectid = get_inode_oid(mapping->host);
18876+ node->key.j.index = index;
18877+
18878+ jtable = &jnode_get_tree(node)->jhash_table;
18879+
18880+ /* race with some other thread inserting jnode into the hash table is
18881+ * impossible, because we keep the page lock. */
18882+ /*
18883+ * following assertion no longer holds because of RCU: it is possible
18884+ * jnode is in the hash table, but with JNODE_RIP bit set.
18885+ */
18886+ /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
18887+ j_hash_insert_rcu(jtable, node);
18888+ inode_attach_jnode(node);
18889+}
18890+
18891+static void unhash_unformatted_node_nolock(jnode * node)
18892+{
18893+ assert("vs-1683", node->key.j.mapping != NULL);
18894+ assert("vs-1684",
18895+ node->key.j.objectid ==
18896+ get_inode_oid(node->key.j.mapping->host));
18897+
18898+ /* remove jnode from hash-table */
18899+ j_hash_remove_rcu(&node->tree->jhash_table, node);
18900+ inode_detach_jnode(node);
18901+ node->key.j.mapping = NULL;
18902+ node->key.j.index = (unsigned long)-1;
18903+ node->key.j.objectid = 0;
18904+
18905+}
18906+
18907+/* remove jnode from hash table and from inode's tree of jnodes. This is used in
18908+ reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
18909+ reiser4_uncapture_jnode */
18910+void unhash_unformatted_jnode(jnode * node)
18911+{
18912+ assert("vs-1445", jnode_is_unformatted(node));
18913+
18914+ write_lock_tree(node->tree);
18915+ unhash_unformatted_node_nolock(node);
18916+ write_unlock_tree(node->tree);
18917+}
18918+
18919+/*
18920+ * search hash table for a jnode with given oid and index. If not found,
18921+ * allocate new jnode, insert it, and also insert into radix tree for the
18922+ * given inode/mapping.
18923+ */
18924+static jnode *find_get_jnode(reiser4_tree * tree,
18925+ struct address_space *mapping,
18926+ oid_t oid, unsigned long index)
18927+{
18928+ jnode *result;
18929+ jnode *shadow;
18930+ int preload;
18931+
18932+ result = jnew_unformatted();
18933+
18934+ if (unlikely(result == NULL))
18935+ return ERR_PTR(RETERR(-ENOMEM));
18936+
18937+ preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
18938+	if (preload != 0) {
18939+		/* don't leak the jnode allocated above */
18940+		jnode_free(result, JNODE_UNFORMATTED_BLOCK);
18941+		return ERR_PTR(preload);
18942+	}
18940+
18941+ write_lock_tree(tree);
18942+ shadow = jfind_nolock(mapping, index);
18943+ if (likely(shadow == NULL)) {
18944+ /* add new jnode to hash table and inode's radix tree of jnodes */
18945+ jref(result);
18946+ hash_unformatted_jnode(result, mapping, index);
18947+ } else {
18948+ /* jnode is found in inode's radix tree of jnodes */
18949+ jref(shadow);
18950+ jnode_free(result, JNODE_UNFORMATTED_BLOCK);
18951+ assert("vs-1498", shadow->key.j.mapping == mapping);
18952+ result = shadow;
18953+ }
18954+ write_unlock_tree(tree);
18955+
18956+ assert("nikita-2955",
18957+ ergo(result != NULL, jnode_invariant(result, 0, 0)));
18958+ radix_tree_preload_end();
18959+ return result;
18960+}
18961+
18962+/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
18963+ creates) jnode corresponding to page @pg. jnode is attached to page and
18964+ inserted into jnode hash-table. */
18965+static jnode *do_jget(reiser4_tree * tree, struct page *pg)
18966+{
18967+ /*
18968+ * There are two ways to create jnode: starting with pre-existing page
18969+ * and without page.
18970+ *
18971+ * When page already exists, jnode is created
18972+ * (jnode_of_page()->do_jget()) under page lock. This is done in
18973+ * ->writepage(), or when capturing anonymous page dirtied through
18974+ * mmap.
18975+ *
18976+ * Jnode without page is created by index_extent_jnode().
18977+ *
18978+ */
18979+
18980+ jnode *result;
18981+ oid_t oid = get_inode_oid(pg->mapping->host);
18982+
18983+ assert("umka-176", pg != NULL);
18984+ assert("nikita-2394", PageLocked(pg));
18985+
18986+ result = jprivate(pg);
18987+ if (likely(result != NULL))
18988+ return jref(result);
18989+
18990+ tree = reiser4_tree_by_page(pg);
18991+
18992+ /* check hash-table first */
18993+ result = jfind(pg->mapping, pg->index);
18994+ if (unlikely(result != NULL)) {
18995+ spin_lock_jnode(result);
18996+ jnode_attach_page(result, pg);
18997+ spin_unlock_jnode(result);
18998+ result->key.j.mapping = pg->mapping;
18999+ return result;
19000+ }
19001+
19002+ /* since page is locked, jnode should be allocated with GFP_NOFS flag */
19003+ reiser4_ctx_gfp_mask_force(GFP_NOFS);
19004+ result = find_get_jnode(tree, pg->mapping, oid, pg->index);
19005+ if (unlikely(IS_ERR(result)))
19006+ return result;
19007+ /* attach jnode to page */
19008+ spin_lock_jnode(result);
19009+ jnode_attach_page(result, pg);
19010+ spin_unlock_jnode(result);
19011+ return result;
19012+}
19013+
19014+/*
19015+ * return jnode for @pg, creating it if necessary.
19016+ */
19017+jnode *jnode_of_page(struct page * pg)
19018+{
19019+ jnode *result;
19020+
19021+ assert("umka-176", pg != NULL);
19022+ assert("nikita-2394", PageLocked(pg));
19023+
19024+ result = do_jget(reiser4_tree_by_page(pg), pg);
19025+
19026+ if (REISER4_DEBUG && !IS_ERR(result)) {
19027+ assert("nikita-3210", result == jprivate(pg));
19028+ assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
19029+ if (jnode_is_unformatted(jprivate(pg))) {
19030+ assert("nikita-2364",
19031+ jprivate(pg)->key.j.index == pg->index);
19032+ assert("nikita-2367",
19033+ jprivate(pg)->key.j.mapping == pg->mapping);
19034+ assert("nikita-2365",
19035+ jprivate(pg)->key.j.objectid ==
19036+ get_inode_oid(pg->mapping->host));
19037+ assert("vs-1200",
19038+ jprivate(pg)->key.j.objectid ==
19039+ pg->mapping->host->i_ino);
19040+ assert("nikita-2356",
19041+ jnode_is_unformatted(jnode_by_page(pg)));
19042+ }
19043+ assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
19044+ }
19045+ return result;
19046+}
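+
+/*
+ * Usage sketch (illustrative, not from the original code): in a
+ * ->writepage()-style path the page is already locked, as required, so the
+ * caller can do:
+ *
+ *	jnode *node = jnode_of_page(page);	// page locked by caller
+ *	if (!IS_ERR(node)) {
+ *		// ... capture node into an atom, dirty it, etc. ...
+ *		jput(node);
+ *	}
+ */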
19047+
19048+/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
19049+ * page.*/
19050+void jnode_attach_page(jnode * node, struct page *pg)
19051+{
19052+ assert("nikita-2060", node != NULL);
19053+ assert("nikita-2061", pg != NULL);
19054+
19055+ assert("nikita-2050", jprivate(pg) == 0ul);
19056+ assert("nikita-2393", !PagePrivate(pg));
19057+ assert("vs-1741", node->pg == NULL);
19058+
19059+ assert("nikita-2396", PageLocked(pg));
19060+ assert_spin_locked(&(node->guard));
19061+
19062+ page_cache_get(pg);
19063+ set_page_private(pg, (unsigned long)node);
19064+ node->pg = pg;
19065+ SetPagePrivate(pg);
19066+}
19067+
19068+/* Dual to jnode_attach_page: break a binding between page and jnode */
19069+void page_clear_jnode(struct page *page, jnode * node)
19070+{
19071+ assert("nikita-2424", page != NULL);
19072+ assert("nikita-2425", PageLocked(page));
19073+ assert("nikita-2426", node != NULL);
19074+ assert_spin_locked(&(node->guard));
19075+ assert("nikita-2428", PagePrivate(page));
19076+
19077+ assert("nikita-3551", !PageWriteback(page));
19078+
19079+ JF_CLR(node, JNODE_PARSED);
19080+ set_page_private(page, 0ul);
19081+ ClearPagePrivate(page);
19082+ node->pg = NULL;
19083+ page_cache_release(page);
19084+}
19085+
19086+#if 0
19087+/* it is only used in one place to handle error */
19088+void
19089+page_detach_jnode(struct page *page, struct address_space *mapping,
19090+ unsigned long index)
19091+{
19092+ assert("nikita-2395", page != NULL);
19093+
19094+ lock_page(page);
19095+ if ((page->mapping == mapping) && (page->index == index)
19096+ && PagePrivate(page)) {
19097+ jnode *node;
19098+
19099+ node = jprivate(page);
19100+ spin_lock_jnode(node);
19101+ page_clear_jnode(page, node);
19102+ spin_unlock_jnode(node);
19103+ }
19104+ unlock_page(page);
19105+}
19106+#endif /* 0 */
19107+
19108+/* return @node page locked.
19109+
19110+   Lock ordering requires that one first take the page lock and afterwards the
19111+   spin lock on the jnode attached to this page. Sometimes it is necessary to
19112+   go in the opposite direction; this is done through a standard
19113+   trylock-and-release loop.
19114+*/
19115+static struct page *jnode_lock_page(jnode * node)
19116+{
19117+ struct page *page;
19118+
19119+ assert("nikita-2052", node != NULL);
19120+ assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19121+
19122+ while (1) {
19123+
19124+ spin_lock_jnode(node);
19125+ page = jnode_page(node);
19126+ if (page == NULL) {
19127+ break;
19128+ }
19129+
19130+ /* no need to page_cache_get( page ) here, because page cannot
19131+ be evicted from memory without detaching it from jnode and
19132+ this requires spin lock on jnode that we already hold.
19133+ */
19134+ if (!TestSetPageLocked(page)) {
19135+ /* We won a lock on jnode page, proceed. */
19136+ break;
19137+ }
19138+
19139+ /* Page is locked by someone else. */
19140+ page_cache_get(page);
19141+ spin_unlock_jnode(node);
19142+ wait_on_page_locked(page);
19143+ /* it is possible that page was detached from jnode and
19144+ returned to the free pool, or re-assigned while we were
19145+ waiting on locked bit. This will be rechecked on the next
19146+ loop iteration.
19147+ */
19148+ page_cache_release(page);
19149+
19150+ /* try again */
19151+ }
19152+ return page;
19153+}
19154+
19155+/*
19156+ * if the JNODE_PARSED bit is not set, call the ->parse() method of the jnode
19157+ * to verify the validity of its content.
19158+ */
19159+static inline int jparse(jnode * node)
19160+{
19161+ int result;
19162+
19163+ assert("nikita-2466", node != NULL);
19164+
19165+ spin_lock_jnode(node);
19166+ if (likely(!jnode_is_parsed(node))) {
19167+ result = jnode_ops(node)->parse(node);
19168+ if (likely(result == 0))
19169+ JF_SET(node, JNODE_PARSED);
19170+ } else
19171+ result = 0;
19172+ spin_unlock_jnode(node);
19173+ return result;
19174+}
19175+
19176+/* Lock a page attached to jnode, create and attach page to jnode if it had no
19177+ * one. */
19178+static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19179+{
19180+ struct page *page;
19181+
19182+ spin_lock_jnode(node);
19183+ page = jnode_page(node);
19184+
19185+ if (page == NULL) {
19186+ spin_unlock_jnode(node);
19187+ page = find_or_create_page(jnode_get_mapping(node),
19188+ jnode_get_index(node), gfp_flags);
19189+ if (page == NULL)
19190+ return ERR_PTR(RETERR(-ENOMEM));
19191+ } else {
19192+ if (!TestSetPageLocked(page)) {
19193+ spin_unlock_jnode(node);
19194+ return page;
19195+ }
19196+ page_cache_get(page);
19197+ spin_unlock_jnode(node);
19198+ lock_page(page);
19199+ assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19200+ }
19201+
19202+ spin_lock_jnode(node);
19203+ if (!jnode_page(node))
19204+ jnode_attach_page(node, page);
19205+ spin_unlock_jnode(node);
19206+
19207+ page_cache_release(page);
19208+ assert("zam-894", jnode_page(node) == page);
19209+ return page;
19210+}
19211+
19212+/* Start read operation for jnode's page if page is not up-to-date. */
19213+static int jnode_start_read(jnode * node, struct page *page)
19214+{
19215+ assert("zam-893", PageLocked(page));
19216+
19217+ if (PageUptodate(page)) {
19218+ unlock_page(page);
19219+ return 0;
19220+ }
19221+ return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get());
19222+}
19223+
19224+#if REISER4_DEBUG
19225+static void check_jload(jnode * node, struct page *page)
19226+{
19227+ if (jnode_is_znode(node)) {
19228+ node40_header *nh;
19229+ znode *z;
19230+
19231+ z = JZNODE(node);
19232+ if (znode_is_any_locked(z)) {
19233+ nh = (node40_header *) kmap(page);
19234+ /* this only works for node40-only file systems. For
19235+ * debugging. */
19236+ assert("nikita-3253",
19237+ z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19238+ kunmap(page);
19239+ }
19240+ assert("nikita-3565", znode_invariant(z));
19241+ }
19242+}
19243+#else
19244+#define check_jload(node, page) noop
19245+#endif
19246+
19247+/* prefetch jnode to speed up next call to jload. Call this when you are going
19248+ * to call jload() shortly. This will bring appropriate portion of jnode into
19249+ * CPU cache. */
19250+void jload_prefetch(jnode * node)
19251+{
19252+ prefetchw(&node->x_count);
19253+}
19254+
19255+/* load jnode's data into memory */
19256+int jload_gfp(jnode * node /* node to load */ ,
19257+ gfp_t gfp_flags /* allocation flags */ ,
19258+ int do_kmap /* true if page should be kmapped */ )
19259+{
19260+ struct page *page;
19261+ int result = 0;
19262+ int parsed;
19263+
19264+ assert("nikita-3010", reiser4_schedulable());
19265+
19266+ prefetchw(&node->pg);
19267+
19268+ /* taking d-reference implies taking x-reference. */
19269+ jref(node);
19270+
19271+ /*
19272+	 * acquiring a d-reference to @jnode and checking the JNODE_PARSED bit
19273+ * should be atomic, otherwise there is a race against
19274+ * reiser4_releasepage().
19275+ */
19276+ spin_lock(&(node->load));
19277+ add_d_ref(node);
19278+ parsed = jnode_is_parsed(node);
19279+ spin_unlock(&(node->load));
19280+
19281+ if (unlikely(!parsed)) {
19282+ page = jnode_get_page_locked(node, gfp_flags);
19283+ if (unlikely(IS_ERR(page))) {
19284+ result = PTR_ERR(page);
19285+ goto failed;
19286+ }
19287+
19288+ result = jnode_start_read(node, page);
19289+ if (unlikely(result != 0))
19290+ goto failed;
19291+
19292+ wait_on_page_locked(page);
19293+ if (unlikely(!PageUptodate(page))) {
19294+ result = RETERR(-EIO);
19295+ goto failed;
19296+ }
19297+
19298+ if (do_kmap)
19299+ node->data = kmap(page);
19300+
19301+ result = jparse(node);
19302+ if (unlikely(result != 0)) {
19303+ if (do_kmap)
19304+ kunmap(page);
19305+ goto failed;
19306+ }
19307+ check_jload(node, page);
19308+ } else {
19309+ page = jnode_page(node);
19310+ check_jload(node, page);
19311+ if (do_kmap)
19312+ node->data = kmap(page);
19313+ }
19314+
19315+ if (!is_writeout_mode())
19316+ /* We do not mark pages active if jload is called as a part of
19317+ * jnode_flush() or reiser4_write_logs(). Both jnode_flush()
19318+ * and write_logs() add no value to cached data, there is no
19319+ * sense to mark pages as active when they go to disk, it just
19320+ * confuses vm scanning routines because clean page could be
19321+ * moved out from inactive list as a result of this
19322+ * mark_page_accessed() call. */
19323+ mark_page_accessed(page);
19324+
19325+ return 0;
19326+
19327+ failed:
19328+ jrelse_tail(node);
19329+ return result;
19330+
19331+}
19332+
19333+/* start asynchronous reading for given jnode's page. */
19334+int jstartio(jnode * node)
19335+{
19336+ struct page *page;
19337+
19338+ page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
19339+ if (IS_ERR(page))
19340+ return PTR_ERR(page);
19341+
19342+ return jnode_start_read(node, page);
19343+}
19344+
19345+/* Initialize a node by calling appropriate plugin instead of reading
19346+ * node from disk as in jload(). */
19347+int jinit_new(jnode * node, gfp_t gfp_flags)
19348+{
19349+ struct page *page;
19350+ int result;
19351+
19352+ jref(node);
19353+ add_d_ref(node);
19354+
19355+ page = jnode_get_page_locked(node, gfp_flags);
19356+ if (IS_ERR(page)) {
19357+ result = PTR_ERR(page);
19358+ goto failed;
19359+ }
19360+
19361+ SetPageUptodate(page);
19362+ unlock_page(page);
19363+
19364+ node->data = kmap(page);
19365+
19366+ if (!jnode_is_parsed(node)) {
19367+ jnode_plugin *jplug = jnode_ops(node);
19368+ spin_lock_jnode(node);
19369+ result = jplug->init(node);
19370+ spin_unlock_jnode(node);
19371+ if (result) {
19372+ kunmap(page);
19373+ goto failed;
19374+ }
19375+ JF_SET(node, JNODE_PARSED);
19376+ }
19377+
19378+ return 0;
19379+
19380+ failed:
19381+ jrelse(node);
19382+ return result;
19383+}
19384+
19385+/* release a reference to jnode acquired by jload(), decrement ->d_count */
19386+void jrelse_tail(jnode * node /* jnode to release references to */ )
19387+{
19388+ assert("nikita-489", atomic_read(&node->d_count) > 0);
19389+ atomic_dec(&node->d_count);
19390+ /* release reference acquired in jload_gfp() or jinit_new() */
19391+ jput(node);
19392+ if (jnode_is_unformatted(node) || jnode_is_znode(node))
19393+ LOCK_CNT_DEC(d_refs);
19394+}
19395+
19396+/* drop reference to node data. When last reference is dropped, data are
19397+ unloaded. */
19398+void jrelse(jnode * node /* jnode to release references to */ )
19399+{
19400+ struct page *page;
19401+
19402+ assert("nikita-487", node != NULL);
19403+ assert_spin_not_locked(&(node->guard));
19404+
19405+ page = jnode_page(node);
19406+ if (likely(page != NULL)) {
19407+ /*
19408+ * it is safe not to lock jnode here, because at this point
19409+ * @node->d_count is greater than zero (if jrelse() is used
19410+		 * correctly, that is). JNODE_PARSED may not be set yet, if,
19411+ * for example, we got here as a result of error handling path
19412+ * in jload(). Anyway, page cannot be detached by
19413+ * reiser4_releasepage(). truncate will invalidate page
19414+ * regardless, but this should not be a problem.
19415+ */
19416+ kunmap(page);
19417+ }
19418+ jrelse_tail(node);
19419+}
19420+
19421+/* called from jput() to wait for io completion */
19422+static void jnode_finish_io(jnode * node)
19423+{
19424+ struct page *page;
19425+
19426+ assert("nikita-2922", node != NULL);
19427+
19428+ spin_lock_jnode(node);
19429+ page = jnode_page(node);
19430+ if (page != NULL) {
19431+ page_cache_get(page);
19432+ spin_unlock_jnode(node);
19433+ wait_on_page_writeback(page);
19434+ page_cache_release(page);
19435+ } else
19436+ spin_unlock_jnode(node);
19437+}
19438+
19439+/*
19440+ * This is called by jput() when the last reference to a jnode is released. It
19441+ * is a separate function because we want the fast path of jput() to be inline and,
19442+ * therefore, small.
19443+ */
19444+void jput_final(jnode * node)
19445+{
19446+ int r_i_p;
19447+
19448+ /* A fast check for keeping node in cache. We always keep node in cache
19449+ * if its page is present and node was not marked for deletion */
19450+ if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19451+ rcu_read_unlock();
19452+ return;
19453+ }
19454+ r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19455+ /*
19456+ * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19457+ * this case it is safe to access node after unlock.
19458+ */
19459+ rcu_read_unlock();
19460+ if (r_i_p) {
19461+ jnode_finish_io(node);
19462+ if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19463+ /* node is removed from the tree. */
19464+ jdelete(node);
19465+ else
19466+ jnode_try_drop(node);
19467+ }
19468+ /* if !r_i_p some other thread is already killing it */
19469+}
19470+
19471+int jwait_io(jnode * node, int rw)
19472+{
19473+ struct page *page;
19474+ int result;
19475+
19476+ assert("zam-447", node != NULL);
19477+ assert("zam-448", jnode_page(node) != NULL);
19478+
19479+ page = jnode_page(node);
19480+
19481+ result = 0;
19482+ if (rw == READ) {
19483+ wait_on_page_locked(page);
19484+ } else {
19485+ assert("nikita-2227", rw == WRITE);
19486+ wait_on_page_writeback(page);
19487+ }
19488+ if (PageError(page))
19489+ result = RETERR(-EIO);
19490+
19491+ return result;
19492+}
19493+
19494+/*
19495+ * jnode types and plugins.
19496+ *
19497+ * jnode by itself is a "base type". There are several different jnode
19498+ * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19499+ * has to do different things based on jnode type. In the standard reiser4 way
19500+ * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19501+ *
19502+ * Functions below deal with jnode types and define methods of jnode plugin.
19503+ *
19504+ */
19505+
19506+/* set jnode type. This is done during jnode initialization. */
19507+static void jnode_set_type(jnode * node, jnode_type type)
19508+{
19509+ static unsigned long type_to_mask[] = {
19510+ [JNODE_UNFORMATTED_BLOCK] = 1,
19511+ [JNODE_FORMATTED_BLOCK] = 0,
19512+ [JNODE_BITMAP] = 2,
19513+ [JNODE_IO_HEAD] = 6,
19514+ [JNODE_INODE] = 4
19515+ };
19516+
19517+ assert("zam-647", type < LAST_JNODE_TYPE);
19518+ assert("nikita-2815", !jnode_is_loaded(node));
19519+ assert("nikita-3386", node->state == 0);
19520+
19521+ node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19522+}
19523+
19524+/* ->init() method of jnode plugin for jnodes that don't require plugin
19525+ * specific initialization. */
19526+static int init_noinit(jnode * node UNUSED_ARG)
19527+{
19528+ return 0;
19529+}
19530+
19531+/* ->parse() method of jnode plugin for jnodes that don't require plugin
19532+ * specific parsing. */
19533+static int parse_noparse(jnode * node UNUSED_ARG)
19534+{
19535+ return 0;
19536+}
19537+
19538+/* ->mapping() method for unformatted jnode */
19539+struct address_space *mapping_jnode(const jnode * node)
19540+{
19541+ struct address_space *map;
19542+
19543+ assert("nikita-2713", node != NULL);
19544+
19545+ /* mapping is stored in jnode */
19546+
19547+ map = node->key.j.mapping;
19548+ assert("nikita-2714", map != NULL);
19549+ assert("nikita-2897", is_reiser4_inode(map->host));
19550+ assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
19551+ return map;
19552+}
19553+
19554+/* ->index() method for unformatted jnodes */
19555+unsigned long index_jnode(const jnode * node)
19556+{
19557+ /* index is stored in jnode */
19558+ return node->key.j.index;
19559+}
19560+
19561+/* ->remove() method for unformatted jnodes */
19562+static inline void remove_jnode(jnode * node, reiser4_tree * tree)
19563+{
19564+ /* remove jnode from hash table and radix tree */
19565+ if (node->key.j.mapping)
19566+ unhash_unformatted_node_nolock(node);
19567+}
19568+
19569+/* ->mapping() method for znodes */
19570+static struct address_space *mapping_znode(const jnode * node)
19571+{
19572+ /* all znodes belong to fake inode */
19573+ return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping;
19574+}
19575+
19576+/* ->index() method for znodes */
19577+static unsigned long index_znode(const jnode * node)
19578+{
19579+ unsigned long addr;
19580+ assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
19581+
19582+ /* index of znode is just its address (shifted) */
19583+ addr = (unsigned long)node;
19584+ return (addr - PAGE_OFFSET) >> znode_shift_order;
19585+}
19586+
19587+/* ->mapping() method for bitmap jnode */
19588+static struct address_space *mapping_bitmap(const jnode * node)
19589+{
19590+ /* all bitmap blocks belong to special bitmap inode */
19591+ return get_super_private(jnode_get_tree(node)->super)->bitmap->
19592+ i_mapping;
19593+}
19594+
19595+/* ->index() method for jnodes that are indexed by address */
19596+static unsigned long index_is_address(const jnode * node)
19597+{
19598+ unsigned long ind;
19599+
19600+ ind = (unsigned long)node;
19601+ return ind - PAGE_OFFSET;
19602+}
19603+
19604+/* resolve race with jput */
19605+jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
19606+{
19607+ /*
19608+ * This is used as part of RCU-based jnode handling.
19609+ *
19610+ * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
19611+ * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
19612+ * not protected during this, so concurrent thread may execute
19613+ * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be
19614+ * freed in jput_final(). To avoid such races, jput_final() sets
19615+ * JNODE_RIP on jnode (under tree lock). All places that work with
19616+	 * unreferenced jnodes call this function. It checks for the JNODE_RIP
19617+	 * bit (first without taking the tree lock), and if this bit is set,
19618+	 * releases the reference acquired by the current thread and returns NULL.
19619+ *
19620+ * As a result, if jnode is being concurrently freed, NULL is returned
19621+ * and caller should pretend that jnode wasn't found in the first
19622+ * place.
19623+ *
19624+ * Otherwise it's safe to release "rcu-read-lock" and continue with
19625+ * jnode.
19626+ */
19627+ if (unlikely(JF_ISSET(node, JNODE_RIP))) {
19628+ read_lock_tree(tree);
19629+ if (JF_ISSET(node, JNODE_RIP)) {
19630+ dec_x_ref(node);
19631+ node = NULL;
19632+ }
19633+ read_unlock_tree(tree);
19634+ }
19635+ return node;
19636+}
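+
+/*
+ * Typical caller pattern (a sketch, not from the original code; compare with
+ * jlookup() above, which follows the same sequence):
+ *
+ *	rcu_read_lock();
+ *	node = j_hash_find(&tree->jhash_table, &jkey);
+ *	if (node != NULL) {
+ *		jref(node);				// possibly 0 -> 1
+ *		node = jnode_rip_sync(tree, node);	// NULL if being freed
+ *	}
+ *	rcu_read_unlock();
+ */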
19637+
19638+reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
19639+{
19640+ struct inode *inode;
19641+ item_plugin *iplug;
19642+ loff_t off;
19643+
19644+ assert("nikita-3092", node != NULL);
19645+ assert("nikita-3093", key != NULL);
19646+ assert("nikita-3094", jnode_is_unformatted(node));
19647+
19648+ off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
19649+ inode = mapping_jnode(node)->host;
19650+
19651+ if (node->parent_item_id != 0)
19652+ iplug = item_plugin_by_id(node->parent_item_id);
19653+ else
19654+ iplug = NULL;
19655+
19656+ if (iplug != NULL && iplug->f.key_by_offset)
19657+ iplug->f.key_by_offset(inode, off, key);
19658+ else {
19659+ file_plugin *fplug;
19660+
19661+ fplug = inode_file_plugin(inode);
19662+ assert("zam-1007", fplug != NULL);
19663+ assert("zam-1008", fplug->key_by_inode != NULL);
19664+
19665+ fplug->key_by_inode(inode, off, key);
19666+ }
19667+
19668+ return key;
19669+}
19670+
19671+/* ->parse() method for formatted nodes */
19672+static int parse_znode(jnode * node)
19673+{
19674+ return zparse(JZNODE(node));
19675+}
19676+
19677+/* ->delete() method for formatted nodes */
19678+static void delete_znode(jnode * node, reiser4_tree * tree)
19679+{
19680+ znode *z;
19681+
19682+ assert_rw_write_locked(&(tree->tree_lock));
19683+ assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19684+
19685+ z = JZNODE(node);
19686+ assert("vs-899", z->c_count == 0);
19687+
19688+ /* delete znode from sibling list. */
19689+ sibling_list_remove(z);
19690+
19691+ znode_remove(z, tree);
19692+}
19693+
19694+/* ->remove() method for formatted nodes */
19695+static int remove_znode(jnode * node, reiser4_tree * tree)
19696+{
19697+ znode *z;
19698+
19699+ assert_rw_write_locked(&(tree->tree_lock));
19700+ z = JZNODE(node);
19701+
19702+ if (z->c_count == 0) {
19703+ /* detach znode from sibling list. */
19704+ sibling_list_drop(z);
19705+ /* this is called with tree spin-lock held, so call
19706+ znode_remove() directly (rather than znode_lock_remove()). */
19707+ znode_remove(z, tree);
19708+ return 0;
19709+ }
19710+ return RETERR(-EBUSY);
19711+}
19712+
19713+/* ->init() method for formatted nodes */
19714+static int init_znode(jnode * node)
19715+{
19716+ znode *z;
19717+
19718+ z = JZNODE(node);
19719+ /* call node plugin to do actual initialization */
19720+ return z->nplug->init(z);
19721+}
19722+
19723+/* ->clone() method for formatted nodes */
19724+static jnode *clone_formatted(jnode * node)
19725+{
19726+ znode *clone;
19727+
19728+ assert("vs-1430", jnode_is_znode(node));
19729+ clone = zalloc(reiser4_ctx_gfp_mask_get());
19730+ if (clone == NULL)
19731+ return ERR_PTR(RETERR(-ENOMEM));
19732+ zinit(clone, NULL, current_tree);
19733+ jnode_set_block(ZJNODE(clone), jnode_get_block(node));
19734+ /* ZJNODE(clone)->key.z is not initialized */
19735+ clone->level = JZNODE(node)->level;
19736+
19737+ return ZJNODE(clone);
19738+}
19739+
19740+/* jplug->clone for unformatted nodes */
19741+static jnode *clone_unformatted(jnode * node)
19742+{
19743+ jnode *clone;
19744+
19745+ assert("vs-1431", jnode_is_unformatted(node));
19746+ clone = jalloc();
19747+ if (clone == NULL)
19748+ return ERR_PTR(RETERR(-ENOMEM));
19749+
19750+ jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
19751+ jnode_set_block(clone, jnode_get_block(node));
19752+
19753+ return clone;
19754+
19755+}
19756+
19757+/*
19758+ * Setup jnode plugin methods for various jnode types.
19759+ */
19760+jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
19761+ [JNODE_UNFORMATTED_BLOCK] = {
19762+ .h = {
19763+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19764+ .id = JNODE_UNFORMATTED_BLOCK,
19765+ .pops = NULL,
19766+ .label = "unformatted",
19767+ .desc = "unformatted node",
19768+ .linkage = {NULL, NULL}
19769+ },
19770+ .init = init_noinit,
19771+ .parse = parse_noparse,
19772+ .mapping = mapping_jnode,
19773+ .index = index_jnode,
19774+ .clone = clone_unformatted
19775+ },
19776+ [JNODE_FORMATTED_BLOCK] = {
19777+ .h = {
19778+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19779+ .id = JNODE_FORMATTED_BLOCK,
19780+ .pops = NULL,
19781+ .label = "formatted",
19782+ .desc = "formatted tree node",
19783+ .linkage = {NULL, NULL}
19784+ },
19785+ .init = init_znode,
19786+ .parse = parse_znode,
19787+ .mapping = mapping_znode,
19788+ .index = index_znode,
19789+ .clone = clone_formatted
19790+ },
19791+ [JNODE_BITMAP] = {
19792+ .h = {
19793+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19794+ .id = JNODE_BITMAP,
19795+ .pops = NULL,
19796+ .label = "bitmap",
19797+ .desc = "bitmap node",
19798+ .linkage = {NULL, NULL}
19799+ },
19800+ .init = init_noinit,
19801+ .parse = parse_noparse,
19802+ .mapping = mapping_bitmap,
19803+ .index = index_is_address,
19804+ .clone = NULL
19805+ },
19806+ [JNODE_IO_HEAD] = {
19807+ .h = {
19808+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19809+ .id = JNODE_IO_HEAD,
19810+ .pops = NULL,
19811+ .label = "io head",
19812+ .desc = "io head",
19813+ .linkage = {NULL, NULL}
19814+ },
19815+ .init = init_noinit,
19816+ .parse = parse_noparse,
19817+ .mapping = mapping_bitmap,
19818+ .index = index_is_address,
19819+ .clone = NULL
19820+ },
19821+ [JNODE_INODE] = {
19822+ .h = {
19823+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19824+ .id = JNODE_INODE,
19825+ .pops = NULL,
19826+ .label = "inode",
19827+ .desc = "inode's builtin jnode",
19828+ .linkage = {NULL, NULL}
19829+ },
19830+ .init = NULL,
19831+ .parse = NULL,
19832+ .mapping = NULL,
19833+ .index = NULL,
19834+ .clone = NULL
19835+ }
19836+};
19837+
19838+/*
19839+ * jnode destruction.
19840+ *
19841+ * Thread may use a jnode after it acquired a reference to it. References are
19842+ * counted in ->x_count field. Reference protects jnode from being
19843+ * recycled. This is different from protecting jnode data (that are stored in
19844+ * jnode page) from being evicted from memory. Data are protected by jload()
19845+ * and released by jrelse().
19846+ *
19847+ * If thread already possesses a reference to the jnode it can acquire another
19848+ * one through jref(). Initial reference is obtained (usually) by locating
19849+ * jnode in some indexing structure that depends on jnode type: formatted
19850+ * nodes are kept in global hash table, where they are indexed by block
19851+ * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
19852+ * table, which is indexed by oid and offset within file, and in per-inode
19853+ * radix tree.
19854+ *
19855+ * Reference to jnode is released by jput(). If last reference is released,
19856+ * jput_final() is called. This function determines whether jnode has to be
19857+ * deleted (this happens when corresponding node is removed from the file
19858+ * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
19859+ * should be just "removed" (deleted from memory).
19860+ *
19861+ * Jnode destruction is a singularly delicate dance because of locking and RCU.
19862+ */
19863+
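+/*
+ * A minimal usage sketch of the reference discipline described above
+ * (assumes an existing @tree, @oid and @index; error handling elided):
+ *
+ *	jnode *j = jlookup(tree, oid, index);	// takes initial ref, ->x_count++
+ *	if (j != NULL) {
+ *		...				// use jnode while ref is held
+ *		jput(j);			// drop ref, may call jput_final()
+ *	}
+ */
+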
19864+/*
19865+ * Returns true if jnode cannot be removed right now. This check is called
19866+ * under tree lock. If it returns false, the jnode is irrevocably committed
19867+ * to being deleted/removed.
19868+ */
19869+static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
19870+{
19871+ /* if other thread managed to acquire a reference to this jnode, don't
19872+ * free it. */
19873+ if (atomic_read(&node->x_count) > 0)
19874+ return 1;
19875+ /* also, don't free znode that has children in memory */
19876+ if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
19877+ return 1;
19878+ return 0;
19879+}
19880+
19881+/*
19882+ * This is called as part of removing a jnode. Based on the jnode type, call
19883+ * the corresponding function that removes the jnode from its indices and
19884+ * returns it to the appropriate slab (through RCU).
19885+ */
19886+static inline void
19887+jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
19888+{
19889+ switch (jtype) {
19890+ case JNODE_UNFORMATTED_BLOCK:
19891+ remove_jnode(node, tree);
19892+ break;
19893+ case JNODE_IO_HEAD:
19894+ case JNODE_BITMAP:
19895+ break;
19896+ case JNODE_INODE:
19897+ break;
19898+ case JNODE_FORMATTED_BLOCK:
19899+ remove_znode(node, tree);
19900+ break;
19901+ default:
19902+ wrong_return_value("nikita-3196", "Wrong jnode type");
19903+ }
19904+}
19905+
19906+/*
19907+ * This is called as part of deleting a jnode. Based on the jnode type, call
19908+ * the corresponding function that removes the jnode from its indices and
19909+ * returns it to the appropriate slab (through RCU).
19910+ *
19911+ * This differs from jnode_remove() only for formatted nodes---for them
19912+ * sibling list handling is different for removal and deletion.
19913+ */
19914+static inline void
19915+jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree)
19916+{
19917+ switch (jtype) {
19918+ case JNODE_UNFORMATTED_BLOCK:
19919+ remove_jnode(node, tree);
19920+ break;
19921+ case JNODE_IO_HEAD:
19922+ case JNODE_BITMAP:
19923+ break;
19924+ case JNODE_FORMATTED_BLOCK:
19925+ delete_znode(node, tree);
19926+ break;
19927+ case JNODE_INODE:
19928+ default:
19929+ wrong_return_value("nikita-3195", "Wrong jnode type");
19930+ }
19931+}
19932+
19933+#if REISER4_DEBUG
19934+/*
19935+ * remove jnode from the debugging list of all jnodes hanging off super-block.
19936+ */
19937+void jnode_list_remove(jnode * node)
19938+{
19939+ reiser4_super_info_data *sbinfo;
19940+
19941+ sbinfo = get_super_private(jnode_get_tree(node)->super);
19942+
19943+ spin_lock_irq(&sbinfo->all_guard);
19944+ assert("nikita-2422", !list_empty(&node->jnodes));
19945+ list_del_init(&node->jnodes);
19946+ spin_unlock_irq(&sbinfo->all_guard);
19947+}
19948+#endif
19949+
19950+/*
19951+ * this is called by jput_final() to remove jnode when last reference to it is
19952+ * released.
19953+ */
19954+static int jnode_try_drop(jnode * node)
19955+{
19956+ int result;
19957+ reiser4_tree *tree;
19958+ jnode_type jtype;
19959+
19960+ assert("nikita-2491", node != NULL);
19961+ assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
19962+
19963+ tree = jnode_get_tree(node);
19964+ jtype = jnode_get_type(node);
19965+
19966+ spin_lock_jnode(node);
19967+ write_lock_tree(tree);
19968+ /*
19969+ * if jnode has a page---leave it alone. Memory pressure will
19970+ * eventually kill page and jnode.
19971+ */
19972+ if (jnode_page(node) != NULL) {
19973+ write_unlock_tree(tree);
19974+ spin_unlock_jnode(node);
19975+ JF_CLR(node, JNODE_RIP);
19976+ return RETERR(-EBUSY);
19977+ }
19978+
19979+ /* re-check ->x_count under tree lock. */
19980+ result = jnode_is_busy(node, jtype);
19981+ if (result == 0) {
19982+ assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
19983+ assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
19984+
19985+ spin_unlock_jnode(node);
19986+ /* no page and no references---despatch him. */
19987+ jnode_remove(node, jtype, tree);
19988+ write_unlock_tree(tree);
19989+ jnode_free(node, jtype);
19990+ } else {
19991+ /* busy check failed: reference was acquired by concurrent
19992+ * thread. */
19993+ write_unlock_tree(tree);
19994+ spin_unlock_jnode(node);
19995+ JF_CLR(node, JNODE_RIP);
19996+ }
19997+ return result;
19998+}
19999+
20000+/* jdelete() -- Delete jnode from the tree and file system */
20001+static int jdelete(jnode * node /* jnode to finish with */ )
20002+{
20003+ struct page *page;
20004+ int result;
20005+ reiser4_tree *tree;
20006+ jnode_type jtype;
20007+
20008+ assert("nikita-467", node != NULL);
20009+ assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
20010+
20011+ jtype = jnode_get_type(node);
20012+
20013+ page = jnode_lock_page(node);
20014+ assert_spin_locked(&(node->guard));
20015+
20016+ tree = jnode_get_tree(node);
20017+
20018+ write_lock_tree(tree);
20019+ /* re-check ->x_count under tree lock. */
20020+ result = jnode_is_busy(node, jtype);
20021+ if (likely(!result)) {
20022+ assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
20023+ assert("jmacd-511", atomic_read(&node->d_count) == 0);
20024+
20025+ /* detach page */
20026+ if (page != NULL) {
20027+ /*
20028+ * FIXME this is racy against jnode_extent_write().
20029+ */
20030+ page_clear_jnode(page, node);
20031+ }
20032+ spin_unlock_jnode(node);
20033+ /* goodbye */
20034+ jnode_delete(node, jtype, tree);
20035+ write_unlock_tree(tree);
20036+ jnode_free(node, jtype);
20037+ /* @node is no longer valid pointer */
20038+ if (page != NULL)
20039+ reiser4_drop_page(page);
20040+ } else {
20041+ /* busy check failed: reference was acquired by concurrent
20042+ * thread. */
20043+ JF_CLR(node, JNODE_RIP);
20044+ write_unlock_tree(tree);
20045+ spin_unlock_jnode(node);
20046+ if (page != NULL)
20047+ unlock_page(page);
20048+ }
20049+ return result;
20050+}
20051+
20052+/* drop jnode on the floor.
20053+
20054+ Return value:
20055+
20056+   non-zero: failed to drop jnode, because there are still references to it
20057+
20058+ 0: successfully dropped jnode
20059+
20060+*/
20061+static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
20062+{
20063+ struct page *page;
20064+ jnode_type jtype;
20065+ int result;
20066+
20067+ assert("zam-602", node != NULL);
20068+ assert_rw_not_read_locked(&(tree->tree_lock));
20069+ assert_rw_not_write_locked(&(tree->tree_lock));
20070+ assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20071+
20072+ jtype = jnode_get_type(node);
20073+
20074+ page = jnode_lock_page(node);
20075+ assert_spin_locked(&(node->guard));
20076+
20077+ write_lock_tree(tree);
20078+
20079+ /* re-check ->x_count under tree lock. */
20080+ result = jnode_is_busy(node, jtype);
20081+ if (!result) {
20082+ assert("nikita-2488", page == jnode_page(node));
20083+ assert("nikita-2533", atomic_read(&node->d_count) == 0);
20084+ if (page != NULL) {
20085+ assert("nikita-2126", !PageDirty(page));
20086+ assert("nikita-2127", PageUptodate(page));
20087+ assert("nikita-2181", PageLocked(page));
20088+ page_clear_jnode(page, node);
20089+ }
20090+ spin_unlock_jnode(node);
20091+ jnode_remove(node, jtype, tree);
20092+ write_unlock_tree(tree);
20093+ jnode_free(node, jtype);
20094+ if (page != NULL) {
20095+ reiser4_drop_page(page);
20096+ }
20097+ } else {
20098+ /* busy check failed: reference was acquired by concurrent
20099+ * thread. */
20100+ JF_CLR(node, JNODE_RIP);
20101+ write_unlock_tree(tree);
20102+ spin_unlock_jnode(node);
20103+ if (page != NULL)
20104+ unlock_page(page);
20105+ }
20106+ return result;
20107+}
20108+
20109+/* This function frees jnode "if possible". In particular, [dcx]_count has to
20110+ be 0 (where applicable). */
20111+void jdrop(jnode * node)
20112+{
20113+ jdrop_in_tree(node, jnode_get_tree(node));
20114+}
20115+
20116+/* IO head jnode implementation. IO heads are simple jnodes with limited
20117+   functionality (they are not kept in any hash table), used just for reading
20118+   from and writing to disk. */
20119+
20120+jnode *reiser4_alloc_io_head(const reiser4_block_nr * block)
20121+{
20122+ jnode *jal = jalloc();
20123+
20124+ if (jal != NULL) {
20125+ jnode_init(jal, current_tree, JNODE_IO_HEAD);
20126+ jnode_set_block(jal, block);
20127+ }
20128+
20129+	if (jal != NULL) jref(jal);	/* jalloc() may have failed */
20130+
20131+ return jal;
20132+}
20133+
20134+void reiser4_drop_io_head(jnode * node)
20135+{
20136+ assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20137+
20138+ jput(node);
20139+ jdrop(node);
20140+}
20141+
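+/*
+ * A sketch of one plausible io-head life cycle (illustration only;
+ * assumes a valid block number @blk, error handling elided):
+ *
+ *	jnode *io = reiser4_alloc_io_head(&blk);	// referenced on success
+ *	if (io != NULL) {
+ *		...				// submit and wait for I/O on io
+ *		reiser4_drop_io_head(io);	// jput() + jdrop()
+ *	}
+ */
+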
20142+/* protect jnode data from reiser4_releasepage() */
20143+void pin_jnode_data(jnode * node)
20144+{
20145+ assert("zam-671", jnode_page(node) != NULL);
20146+ page_cache_get(jnode_page(node));
20147+}
20148+
20149+/* make jnode data free-able again */
20150+void unpin_jnode_data(jnode * node)
20151+{
20152+ assert("zam-672", jnode_page(node) != NULL);
20153+ page_cache_release(jnode_page(node));
20154+}
20155+
20156+struct address_space *jnode_get_mapping(const jnode * node)
20157+{
20158+ assert("nikita-3162", node != NULL);
20159+ return jnode_ops(node)->mapping(node);
20160+}
20161+
20162+#if REISER4_DEBUG
20163+/* debugging aid: jnode invariant */
20164+int jnode_invariant_f(const jnode * node, char const **msg)
20165+{
20166+#define _ergo(ant, con) \
20167+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20168+#define _check(exp) ((*msg) = #exp, (exp))
20169+
20170+ return _check(node != NULL) &&
20171+ /* [jnode-queued] */
20172+ /* only relocated node can be queued, except that when znode
20173+ * is being deleted, its JNODE_RELOC bit is cleared */
20174+ _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20175+ JF_ISSET(node, JNODE_RELOC) ||
20176+ JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20177+ _check(node->jnodes.prev != NULL) &&
20178+ _check(node->jnodes.next != NULL) &&
20179+ /* [jnode-dirty] invariant */
20180+	    /* dirty jnode is part of an atom */
20181+ _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20182+ /* [jnode-oid] invariant */
20183+ /* for unformatted node ->objectid and ->mapping fields are
20184+ * consistent */
20185+ _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20186+ node->key.j.objectid ==
20187+ get_inode_oid(node->key.j.mapping->host)) &&
20188+ /* [jnode-atom-valid] invariant */
20189+ /* node atom has valid state */
20190+ _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20191+ /* [jnode-page-binding] invariant */
20192+ /* if node points to page, it points back to node */
20193+ _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20194+ /* [jnode-refs] invariant */
20195+ /* only referenced jnode can be loaded */
20196+ _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20197+
20198+}
20199+
20200+static const char *jnode_type_name(jnode_type type)
20201+{
20202+ switch (type) {
20203+ case JNODE_UNFORMATTED_BLOCK:
20204+ return "unformatted";
20205+ case JNODE_FORMATTED_BLOCK:
20206+ return "formatted";
20207+ case JNODE_BITMAP:
20208+ return "bitmap";
20209+ case JNODE_IO_HEAD:
20210+ return "io head";
20211+ case JNODE_INODE:
20212+ return "inode";
20213+ case LAST_JNODE_TYPE:
20214+ return "last";
20215+ default:{
20216+ static char unknown[30];
20217+
20218+ sprintf(unknown, "unknown %i", type);
20219+ return unknown;
20220+ }
20221+ }
20222+}
20223+
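+/* print the name of a set state flag, skipping the 6-character "JNODE_"
+   prefix and appending "|" as a separator; expands to "" for a clear flag */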
20224+#define jnode_state_name( node, flag ) \
20225+ ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" )
20226+
20227+/* debugging aid: output human readable information about @node */
20228+static void info_jnode(const char *prefix /* prefix to print */ ,
20229+ const jnode * node /* node to print */ )
20230+{
20231+ assert("umka-068", prefix != NULL);
20232+
20233+ if (node == NULL) {
20234+ printk("%s: null\n", prefix);
20235+ return;
20236+ }
20237+
20238+ printk
20239+ ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20240+ " block: %s, d_count: %d, x_count: %d, "
20241+ "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20242+ node->state,
20243+ jnode_state_name(node, JNODE_PARSED),
20244+ jnode_state_name(node, JNODE_HEARD_BANSHEE),
20245+ jnode_state_name(node, JNODE_LEFT_CONNECTED),
20246+ jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20247+ jnode_state_name(node, JNODE_ORPHAN),
20248+ jnode_state_name(node, JNODE_CREATED),
20249+ jnode_state_name(node, JNODE_RELOC),
20250+ jnode_state_name(node, JNODE_OVRWR),
20251+ jnode_state_name(node, JNODE_DIRTY),
20252+ jnode_state_name(node, JNODE_IS_DYING),
20253+ jnode_state_name(node, JNODE_RIP),
20254+ jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20255+ jnode_state_name(node, JNODE_WRITEBACK),
20256+ jnode_state_name(node, JNODE_NEW),
20257+ jnode_state_name(node, JNODE_DKSET),
20258+ jnode_state_name(node, JNODE_REPACK),
20259+ jnode_state_name(node, JNODE_CLUSTER_PAGE),
20260+ jnode_get_level(node), sprint_address(jnode_get_block(node)),
20261+ atomic_read(&node->d_count), atomic_read(&node->x_count),
20262+ jnode_page(node), node->atom, 0, 0,
20263+ jnode_type_name(jnode_get_type(node)));
20264+ if (jnode_is_unformatted(node)) {
20265+ printk("inode: %llu, index: %lu, ",
20266+ node->key.j.objectid, node->key.j.index);
20267+ }
20268+}
20269+
20270+/* debugging aid: check jnode invariant and complain if it doesn't hold */
20271+static int jnode_invariant(const jnode * node, int tlocked, int jlocked)
20272+{
20273+ char const *failed_msg;
20274+ int result;
20275+ reiser4_tree *tree;
20276+
20277+	assert("umka-063312", node != NULL);
20278+
20279+	tree = jnode_get_tree(node);
20280+	assert("umka-064321", tree != NULL);
20281+
20282+ if (!jlocked && !tlocked)
20283+ spin_lock_jnode((jnode *) node);
20284+ if (!tlocked)
20285+ read_lock_tree(jnode_get_tree(node));
20286+ result = jnode_invariant_f(node, &failed_msg);
20287+ if (!result) {
20288+ info_jnode("corrupted node", node);
20289+ warning("jmacd-555", "Condition %s failed", failed_msg);
20290+ }
20291+ if (!tlocked)
20292+ read_unlock_tree(jnode_get_tree(node));
20293+ if (!jlocked && !tlocked)
20294+ spin_unlock_jnode((jnode *) node);
20295+ return result;
20296+}
20297+
20298+#endif /* REISER4_DEBUG */
20299+
20300+/* Make Linus happy.
20301+ Local variables:
20302+ c-indentation-style: "K&R"
20303+ mode-name: "LC"
20304+ c-basic-offset: 8
20305+ tab-width: 8
20306+ fill-column: 80
20307+ End:
20308+*/
20309diff -urN linux-2.6.24.orig/fs/reiser4/jnode.h linux-2.6.24/fs/reiser4/jnode.h
20310--- linux-2.6.24.orig/fs/reiser4/jnode.h 1970-01-01 03:00:00.000000000 +0300
20311+++ linux-2.6.24/fs/reiser4/jnode.h 2008-01-25 11:39:06.940208719 +0300
20312@@ -0,0 +1,702 @@
20313+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20314+ * reiser4/README */
20315+
20316+/* Declaration of jnode. See jnode.c for details. */
20317+
20318+#ifndef __JNODE_H__
20319+#define __JNODE_H__
20320+
20321+#include "forward.h"
20322+#include "type_safe_hash.h"
20323+#include "txnmgr.h"
20324+#include "key.h"
20325+#include "debug.h"
20326+#include "dformat.h"
20327+#include "page_cache.h"
20328+#include "context.h"
20329+
20330+#include "plugin/plugin.h"
20331+
20332+#include <linux/fs.h>
20333+#include <linux/mm.h>
20334+#include <linux/spinlock.h>
20335+#include <asm/atomic.h>
20336+#include <linux/bitops.h>
20337+#include <linux/list.h>
20338+#include <linux/rcupdate.h>
20339+
20340+/* declare hash table of jnodes (jnodes proper, that is, unformatted
20341+ nodes) */
20342+TYPE_SAFE_HASH_DECLARE(j, jnode);
20343+
20344+/* declare hash table of znodes */
20345+TYPE_SAFE_HASH_DECLARE(z, znode);
20346+
20347+struct jnode_key {
20348+ __u64 objectid;
20349+ unsigned long index;
20350+ struct address_space *mapping;
20351+};
20352+
20353+/*
20354+ Jnode is the "base class" of other nodes in reiser4. It also happens to
20355+ be exactly the node we use for unformatted tree nodes.
20356+
20357+ Jnode provides the following basic functionality:
20358+
20359+ . reference counting and indexing.
20360+
20361+ . integration with page cache. Jnode has ->pg reference to which page can
20362+ be attached.
20363+
20364+ . interface to transaction manager. It is jnode that is kept in transaction
20365+ manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20366+ means, there should be special type of jnode for inode.)
20367+
20368+ Locking:
20369+
20370+ Spin lock: the following fields are protected by the per-jnode spin lock:
20371+
20372+ ->state
20373+ ->atom
20374+ ->capture_link
20375+
20376+ Following fields are protected by the global tree lock:
20377+
20378+ ->link
20379+ ->key.z (content of ->key.z is only changed in znode_rehash())
20380+ ->key.j
20381+
20382+ Atomic counters
20383+
20384+ ->x_count
20385+ ->d_count
20386+
20387+ ->pg, and ->data are protected by spin lock for unused jnode and are
20388+ immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20389+ is false).
20390+
20391+ ->tree is immutable after creation
20392+
20393+ Unclear
20394+
20395+ ->blocknr: should be under jnode spin-lock, but current interface is based
20396+ on passing of block address.
20397+
20398+ If you ever need to spin lock two nodes at once, do this in "natural"
20399+ memory order: lock znode with lower address first. (See lock_two_nodes().)
20400+
20401+ Invariants involving this data-type:
20402+
20403+ [jnode-dirty]
20404+ [jnode-refs]
20405+ [jnode-oid]
20406+ [jnode-queued]
20407+ [jnode-atom-valid]
20408+ [jnode-page-binding]
20409+*/
20410+
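+/*
+ * A sketch of the address-ordering rule above for two distinct jnodes
+ * (illustration only; lock_two_nodes() is the canonical helper for
+ * znodes):
+ *
+ *	if (node1 < node2) {
+ *		spin_lock_jnode(node1);
+ *		spin_lock_jnode(node2);
+ *	} else {
+ *		spin_lock_jnode(node2);
+ *		spin_lock_jnode(node1);
+ *	}
+ */
+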
20411+struct jnode {
20412+#if REISER4_DEBUG
20413+#define JMAGIC 0x52654973 /* "ReIs" */
20414+ int magic;
20415+#endif
20416+ /* FIRST CACHE LINE (16 bytes): data used by jload */
20417+
20418+ /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20419+ /* 0 */ unsigned long state;
20420+
20421+ /* lock, protecting jnode's fields. */
20422+ /* 4 */ spinlock_t load;
20423+
20424+ /* counter of references to jnode itself. Increased on jref().
20425+ Decreased on jput().
20426+ */
20427+ /* 8 */ atomic_t x_count;
20428+
20429+ /* counter of references to jnode's data. Pin data page(s) in
20430+ memory while this is greater than 0. Increased on jload().
20431+ Decreased on jrelse().
20432+ */
20433+ /* 12 */ atomic_t d_count;
20434+
20435+ /* SECOND CACHE LINE: data used by hash table lookups */
20436+
20437+ /* 16 */ union {
20438+ /* znodes are hashed by block number */
20439+ reiser4_block_nr z;
20440+ /* unformatted nodes are hashed by mapping plus offset */
20441+ struct jnode_key j;
20442+ } key;
20443+
20444+ /* THIRD CACHE LINE */
20445+
20446+ /* 32 */ union {
20447+ /* pointers to maintain hash-table */
20448+ z_hash_link z;
20449+ j_hash_link j;
20450+ } link;
20451+
20452+ /* pointer to jnode page. */
20453+ /* 36 */ struct page *pg;
20454+ /* pointer to node itself. This is page_address(node->pg) when page is
20455+ attached to the jnode
20456+ */
20457+ /* 40 */ void *data;
20458+
20459+ /* 44 */ reiser4_tree *tree;
20460+
20461+ /* FOURTH CACHE LINE: atom related fields */
20462+
20463+ /* 48 */ spinlock_t guard;
20464+
20465+ /* atom the block is in, if any */
20466+ /* 52 */ txn_atom *atom;
20467+
20468+ /* capture list */
20469+ /* 56 */ struct list_head capture_link;
20470+
20471+ /* FIFTH CACHE LINE */
20472+
20473+ /* 64 */ struct rcu_head rcu;
20474+ /* crosses cache line */
20475+
20476+ /* SIXTH CACHE LINE */
20477+
20478+ /* the real blocknr (where io is going to/from) */
20479+ /* 80 */ reiser4_block_nr blocknr;
20480+	/* Parent item type; unformatted and cryptcompress (CRC) files need it for offset => key conversion. */
20481+ /* NOTE: this parent_item_id looks like jnode type. */
20482+ /* 88 */ reiser4_plugin_id parent_item_id;
20483+ /* 92 */
20484+#if REISER4_DEBUG
20485+ /* list of all jnodes for debugging purposes. */
20486+ struct list_head jnodes;
20487+ /* how many times this jnode was written in one transaction */
20488+ int written;
20489+ /* this indicates which atom's list the jnode is on */
20490+ atom_list list;
20491+#endif
20492+} __attribute__ ((aligned(16)));
20493+
20494+/*
20495+ * jnode types. Enumeration of existing jnode types.
20496+ */
20497+typedef enum {
20498+ JNODE_UNFORMATTED_BLOCK, /* unformatted block */
20499+ JNODE_FORMATTED_BLOCK, /* formatted block, znode */
20500+ JNODE_BITMAP, /* bitmap */
20501+ JNODE_IO_HEAD, /* jnode representing a block in the
20502+ * wandering log */
20503+ JNODE_INODE, /* jnode embedded into inode */
20504+ LAST_JNODE_TYPE
20505+} jnode_type;
20506+
20507+/* jnode states */
20508+typedef enum {
20509+ /* jnode's page is loaded and data checked */
20510+ JNODE_PARSED = 0,
20511+	/* node was deleted, but not all locks on it were released. This
20512+ node is empty and is going to be removed from the tree
20513+ shortly. */
20514+ JNODE_HEARD_BANSHEE = 1,
20515+ /* left sibling pointer is valid */
20516+ JNODE_LEFT_CONNECTED = 2,
20517+ /* right sibling pointer is valid */
20518+ JNODE_RIGHT_CONNECTED = 3,
20519+
20520+ /* znode was just created and doesn't yet have a pointer from
20521+ its parent */
20522+ JNODE_ORPHAN = 4,
20523+
20524+ /* this node was created by its transaction and has not been assigned
20525+ a block address. */
20526+ JNODE_CREATED = 5,
20527+
20528+ /* this node is currently relocated */
20529+ JNODE_RELOC = 6,
20530+ /* this node is currently wandered */
20531+ JNODE_OVRWR = 7,
20532+
20533+ /* this znode has been modified */
20534+ JNODE_DIRTY = 8,
20535+
20536+ /* znode lock is being invalidated */
20537+ JNODE_IS_DYING = 9,
20538+
20539+ /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
20540+
20541+ /* jnode is queued for flushing. */
20542+ JNODE_FLUSH_QUEUED = 12,
20543+
20544+ /* In the following bits jnode type is encoded. */
20545+ JNODE_TYPE_1 = 13,
20546+ JNODE_TYPE_2 = 14,
20547+ JNODE_TYPE_3 = 15,
20548+
20549+ /* jnode is being destroyed */
20550+ JNODE_RIP = 16,
20551+
20552+	/* znode was not captured during locking (it might be so because
20553+ ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
20554+ JNODE_MISSED_IN_CAPTURE = 17,
20555+
20556+ /* write is in progress */
20557+ JNODE_WRITEBACK = 18,
20558+
20559+ /* FIXME: now it is used by crypto-compress plugin only */
20560+ JNODE_NEW = 19,
20561+
20562+ /* delimiting keys are already set for this znode. */
20563+ JNODE_DKSET = 20,
20564+
20565+	/* when this bit is set, the page and jnode cannot be disconnected */
20566+ JNODE_WRITE_PREPARED = 21,
20567+
20568+ JNODE_CLUSTER_PAGE = 22,
20569+	/* Jnode is marked for repacking, which means the reiser4 flush and the
20570+	 * block allocator should process this node in a special way */
20571+ JNODE_REPACK = 23,
20572+ /* node should be converted by flush in squalloc phase */
20573+ JNODE_CONVERTIBLE = 24,
20574+ /*
20575+ * When jnode is dirtied for the first time in given transaction,
20576+	 * do_jnode_make_dirty() checks whether this jnode can possibly become a
20577+	 * member of the overwrite set. If so, this bit is set, and one block is
20578+ * reserved in the ->flush_reserved space of atom.
20579+ *
20580+ * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
20581+ *
20582+ * (1) flush decides that we want this block to go into relocate
20583+ * set after all.
20584+ *
20585+ * (2) wandering log is allocated (by log writer)
20586+ *
20587+ * (3) extent is allocated
20588+ *
20589+ */
20590+ JNODE_FLUSH_RESERVED = 29
20591+} reiser4_jnode_state;
20592+
20593+/* Helpers for accessing the jnode state. */
20594+
20595+static inline void JF_CLR(jnode * j, int f)
20596+{
20597+ assert("unknown-1", j->magic == JMAGIC);
20598+ clear_bit(f, &j->state);
20599+}
20600+static inline int JF_ISSET(const jnode * j, int f)
20601+{
20602+ assert("unknown-2", j->magic == JMAGIC);
20603+ return test_bit(f, &((jnode *) j)->state);
20604+}
20605+static inline void JF_SET(jnode * j, int f)
20606+{
20607+ assert("unknown-3", j->magic == JMAGIC);
20608+ set_bit(f, &j->state);
20609+}
20610+
20611+static inline int JF_TEST_AND_SET(jnode * j, int f)
20612+{
20613+ assert("unknown-4", j->magic == JMAGIC);
20614+ return test_and_set_bit(f, &j->state);
20615+}
20616+
20617+static inline void spin_lock_jnode(jnode *node)
20618+{
20619+ /* check that spinlocks of lower priorities are not held */
20620+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
20621+ LOCK_CNT_NIL(spin_locked_txnh) &&
20622+ LOCK_CNT_NIL(spin_locked_zlock) &&
20623+ LOCK_CNT_NIL(rw_locked_dk) &&
20624+ LOCK_CNT_LT(spin_locked_jnode, 2)));
20625+
20626+ spin_lock(&(node->guard));
20627+
20628+ LOCK_CNT_INC(spin_locked_jnode);
20629+ LOCK_CNT_INC(spin_locked);
20630+}
20631+
20632+static inline void spin_unlock_jnode(jnode *node)
20633+{
20634+ assert_spin_locked(&(node->guard));
20635+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
20636+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
20637+
20638+ LOCK_CNT_DEC(spin_locked_jnode);
20639+ LOCK_CNT_DEC(spin_locked);
20640+
20641+ spin_unlock(&(node->guard));
20642+}
20643+
20644+static inline int jnode_is_in_deleteset(const jnode * node)
20645+{
20646+ return JF_ISSET(node, JNODE_RELOC);
20647+}
20648+
20649+extern int init_jnodes(void);
20650+extern void done_jnodes(void);
20651+
20652+/* Jnode routines */
20653+extern jnode *jalloc(void);
20654+extern void jfree(jnode * node) NONNULL;
20655+extern jnode *jclone(jnode *);
20656+extern jnode *jlookup(reiser4_tree * tree,
20657+ oid_t objectid, unsigned long ind) NONNULL;
20658+extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
20659+extern jnode *jnode_by_page(struct page *pg) NONNULL;
20660+extern jnode *jnode_of_page(struct page *pg) NONNULL;
20661+void jnode_attach_page(jnode * node, struct page *pg);
20662+
20663+void unhash_unformatted_jnode(jnode *);
20664+extern jnode *page_next_jnode(jnode * node) NONNULL;
20665+extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
20666+extern void jnode_make_dirty(jnode * node) NONNULL;
20667+extern void jnode_make_clean(jnode * node) NONNULL;
20668+extern void jnode_make_wander_nolock(jnode * node) NONNULL;
20669+extern void jnode_make_wander(jnode *) NONNULL;
20670+extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL;
20671+extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
20672+extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
20673+
20674+/**
20675+ * jnode_get_block
20676+ * @node: jnode to query
20677+ *
20678+ */
20679+static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
20680+{
20681+ assert("nikita-528", node != NULL);
20682+
20683+ return &node->blocknr;
20684+}
20685+
20686+/**
20687+ * jnode_set_block
20688+ * @node: jnode to update
20689+ * @blocknr: new block nr
20690+ */
20691+static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
20692+{
20693+ assert("nikita-2020", node != NULL);
20694+ assert("umka-055", blocknr != NULL);
20695+ node->blocknr = *blocknr;
20696+}
20697+
20698+
20699+/* block number for IO. Usually this is the same as jnode_get_block(), unless
20700+ * jnode was emergency flushed---then block number chosen by eflush is
20701+ * used. */
20702+static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
20703+{
20704+ assert("nikita-2768", node != NULL);
20705+ assert_spin_locked(&(node->guard));
20706+
20707+ return jnode_get_block(node);
20708+}
20709+
20710+/* Jnode flush interface. */
20711+extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos);
20712+extern flush_queue_t *reiser4_pos_fq(flush_pos_t * pos);
20713+
20714+/* FIXME-VS: these are used in plugin/item/extent.c */
20715+
20716+/* does extent_get_block have to be called */
20717+#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED)
20718+#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
20719+
20720+/* the node should be converted during flush squalloc phase */
20721+#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE)
20722+#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE)
20723+
20724+/* Macros to convert from jnode to znode, znode to jnode. These are macros
20725+ because C doesn't allow overloading of const prototypes. */
20726+#define ZJNODE(x) (& (x) -> zjnode)
20727+#define JZNODE(x) \
20728+({ \
20729+ typeof (x) __tmp_x; \
20730+ \
20731+ __tmp_x = (x); \
20732+ assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \
20733+ (znode*) __tmp_x; \
20734+})
20735+
20736+extern int jnodes_tree_init(reiser4_tree * tree);
20737+extern int jnodes_tree_done(reiser4_tree * tree);
20738+
20739+#if REISER4_DEBUG
20740+
20741+extern int znode_is_any_locked(const znode * node);
20742+extern void jnode_list_remove(jnode * node);
20743+
20744+#else
20745+
20746+#define jnode_list_remove(node) noop
20747+
20748+#endif
20749+
20750+int znode_is_root(const znode * node) NONNULL;
20751+
20752+/* bump reference counter on @node */
20753+static inline void add_x_ref(jnode * node /* node to increase x_count of */ )
20754+{
20755+ assert("nikita-1911", node != NULL);
20756+
20757+ atomic_inc(&node->x_count);
20758+ LOCK_CNT_INC(x_refs);
20759+}
20760+
20761+static inline void dec_x_ref(jnode * node)
20762+{
20763+ assert("nikita-3215", node != NULL);
20764+ assert("nikita-3216", atomic_read(&node->x_count) > 0);
20765+
20766+ atomic_dec(&node->x_count);
20767+ assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
20768+ LOCK_CNT_DEC(x_refs);
20769+}
20770+
20771+/* jref() - increase counter of references to jnode/znode (x_count) */
20772+static inline jnode *jref(jnode * node)
20773+{
20774+ assert("jmacd-508", (node != NULL) && !IS_ERR(node));
20775+ add_x_ref(node);
20776+ return node;
20777+}
20778+
20779+/* get the page of jnode */
20780+static inline struct page *jnode_page(const jnode * node)
20781+{
20782+ return node->pg;
20783+}
20784+
20785+/* return pointer to jnode data */
20786+static inline char *jdata(const jnode * node)
20787+{
20788+ assert("nikita-1415", node != NULL);
20789+ assert("nikita-3198", jnode_page(node) != NULL);
20790+ return node->data;
20791+}
20792+
20793+static inline int jnode_is_loaded(const jnode * node)
20794+{
20795+ assert("zam-506", node != NULL);
20796+ return atomic_read(&node->d_count) > 0;
20797+}
20798+
20799+extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
20800+
20801+static inline void jnode_set_reloc(jnode * node)
20802+{
20803+ assert("nikita-2431", node != NULL);
20804+ assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
20805+ JF_SET(node, JNODE_RELOC);
20806+}
20807+
20808+/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
20809+
20810+extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
20811+
20812+static inline int jload(jnode *node)
20813+{
20814+ return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1);
20815+}
20816+
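+/*
+ * A minimal sketch of the bread/brelse-style usage (assumes @j is an
+ * already referenced jnode; error handling elided):
+ *
+ *	if (jload(j) == 0) {		// pin data: ->d_count++
+ *		char *p = jdata(j);	// valid until jrelse()
+ *		...			// read or modify the node
+ *		jrelse(j);		// unpin data: ->d_count--
+ *	}
+ */
+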
20817+extern int jinit_new(jnode *, gfp_t) NONNULL;
20818+extern int jstartio(jnode *) NONNULL;
20819+
20820+extern void jdrop(jnode *) NONNULL;
20821+extern int jwait_io(jnode *, int rw) NONNULL;
20822+
20823+void jload_prefetch(jnode *);
20824+
20825+extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL;
20826+extern void reiser4_drop_io_head(jnode * node) NONNULL;
20827+
20828+static inline reiser4_tree *jnode_get_tree(const jnode * node)
20829+{
20830+ assert("nikita-2691", node != NULL);
20831+ return node->tree;
20832+}
20833+
20834+extern void pin_jnode_data(jnode *);
20835+extern void unpin_jnode_data(jnode *);
20836+
20837+static inline jnode_type jnode_get_type(const jnode * node)
20838+{
20839+ static const unsigned long state_mask =
20840+ (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
20841+
20842+ static jnode_type mask_to_type[] = {
20843+ /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
20844+
20845+ /* 000 */
20846+ [0] = JNODE_FORMATTED_BLOCK,
20847+ /* 001 */
20848+ [1] = JNODE_UNFORMATTED_BLOCK,
20849+ /* 010 */
20850+ [2] = JNODE_BITMAP,
20851+ /* 011 */
20852+ [3] = LAST_JNODE_TYPE, /*invalid */
20853+ /* 100 */
20854+ [4] = JNODE_INODE,
20855+ /* 101 */
20856+ [5] = LAST_JNODE_TYPE,
20857+ /* 110 */
20858+ [6] = JNODE_IO_HEAD,
20859+ /* 111 */
20860+ [7] = LAST_JNODE_TYPE, /* invalid */
20861+ };
20862+
20863+ return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
20864+}
20865+
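+/*
+ * Worked example of the decoding above: a bitmap jnode stores type code 2
+ * (binary 010), i.e. only the JNODE_TYPE_2 bit (bit 14) is set in ->state,
+ * so (state & state_mask) >> JNODE_TYPE_1 == 2 and mask_to_type[2] ==
+ * JNODE_BITMAP. Note that the stored codes are table indices, not enum
+ * values: JNODE_FORMATTED_BLOCK (enum 1) is encoded as 000 and
+ * JNODE_IO_HEAD (enum 3) as 110.
+ */
+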
20866+/* returns true if node is a znode */
20867+static inline int jnode_is_znode(const jnode * node)
20868+{
20869+ return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
20870+}
20871+
20872+static inline int jnode_is_flushprepped(jnode * node)
20873+{
20874+ assert("jmacd-78212", node != NULL);
20875+ assert_spin_locked(&(node->guard));
20876+ return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
20877+ JF_ISSET(node, JNODE_OVRWR);
20878+}
20879+
20880+/* Return true if @node has already been processed by the squeeze and allocate
20881+ process. This implies the block address has been finalized for the
20882+ duration of this atom (or it is clean and will remain in place). If this
20883+ returns true you may use the block number as a hint. */
20884+static inline int jnode_check_flushprepped(jnode * node)
20885+{
20886+ int result;
20887+
20888+ /* It must be clean or relocated or wandered. New allocations are set to relocate. */
20889+ spin_lock_jnode(node);
20890+ result = jnode_is_flushprepped(node);
20891+ spin_unlock_jnode(node);
20892+ return result;
20893+}
20894+
20895+/* returns true if node is unformatted */
20896+static inline int jnode_is_unformatted(const jnode * node)
20897+{
20898+ assert("jmacd-0123", node != NULL);
20899+ return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
20900+}
20901+
20902+/* returns true if node represents a cluster cache page */
20903+static inline int jnode_is_cluster_page(const jnode * node)
20904+{
20905+ assert("edward-50", node != NULL);
20906+ return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
20907+}
20908+
20909+/* returns true if node is an inode's built-in jnode */
20910+static inline int jnode_is_inode(const jnode * node)
20911+{
20912+ assert("vs-1240", node != NULL);
20913+ return jnode_get_type(node) == JNODE_INODE;
20914+}
20915+
20916+static inline jnode_plugin *jnode_ops_of(const jnode_type type)
20917+{
20918+ assert("nikita-2367", type < LAST_JNODE_TYPE);
20919+ return jnode_plugin_by_id((reiser4_plugin_id) type);
20920+}
20921+
20922+static inline jnode_plugin *jnode_ops(const jnode * node)
20923+{
20924+ assert("nikita-2366", node != NULL);
20925+
20926+ return jnode_ops_of(jnode_get_type(node));
20927+}
20928+
20929+/* Get the index of a block. */
20930+static inline unsigned long jnode_get_index(jnode * node)
20931+{
20932+ return jnode_ops(node)->index(node);
20933+}
20934+
20935+/* return true if "node" is the root */
20936+static inline int jnode_is_root(const jnode * node)
20937+{
20938+ return jnode_is_znode(node) && znode_is_root(JZNODE(node));
20939+}
20940+
20941+extern struct address_space *mapping_jnode(const jnode * node);
20942+extern unsigned long index_jnode(const jnode * node);
20943+
20944+static inline void jput(jnode * node);
20945+extern void jput_final(jnode * node);
20946+
20947+/* bump data counter on @node */
20948+static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
20949+{
20950+ assert("nikita-1962", node != NULL);
20951+
20952+ atomic_inc(&node->d_count);
20953+ if (jnode_is_unformatted(node) || jnode_is_znode(node))
20954+ LOCK_CNT_INC(d_refs);
20955+}
20956+
20957+/* jput() - decrement x_count reference counter on jnode/znode.
20958+
20959+ Count may drop to 0, jnode stays in cache until memory pressure causes the
20960+ eviction of its page. The c_count variable also ensures that children are
20961+ pressured out of memory before the parent. The jnode remains hashed as
20962+ long as the VM allows its page to stay in memory.
20963+*/
20964+static inline void jput(jnode * node)
20965+{
20966+ assert("jmacd-509", node != NULL);
20967+ assert("jmacd-510", atomic_read(&node->x_count) > 0);
20968+ assert("zam-926", reiser4_schedulable());
20969+ LOCK_CNT_DEC(x_refs);
20970+
20971+ rcu_read_lock();
20972+ /*
20973+ * we don't need any kind of lock here--jput_final() uses RCU.
20974+ */
20975+ if (unlikely(atomic_dec_and_test(&node->x_count))) {
20976+ jput_final(node);
20977+ } else
20978+ rcu_read_unlock();
20979+ assert("nikita-3473", reiser4_schedulable());
20980+}
20981+
20982+extern void jrelse(jnode * node);
20983+extern void jrelse_tail(jnode * node);
20984+
20985+extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
20986+
20987+/* resolve race with jput */
20988+static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
20989+{
20990+ if (unlikely(JF_ISSET(node, JNODE_RIP)))
20991+ node = jnode_rip_sync(tree, node);
20992+ return node;
20993+}
20994+
20995+extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
20996+
20997+#if REISER4_DEBUG
20998+extern int jnode_invariant_f(const jnode *node, char const **msg);
20999+#endif
21000+
21001+extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
21002+
21003+/* __JNODE_H__ */
21004+#endif
21005+
21006+/* Make Linus happy.
21007+ Local variables:
21008+ c-indentation-style: "K&R"
21009+ mode-name: "LC"
21010+ c-basic-offset: 8
21011+ tab-width: 8
21012+ fill-column: 120
21013+ End:
21014+*/
21015diff -urN linux-2.6.24.orig/fs/reiser4/kassign.c linux-2.6.24/fs/reiser4/kassign.c
21016--- linux-2.6.24.orig/fs/reiser4/kassign.c 1970-01-01 03:00:00.000000000 +0300
21017+++ linux-2.6.24/fs/reiser4/kassign.c 2008-01-25 11:55:43.900543447 +0300
21018@@ -0,0 +1,677 @@
21019+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21020+ * reiser4/README */
21021+
21022+/* Key assignment policy implementation */
21023+
21024+/*
21025+ * In reiser4 every piece of file system data and meta-data has a key. Keys
21026+ * are used to store information in and retrieve it from reiser4 internal
21027+ * tree. In addition to this, keys define _ordering_ of all file system
21028+ * information: things having close keys are placed into the same or
21029+ * neighboring (in the tree order) nodes of the tree. As our block allocator
21030+ * tries to respect tree order (see flush.c), keys also define order in which
21031+ * things are laid out on the disk, and hence, affect performance directly.
21032+ *
21033+ * Obviously, assignment of keys to data and meta-data should be consistent
21034+ * across whole file system. Algorithm that calculates a key for a given piece
21035+ * of data or meta-data is referred to as "key assignment".
21036+ *
21037+ * Key assignment is too expensive to be implemented as a plugin (that is,
21038+ * with an ability to support different key assignment schemas in the same
21039+ * compiled kernel image). As a compromise, all key-assignment functions and
21040+ * data-structures are collected in this single file, so that modifications to
21041+ * key assignment algorithm can be localized. Additional changes may be
21042+ * required in key.[ch].
21043+ *
21044+ * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
21045+ * may guess, there is "Plan B" too.
21046+ *
21047+ */
21048+
21049+/*
21050+ * Additional complication with key assignment implementation is a requirement
21051+ * to support different key length.
21052+ */
21053+
21054+/*
21055+ * KEY ASSIGNMENT: PLAN A, LONG KEYS.
21056+ *
21057+ * DIRECTORY ITEMS
21058+ *
21059+ * | 60 | 4 | 7 |1| 56 | 64 | 64 |
21060+ * +--------------+---+---+-+-------------+------------------+-----------------+
21061+ * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
21062+ * +--------------+---+---+-+-------------+------------------+-----------------+
21063+ * | | | | |
21064+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21065+ *
21066+ * dirid objectid of directory this item is for
21067+ *
21068+ * F fibration, see fs/reiser4/plugin/fibration.[ch]
21069+ *
21070+ * H 1 if last 8 bytes of the key contain hash,
21071+ * 0 if last 8 bytes of the key contain prefix-3
21072+ *
21073+ * prefix-1 first 7 characters of file name.
21074+ * Padded by zeroes if name is not long enough.
21075+ *
21076+ * prefix-2 next 8 characters of the file name.
21077+ *
21078+ * prefix-3 next 8 characters of the file name.
21079+ *
21080+ * hash hash of the rest of file name (i.e., portion of file
21081+ * name not included into prefix-1 and prefix-2).
21082+ *
21083+ * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded
21084+ * in the key. Such file names are called "short". They are distinguished by H
21085+ * bit set to 0 in the key.
21086+ *
21087+ * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
21088+ * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
21089+ * key. Last 8 bytes of the key are occupied by hash of the remaining
21090+ * characters of the name.
21091+ *
21092+ * This key assignment reaches following important goals:
21093+ *
21094+ * (1) directory entries are sorted in approximately lexicographical
21095+ * order.
21096+ *
21097+ * (2) collisions (when multiple directory items have the same key), while
21098+ * principally unavoidable in a tree with fixed length keys, are rare.
21099+ *
21100+ * STAT DATA
21101+ *
21102+ * | 60 | 4 | 64 | 4 | 60 | 64 |
21103+ * +--------------+---+-----------------+---+--------------+-----------------+
21104+ * | locality id | 1 | ordering | 0 | objectid | 0 |
21105+ * +--------------+---+-----------------+---+--------------+-----------------+
21106+ * | | | | |
21107+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21108+ *
21109+ * locality id object id of a directory where first name was created for
21110+ * the object
21111+ *
21112+ * ordering copy of second 8-byte portion of the key of directory
21113+ * entry for the first name of this object. Ordering has a form
21114+ * {
21115+ * fibration :7;
21116+ * h :1;
21117+ * prefix1 :56;
21118+ * }
21119+ * see description of key for directory entry above.
21120+ *
21121+ * objectid object id for this object
21122+ *
21123+ * This key assignment policy is designed to keep stat-data in the same order
21124+ * as corresponding directory items, thus speeding up readdir/stat types of
21125+ * workload.
21126+ *
21127+ * FILE BODY
21128+ *
21129+ * | 60 | 4 | 64 | 4 | 60 | 64 |
21130+ * +--------------+---+-----------------+---+--------------+-----------------+
21131+ * | locality id | 4 | ordering | 0 | objectid | offset |
21132+ * +--------------+---+-----------------+---+--------------+-----------------+
21133+ * | | | | |
21134+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21135+ *
21136+ * locality id object id of a directory where first name was created for
21137+ * the object
21138+ *
21139+ * ordering the same as in the key of stat-data for this object
21140+ *
21141+ * objectid object id for this object
21142+ *
21143+ * offset logical offset from the beginning of this file.
21144+ * Measured in bytes.
21145+ *
21146+ *
21147+ * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21148+ *
21149+ * DIRECTORY ITEMS
21150+ *
21151+ * | 60 | 4 | 7 |1| 56 | 64 |
21152+ * +--------------+---+---+-+-------------+-----------------+
21153+ * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
21154+ * +--------------+---+---+-+-------------+-----------------+
21155+ * | | | |
21156+ * | 8 bytes | 8 bytes | 8 bytes |
21157+ *
21158+ * dirid objectid of directory this item is for
21159+ *
21160+ * F fibration, see fs/reiser4/plugin/fibration.[ch]
21161+ *
21162+ * H 1 if last 8 bytes of the key contain hash,
21163+ * 0 if last 8 bytes of the key contain prefix-2
21164+ *
21165+ * prefix-1 first 7 characters of file name.
21166+ * Padded by zeroes if name is not long enough.
21167+ *
21168+ * prefix-2 next 8 characters of the file name.
21169+ *
21170+ * hash hash of the rest of file name (i.e., portion of file
21171+ * name not included into prefix-1).
21172+ *
21173+ * File names shorter than 15 (== 7 + 8) characters are completely encoded in
21174+ * the key. Such file names are called "short". They are distinguished by H
21175+ * bit set to 0 in the key.
21176+ *
21177+ * Other file names are "long". For a long name, H bit is 1, and the first 7
21178+ * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
21179+ * key are occupied by hash of the remaining characters of the name.
21180+ *
21181+ * STAT DATA
21182+ *
21183+ * | 60 | 4 | 4 | 60 | 64 |
21184+ * +--------------+---+---+--------------+-----------------+
21185+ * | locality id | 1 | 0 | objectid | 0 |
21186+ * +--------------+---+---+--------------+-----------------+
21187+ * | | | |
21188+ * | 8 bytes | 8 bytes | 8 bytes |
21189+ *
21190+ * locality id object id of a directory where first name was created for
21191+ * the object
21192+ *
21193+ * objectid object id for this object
21194+ *
21195+ * FILE BODY
21196+ *
21197+ * | 60 | 4 | 4 | 60 | 64 |
21198+ * +--------------+---+---+--------------+-----------------+
21199+ * | locality id | 4 | 0 | objectid | offset |
21200+ * +--------------+---+---+--------------+-----------------+
21201+ * | | | |
21202+ * | 8 bytes | 8 bytes | 8 bytes |
21203+ *
21204+ * locality id object id of a directory where first name was created for
21205+ * the object
21206+ *
21207+ * objectid object id for this object
21208+ *
21209+ * offset logical offset from the beginning of this file.
21210+ * Measured in bytes.
21211+ *
21212+ *
21213+ */
21214+
21215+#include "debug.h"
21216+#include "key.h"
21217+#include "kassign.h"
21218+#include "vfs_ops.h"
21219+#include "inode.h"
21220+#include "super.h"
21221+#include "dscale.h"
21222+
21223+#include <linux/types.h> /* for __u?? */
21224+#include <linux/fs.h> /* for struct super_block, etc */
21225+
21226+/* bitmask for H bit (see comment at the beginning of this file) */
21227+static const __u64 longname_mark = 0x0100000000000000ull;
21228+/* bitmask for F and H portions of the key. */
21229+static const __u64 fibration_mask = 0xff00000000000000ull;
21230+
21231+/* return true if name is not completely encoded in @key */
21232+int is_longname_key(const reiser4_key * key)
21233+{
21234+ __u64 highpart;
21235+
21236+ assert("nikita-2863", key != NULL);
21237+ if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21238+ reiser4_print_key("oops", key);
21239+ assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21240+
21241+ if (REISER4_LARGE_KEY)
21242+ highpart = get_key_ordering(key);
21243+ else
21244+ highpart = get_key_objectid(key);
21245+
21246+ return (highpart & longname_mark) ? 1 : 0;
21247+}
21248+
21249+/* return true if @name is too long to be completely encoded in the key */
21250+int is_longname(const char *name UNUSED_ARG, int len)
21251+{
21252+ if (REISER4_LARGE_KEY)
21253+ return len > 23;
21254+ else
21255+ return len > 15;
21256+}
21257+
21258+/* encode an ASCII string into a __u64.
21259+
21260+   Put characters of @name into the result (@str) one after another, starting
21261+   from the @start_idx-th highest (arithmetically) byte. This produces an
21262+   endian-safe encoding; memcpy(3) will not do.
21263+
21264+*/
21265+static __u64 pack_string(const char *name /* string to encode */ ,
21266+ int start_idx /* highest byte in result from
21267+ * which to start encoding */ )
21268+{
21269+ unsigned i;
21270+ __u64 str;
21271+
21272+ str = 0;
21273+ for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21274+ str <<= 8;
21275+ str |= (unsigned char)name[i];
21276+ }
21277+ str <<= (sizeof str - i - start_idx) << 3;
21278+ return str;
21279+}
21280+
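+/*
+ * Worked example: pack_string("ab", 1) packs 'a' (0x61) and 'b' (0x62)
+ * into str == 0x6162 and then shifts left by (8 - 2 - 1) * 8 == 40 bits,
+ * yielding 0x0061620000000000ull: the highest byte stays zero so that the
+ * caller can merge the fibration and H bits into it. With start_idx == 0
+ * the same name would encode as 0x6162000000000000ull.
+ */
+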
21281+/* opposite to pack_string(). Takes value produced by pack_string(), restores
21282+ * string encoded in it and stores result in @buf */
21283+char * reiser4_unpack_string(__u64 value, char *buf)
21284+{
21285+ do {
21286+ *buf = value >> (64 - 8);
21287+ if (*buf)
21288+ ++buf;
21289+ value <<= 8;
21290+ } while (value != 0);
21291+ *buf = 0;
21292+ return buf;
21293+}
21294+
21295+/* obtain name encoded in @key and store it in @buf */
21296+char *extract_name_from_key(const reiser4_key * key, char *buf)
21297+{
21298+ char *c;
21299+
21300+ assert("nikita-2868", !is_longname_key(key));
21301+
21302+ c = buf;
21303+ if (REISER4_LARGE_KEY) {
21304+ c = reiser4_unpack_string(get_key_ordering(key) &
21305+ ~fibration_mask, c);
21306+ c = reiser4_unpack_string(get_key_fulloid(key), c);
21307+ } else
21308+ c = reiser4_unpack_string(get_key_fulloid(key) &
21309+ ~fibration_mask, c);
21310+ reiser4_unpack_string(get_key_offset(key), c);
21311+ return buf;
21312+}
21313+
21314+/**
21315+ * complete_entry_key - calculate entry key by name
21316+ * @dir: directory where entry is (or will be) in
21317+ * @name: name to calculate key of
21318+ * @len: length of name
21319+ * @result: place to store result in
21320+ *
21321+ * Sets fields of entry key @result which depend on file name.
21322+ * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
21323+ * objectid and offset. Otherwise, objectid and offset are set.
21324+ */
21325+void complete_entry_key(const struct inode *dir, const char *name,
21326+ int len, reiser4_key *result)
21327+{
21328+#if REISER4_LARGE_KEY
21329+ __u64 ordering;
21330+ __u64 objectid;
21331+ __u64 offset;
21332+
21333+ assert("nikita-1139", dir != NULL);
21334+ assert("nikita-1142", result != NULL);
21335+ assert("nikita-2867", strlen(name) == len);
21336+
21337+ /*
21338+ * key allocation algorithm for directory entries in case of large
21339+ * keys:
21340+ *
21341+	 * If the name is not longer than 7 + 8 + 8 = 23 characters, put the
21342+	 * first 7 characters into the ordering field of the key, the next 8
21343+	 * characters (if any) into the objectid field, and the next 8 (if any)
21344+	 * into the offset field.
21345+	 *
21346+	 * If the file name is longer than 23 characters, put the first 7
21347+	 * characters into the key's ordering, the next 8 into objectid, and
21348+	 * the hash of the remaining characters into the offset field.
21349+	 *
21350+	 * To distinguish the above cases, in the latter case the otherwise
21351+	 * unused high bit of the ordering field is set.
21352+ */
21353+
21354+ /* [0-6] characters to ordering */
21355+ ordering = pack_string(name, 1);
21356+ if (len > 7) {
21357+ /* [7-14] characters to objectid */
21358+ objectid = pack_string(name + 7, 0);
21359+ if (len > 15) {
21360+ if (len <= 23) {
21361+ /* [15-23] characters to offset */
21362+ offset = pack_string(name + 15, 0);
21363+ } else {
21364+ /* note in a key the fact that offset contains hash. */
21365+ ordering |= longname_mark;
21366+
21367+ /* offset is the hash of the file name's tail. */
21368+ offset = inode_hash_plugin(dir)->hash(name + 15,
21369+ len - 15);
21370+ }
21371+ } else {
21372+ offset = 0ull;
21373+ }
21374+ } else {
21375+ objectid = 0ull;
21376+ offset = 0ull;
21377+ }
21378+
21379+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21380+ ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21381+
21382+ set_key_ordering(result, ordering);
21383+ set_key_fulloid(result, objectid);
21384+ set_key_offset(result, offset);
21385+ return;
21386+
21387+#else
21388+ __u64 objectid;
21389+ __u64 offset;
21390+
21391+ assert("nikita-1139", dir != NULL);
21392+ assert("nikita-1142", result != NULL);
21393+ assert("nikita-2867", strlen(name) == len);
21394+
21395+ /*
21396+ * key allocation algorithm for directory entries in case of not large
21397+ * keys:
21398+ *
21399+	 * If the name is not longer than 7 + 8 = 15 characters, put the first
21400+	 * 7 characters into the objectid field of the key and the next 8
21401+	 * characters (if any) into the offset field.
21402+	 *
21403+	 * If the file name is longer than 15 characters, put the first 7
21404+	 * characters into the key's objectid, and the hash of the remaining
21405+	 * characters into the offset field.
21406+	 *
21407+	 * To distinguish the above cases, in the latter case the otherwise
21408+	 * unused high bit of the objectid field is set.
21409+ */
21410+
21411+ /* [0-6] characters to objectid */
21412+ objectid = pack_string(name, 1);
21413+ if (len > 7) {
21414+ if (len <= 15) {
21415+ /* [7-14] characters to offset */
21416+ offset = pack_string(name + 7, 0);
21417+ } else {
21418+ /* note in a key the fact that offset contains hash. */
21419+ objectid |= longname_mark;
21420+
21421+ /* offset is the hash of the file name. */
21422+ offset = inode_hash_plugin(dir)->hash(name + 7,
21423+ len - 7);
21424+ }
21425+ } else
21426+ offset = 0ull;
21427+
21428+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21429+ objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21430+
21431+ set_key_fulloid(result, objectid);
21432+ set_key_offset(result, offset);
21433+ return;
21434+#endif /* ! REISER4_LARGE_KEY */
21435+}
21436+
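+/*
+ * Worked example for the large-key case above: for the 5-character name
+ * "linux", ordering == pack_string("linux", 1) | fibration bits,
+ * objectid == 0 and offset == 0. For a 30-character name, ordering holds
+ * characters [0-6] with longname_mark set, objectid holds characters
+ * [7-14], and offset holds the hash of the remaining 15 characters.
+ */
+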
21437+/* true, if @key is the key of "." */
21438+int is_dot_key(const reiser4_key * key /* key to check */ )
21439+{
21440+ assert("nikita-1717", key != NULL);
21441+ assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21442+ return
21443+ (get_key_ordering(key) == 0ull) &&
21444+ (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
21445+}
21446+
21447+/* build key for stat-data.
21448+
21449+   return key of stat-data of this object. This should become an sd plugin
21450+ method in the future. For now, let it be here.
21451+
21452+*/
21453+reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
21454+ reiser4_key * result /* resulting key of @target
21455+ stat-data */ )
21456+{
21457+ assert("nikita-261", result != NULL);
21458+
21459+ reiser4_key_init(result);
21460+ set_key_locality(result, reiser4_inode_data(target)->locality_id);
21461+ set_key_ordering(result, get_inode_ordering(target));
21462+ set_key_objectid(result, get_inode_oid(target));
21463+ set_key_type(result, KEY_SD_MINOR);
21464+ set_key_offset(result, (__u64) 0);
21465+ return result;
21466+}
21467+
21468+/* encode part of key into &obj_key_id
21469+
21470+ This encodes into @id part of @key sufficient to restore @key later,
21471+ given that latter is key of object (key of stat-data).
21472+
21473+ See &obj_key_id
21474+*/
21475+int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21476+ obj_key_id * id /* id where key is encoded in */ )
21477+{
21478+ assert("nikita-1151", key != NULL);
21479+ assert("nikita-1152", id != NULL);
21480+
21481+ memcpy(id, key, sizeof *id);
21482+ return 0;
21483+}
21484+
21485+/* encode reference to @obj in @id.
21486+
21487+ This is like build_obj_key_id() above, but takes inode as parameter. */
21488+int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21489+ obj_key_id * id /* result */ )
21490+{
21491+ reiser4_key sdkey;
21492+
21493+ assert("nikita-1166", obj != NULL);
21494+ assert("nikita-1167", id != NULL);
21495+
21496+ build_sd_key(obj, &sdkey);
21497+ build_obj_key_id(&sdkey, id);
21498+ return 0;
21499+}
21500+
21501+/* decode @id back into @key
21502+
21503+ Restore key of object stat-data from @id. This is dual to
21504+ build_obj_key_id() above.
21505+*/
21506+int extract_key_from_id(const obj_key_id * id /* object key id to extract key
21507+ * from */ ,
21508+ reiser4_key * key /* result */ )
21509+{
21510+ assert("nikita-1153", id != NULL);
21511+ assert("nikita-1154", key != NULL);
21512+
21513+ reiser4_key_init(key);
21514+ memcpy(key, id, sizeof *id);
21515+ return 0;
21516+}
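+
+#if 0
+/* Illustrative only: the two helpers above are duals, which suggests a
+   simple round-trip property. The assertion label is made up. */
+static void check_obj_key_id_roundtrip(struct inode *inode)
+{
+	reiser4_key k1, k2;
+	obj_key_id id;
+
+	build_sd_key(inode, &k1);
+	build_obj_key_id(&k1, &id);
+	extract_key_from_id(&id, &k2);
+	assert("edit-0001", keyeq(&k1, &k2));
+}
+#endif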
21517+
21518+/* extract objectid of directory from key of directory entry within said
21519+ directory.
21520+ */
21521+oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
21522+ * directory
21523+ * entry */ )
21524+{
21525+ assert("nikita-1314", de_key != NULL);
21526+ return get_key_locality(de_key);
21527+}
21528+
21529+/* encode into @id key of directory entry.
21530+
21531+ Encode into @id information sufficient to later distinguish directory
21532+ entries within the same directory. This is not the whole key, because all
21533+ directory entries within a directory item share the locality, which is
21534+ equal to the objectid of their directory.
21535+
21536+*/
21537+int build_de_id(const struct inode *dir /* inode of directory */ ,
21538+ const struct qstr *name /* name to be given to @obj by
21539+ * directory entry being
21540+ * constructed */ ,
21541+ de_id * id /* short key of directory entry */ )
21542+{
21543+ reiser4_key key;
21544+
21545+ assert("nikita-1290", dir != NULL);
21546+ assert("nikita-1292", id != NULL);
21547+
21548+ /* NOTE-NIKITA this is suboptimal. */
21549+ inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
21550+ return build_de_id_by_key(&key, id);
21551+}
21552+
21553+/* encode into @id key of directory entry.
21554+
21555+ Encode into @id information sufficient to later distinguish directory
21556+ entries within the same directory. This is not the whole key, because all
21557+ directory entries within a directory item share the locality, which is
21558+ equal to the objectid of their directory.
21559+
21560+*/
21561+int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
21562+ * entry */ ,
21563+ de_id * id /* short key of directory entry */ )
21564+{
21565+ memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
21566+ return 0;
21567+}
21568+
21569+/* restore from @id key of directory entry.
21570+
21571+ Function dual to build_de_id(): given @id and locality, build full
21572+ key of directory entry within directory item.
21573+
21574+*/
21575+int extract_key_from_de_id(const oid_t locality /* locality of directory
21576+ * entry */ ,
21577+ const de_id * id /* directory entry id */ ,
21578+ reiser4_key * key /* result */ )
21579+{
21580+ /* no need to initialise key here: all fields are overwritten */
21581+ memcpy(((__u64 *) key) + 1, id, sizeof *id);
21582+ set_key_locality(key, locality);
21583+ set_key_type(key, KEY_FILE_NAME_MINOR);
21584+ return 0;
21585+}
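+
+#if 0
+/* Illustrative only: a de_id plus the directory's locality restores the
+   full entry key, provided @entry_key is of type KEY_FILE_NAME_MINOR.
+   The assertion label is made up. */
+static void check_de_id_roundtrip(const reiser4_key * entry_key)
+{
+	reiser4_key restored;
+	de_id id;
+
+	build_de_id_by_key(entry_key, &id);
+	extract_key_from_de_id(get_key_locality(entry_key), &id, &restored);
+	assert("edit-0002", keyeq(entry_key, &restored));
+}
+#endif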
21586+
21587+/* compare two &de_id's */
21588+cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
21589+ const de_id * id2 /* second &de_id to compare */ )
21590+{
21591+ /* NOTE-NIKITA ugly implementation */
21592+ reiser4_key k1;
21593+ reiser4_key k2;
21594+
21595+ extract_key_from_de_id((oid_t) 0, id1, &k1);
21596+ extract_key_from_de_id((oid_t) 0, id2, &k2);
21597+ return keycmp(&k1, &k2);
21598+}
21599+
21600+/* compare &de_id with key */
21601+cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
21602+ const reiser4_key * key /* key to compare */ )
21603+{
21604+ cmp_t result;
21605+ reiser4_key *k1;
21606+
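+	/* de_id shares its layout with key elements 1..KEY_LAST_INDEX-1, so
+	 * step one element back from @id and compare element-wise, skipping
+	 * the locality element that de_id does not store */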
21607+ k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
21608+ result = KEY_DIFF_EL(k1, key, 1);
21609+ if (result == EQUAL_TO) {
21610+ result = KEY_DIFF_EL(k1, key, 2);
21611+ if (REISER4_LARGE_KEY && result == EQUAL_TO) {
21612+ result = KEY_DIFF_EL(k1, key, 3);
21613+ }
21614+ }
21615+ return result;
21616+}
21617+
21618+/*
21619+ * return number of bytes necessary to encode @inode identity.
21620+ */
21621+int inode_onwire_size(const struct inode *inode)
21622+{
21623+ int result;
21624+
21625+ result = dscale_bytes_to_write(get_inode_oid(inode));
21626+ result += dscale_bytes_to_write(get_inode_locality(inode));
21627+
21628+ /*
21629+ * ordering is large (it usually has highest bits set), so it makes
21630+ * little sense to dscale it.
21631+ */
21632+ if (REISER4_LARGE_KEY)
21633+ result += sizeof(get_inode_ordering(inode));
21634+ return result;
21635+}
21636+
21637+/*
21638+ * encode @inode identity at @start
21639+ */
21640+char *build_inode_onwire(const struct inode *inode, char *start)
21641+{
21642+ start += dscale_write(start, get_inode_locality(inode));
21643+ start += dscale_write(start, get_inode_oid(inode));
21644+
21645+ if (REISER4_LARGE_KEY) {
21646+ put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
21647+ start += sizeof(get_inode_ordering(inode));
21648+ }
21649+ return start;
21650+}
21651+
21652+/*
21653+ * extract key that was previously encoded by build_inode_onwire() at @addr
21654+ */
21655+char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
21656+{
21657+ __u64 val;
21658+
21659+ addr += dscale_read(addr, &val);
21660+ val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
21661+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
21662+ addr += dscale_read(addr, &val);
21663+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
21664+#if REISER4_LARGE_KEY
21665+ memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
21666+ addr += sizeof key_id->ordering;
21667+#endif
21668+ return addr;
21669+}
21670+
21671+/*
21672+ * skip a key that was previously encoded by build_inode_onwire() at @addr
21673+ * FIXME: handle IO errors.
21674+ */
21675+char * locate_obj_key_id_onwire(char * addr)
21676+{
21677+ /* locality */
21678+ addr += dscale_bytes_to_read(addr);
21679+ /* objectid */
21680+ addr += dscale_bytes_to_read(addr);
21681+#if REISER4_LARGE_KEY
21682+ addr += sizeof ((obj_key_id *)0)->ordering;
21683+#endif
21684+ return addr;
21685+}
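+
+#if 0
+/* Illustrative only: how the wire helpers above compose. The buffer size
+   and assertion labels are made up; inode_onwire_size() bounds the real
+   space requirement. */
+static void onwire_roundtrip_sketch(const struct inode *inode)
+{
+	char buf[32];	/* comfortably >= inode_onwire_size(inode) */
+	obj_key_id id;
+	char *end;
+
+	assert("edit-0003", inode_onwire_size(inode) <= (int)sizeof buf);
+	end = build_inode_onwire(inode, buf);
+	/* the decoder consumes exactly the bytes the encoder produced */
+	assert("edit-0004", extract_obj_key_id_from_onwire(buf, &id) == end);
+}
+#endif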
21686+
21687+/* Make Linus happy.
21688+ Local variables:
21689+ c-indentation-style: "K&R"
21690+ mode-name: "LC"
21691+ c-basic-offset: 8
21692+ tab-width: 8
21693+ fill-column: 120
21694+ End:
21695+*/
21696diff -urN linux-2.6.24.orig/fs/reiser4/kassign.h linux-2.6.24/fs/reiser4/kassign.h
21697--- linux-2.6.24.orig/fs/reiser4/kassign.h 1970-01-01 03:00:00.000000000 +0300
21698+++ linux-2.6.24/fs/reiser4/kassign.h 2008-01-25 11:55:43.900543447 +0300
21699@@ -0,0 +1,111 @@
21700+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21701+ * reiser4/README */
21702+
21703+/* Key assignment policy interface. See kassign.c for details. */
21704+
21705+#if !defined( __KASSIGN_H__ )
21706+#define __KASSIGN_H__
21707+
21708+#include "forward.h"
21709+#include "key.h"
21710+#include "dformat.h"
21711+
21712+#include <linux/types.h> /* for __u?? */
21713+#include <linux/fs.h> /* for struct super_block, etc */
21714+#include <linux/dcache.h> /* for struct qstr */
21715+
21716+/* key assignment functions */
21717+
21718+/* Information from which key of file stat-data can be uniquely
21719+ restored. This depends on key assignment policy for
21720+ stat-data. Currently it's enough to store object id and locality id
21721+ (60+60==120) bits, because minor packing locality and offset of
21722+ stat-data key are always known constants: KEY_SD_MINOR and 0
21723+ respectively. For simplicity 4 bits are wasted in each id, and just
21724+ two 64 bit integers are stored.
21725+
21726+ This field has to be byte-aligned, because we don't want to waste
21727+ space in directory entries. There is another side of the coin, of
21728+ course: we waste CPU and bus bandwidth instead, by copying data back
21729+ and forth.
21730+
21731+ Next optimization: &obj_key_id is mainly used to address stat data from
21732+ directory entries. Under the assumption that the majority of files have
21733+ only one name (one hard link) from *the* parent directory, it seems
21734+ reasonable to store only the objectid of stat data and take its locality
21735+ from the key of the directory item.
21736+
21737+ This requires some flag to be added to the &obj_key_id to distinguish
21738+ between these two cases. The remaining bits in the flag byte are then
21739+ free to store the file type.
21740+
21741+ This optimization requires changes in directory item handling code.
21742+
21743+*/
21744+typedef struct obj_key_id {
21745+ d8 locality[sizeof(__u64)];
21746+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
21747+ )
21748+ d8 objectid[sizeof(__u64)];
21749+}
21750+obj_key_id;
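+
+/* (sizeof(obj_key_id) is thus 16 bytes, or 24 with REISER4_LARGE_KEY;
+ * plain byte arrays keep the structure packed and byte-aligned, as the
+ * comment above requires) */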
21751+
21752+/* Information sufficient to uniquely identify directory entry within
21753+ compressed directory item.
21754+
21755+ For alignment issues see &obj_key_id above.
21756+*/
21757+typedef struct de_id {
21758+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
21759+ d8 objectid[sizeof(__u64)];
21760+ d8 offset[sizeof(__u64)];
21761+}
21762+de_id;
21763+
21764+extern int inode_onwire_size(const struct inode *obj);
21765+extern char *build_inode_onwire(const struct inode *obj, char *area);
21766+extern char *locate_obj_key_id_onwire(char *area);
21767+extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
21768+
21769+extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
21770+extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
21771+extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
21772+extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
21773+extern int build_de_id(const struct inode *dir, const struct qstr *name,
21774+ de_id * id);
21775+extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
21776+extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
21777+ reiser4_key * key);
21778+extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
21779+extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
21780+
21781+extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
21782+extern void build_entry_key_common(const struct inode *dir,
21783+ const struct qstr *name,
21784+ reiser4_key * result);
21785+extern void build_entry_key_stable_entry(const struct inode *dir,
21786+ const struct qstr *name,
21787+ reiser4_key * result);
21788+extern int is_dot_key(const reiser4_key * key);
21789+extern reiser4_key *build_sd_key(const struct inode *target,
21790+ reiser4_key * result);
21791+
21792+extern int is_longname_key(const reiser4_key * key);
21793+extern int is_longname(const char *name, int len);
21794+extern char *extract_name_from_key(const reiser4_key * key, char *buf);
21795+extern char *reiser4_unpack_string(__u64 value, char *buf);
21796+extern void complete_entry_key(const struct inode *dir, const char *name,
21797+ int len, reiser4_key *result);
21798+
21799+/* __KASSIGN_H__ */
21800+#endif
21801+
21802+/* Make Linus happy.
21803+ Local variables:
21804+ c-indentation-style: "K&R"
21805+ mode-name: "LC"
21806+ c-basic-offset: 8
21807+ tab-width: 8
21808+ fill-column: 120
21809+ End:
21810+*/
21811diff -urN linux-2.6.24.orig/fs/reiser4/Kconfig linux-2.6.24/fs/reiser4/Kconfig
21812--- linux-2.6.24.orig/fs/reiser4/Kconfig 1970-01-01 03:00:00.000000000 +0300
21813+++ linux-2.6.24/fs/reiser4/Kconfig 2008-01-25 11:39:06.944209750 +0300
21814@@ -0,0 +1,34 @@
21815+config REISER4_FS
21816+ tristate "Reiser4 (EXPERIMENTAL)"
21817+ depends on EXPERIMENTAL
21818+ select ZLIB_INFLATE
21819+ select ZLIB_DEFLATE
21820+ select LZO_COMPRESS
21821+ select LZO_DECOMPRESS
21822+ select CRYPTO
21823+ help
21824+ Reiser4 is a filesystem that performs all filesystem operations
21825+ as atomic transactions, which means that it either performs a
21826+ write, or it does not, and in the event of a crash it does not
21827+ partially perform it or corrupt it.
21828+
21829+ It stores files in dancing trees, which are like balanced trees but
21830+ faster. It packs small files together so that they share blocks
21831+ without wasting space. This means you can use it to store really
21832+ small files. It also means that it saves you disk space. It avoids
21833+ hassling you with anachronisms like having a maximum number of
21834+ inodes, and wasting space if you use less than that number.
21835+
21836+ Reiser4 is a distinct filesystem type from reiserfs (V3).
21837+ It's therefore not possible to use reiserfs file systems
21838+ with reiser4.
21839+
21840+ To learn more about reiser4, go to http://www.namesys.com
21841+
21842+config REISER4_DEBUG
21843+ bool "Enable reiser4 debug mode"
21844+ depends on REISER4_FS
21845+ help
21846+ Don't use this unless you are debugging reiser4.
21847+
21848+ If unsure, say N.
21849diff -urN linux-2.6.24.orig/fs/reiser4/key.c linux-2.6.24/fs/reiser4/key.c
21850--- linux-2.6.24.orig/fs/reiser4/key.c 1970-01-01 03:00:00.000000000 +0300
21851+++ linux-2.6.24/fs/reiser4/key.c 2008-01-25 11:39:06.944209750 +0300
21852@@ -0,0 +1,137 @@
21853+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21854+
21855+/* Key manipulations. */
21856+
21857+#include "debug.h"
21858+#include "key.h"
21859+#include "super.h"
21860+#include "reiser4.h"
21861+
21862+#include <linux/types.h> /* for __u?? */
21863+
21864+/* Minimal possible key: all components are zero. It is presumed that this is
21865+ independent of key scheme. */
21866+static const reiser4_key MINIMAL_KEY = {
21867+ .el = {
21868+ 0ull,
21869+ ON_LARGE_KEY(0ull,)
21870+ 0ull,
21871+ 0ull
21872+ }
21873+};
21874+
21875+/* Maximal possible key: all components are ~0. It is presumed that this is
21876+ independent of key scheme. */
21877+static const reiser4_key MAXIMAL_KEY = {
21878+ .el = {
21879+ __constant_cpu_to_le64(~0ull),
21880+ ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
21881+ __constant_cpu_to_le64(~0ull),
21882+ __constant_cpu_to_le64(~0ull)
21883+ }
21884+};
21885+
21886+/* Initialize key. */
21887+void reiser4_key_init(reiser4_key * key /* key to init */ )
21888+{
21889+ assert("nikita-1169", key != NULL);
21890+ memset(key, 0, sizeof *key);
21891+}
21892+
21893+/* minimal possible key in the tree. Return pointer to the static storage. */
21894+const reiser4_key *reiser4_min_key(void)
21895+{
21896+ return &MINIMAL_KEY;
21897+}
21898+
21899+/* maximum possible key in the tree. Return pointer to the static storage. */
21900+const reiser4_key *reiser4_max_key(void)
21901+{
21902+ return &MAXIMAL_KEY;
21903+}
21904+
21905+#if REISER4_DEBUG
21906+/* debugging aid: print symbolic name of key type */
21907+static const char *type_name(unsigned int key_type /* key type */ )
21908+{
21909+ switch (key_type) {
21910+ case KEY_FILE_NAME_MINOR:
21911+ return "file name";
21912+ case KEY_SD_MINOR:
21913+ return "stat data";
21914+ case KEY_ATTR_NAME_MINOR:
21915+ return "attr name";
21916+ case KEY_ATTR_BODY_MINOR:
21917+ return "attr body";
21918+ case KEY_BODY_MINOR:
21919+ return "file body";
21920+ default:
21921+ return "unknown";
21922+ }
21923+}
21924+
21925+/* debugging aid: print human readable information about key */
21926+void reiser4_print_key(const char *prefix /* prefix to print */ ,
21927+ const reiser4_key * key /* key to print */ )
21928+{
21929+ /* turn bold on */
21930+ /* printf ("\033[1m"); */
21931+ if (key == NULL)
21932+ printk("%s: null key\n", prefix);
21933+ else {
21934+ if (REISER4_LARGE_KEY)
21935+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
21936+ get_key_locality(key),
21937+ get_key_type(key),
21938+ get_key_ordering(key),
21939+ get_key_band(key),
21940+ get_key_objectid(key), get_key_offset(key));
21941+ else
21942+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
21943+ get_key_locality(key),
21944+ get_key_type(key),
21945+ get_key_band(key),
21946+ get_key_objectid(key), get_key_offset(key));
21947+ /*
21948+ * if this is a key of directory entry, try to decode part of
21949+ * a name stored in the key, and output it.
21950+ */
21951+ if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
21952+ char buf[DE_NAME_BUF_LEN];
21953+ char *c;
21954+
21955+ c = buf;
21956+ c = reiser4_unpack_string(get_key_ordering(key), c);
21957+ reiser4_unpack_string(get_key_fulloid(key), c);
21958+ printk("[%s", buf);
21959+ if (is_longname_key(key))
21960+ /*
21961+ * only part of the name is stored in the key.
21962+ */
21963+ printk("...]\n");
21964+ else {
21965+ /*
21966+ * whole name is stored in the key.
21967+ */
21968+ reiser4_unpack_string(get_key_offset(key), buf);
21969+ printk("%s]\n", buf);
21970+ }
21971+ } else {
21972+ printk("[%s]\n", type_name(get_key_type(key)));
21973+ }
21974+ }
21975+ /* turn bold off */
21976+ /* printf ("\033[m\017"); */
21977+}
21978+
21979+#endif
21980+
21981+/* Make Linus happy.
21982+ Local variables:
21983+ c-indentation-style: "K&R"
21984+ mode-name: "LC"
21985+ c-basic-offset: 8
21986+ tab-width: 8
21987+ fill-column: 120
21988+ End:
21989+*/
21990diff -urN linux-2.6.24.orig/fs/reiser4/key.h linux-2.6.24/fs/reiser4/key.h
21991--- linux-2.6.24.orig/fs/reiser4/key.h 1970-01-01 03:00:00.000000000 +0300
21992+++ linux-2.6.24/fs/reiser4/key.h 2008-01-25 11:39:06.944209750 +0300
21993@@ -0,0 +1,384 @@
21994+/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21995+
21996+/* Declarations of key-related data-structures and operations on keys. */
21997+
21998+#if !defined( __REISER4_KEY_H__ )
21999+#define __REISER4_KEY_H__
22000+
22001+#include "dformat.h"
22002+#include "forward.h"
22003+#include "debug.h"
22004+
22005+#include <linux/types.h> /* for __u?? */
22006+
22007+/* Operations on keys in reiser4 tree */
22008+
22009+/* No access to any of these fields shall be done except via a
22010+ wrapping macro/function, and that wrapping macro/function shall
22011+ convert to little endian order. Key comparisons are done in cpu byte order. */
22012+
22013+/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below
22014+ which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files
22015+ within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong
22016+ approach, and whether there should be no difference at all. For current usage patterns this choice is probably the
22017+ right one. */
22018+
22019+/* possible values for minor packing locality (4 bits required) */
22020+typedef enum {
22021+ /* file name */
22022+ KEY_FILE_NAME_MINOR = 0,
22023+ /* stat-data */
22024+ KEY_SD_MINOR = 1,
22025+ /* file attribute name */
22026+ KEY_ATTR_NAME_MINOR = 2,
22027+ /* file attribute value */
22028+ KEY_ATTR_BODY_MINOR = 3,
22029+ /* file body (tail or extent) */
22030+ KEY_BODY_MINOR = 4,
22031+} key_minor_locality;
22032+
22033+/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key.
22034+ Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space,
22035+ and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to
22036+ segregate extents from tails, it is better to give them distinct minor packing localities rather than changing
22037+ block_alloc.c to check the node type when deciding where to allocate the node.
22038+
22039+ The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it
22040+ should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our
22041+ current implementation tails have a different minor packing locality from extents, and no files have both extents and
22042+ tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now....
22043+*/
22044+
22045+/* Arbitrary major packing localities can be assigned to objects using
22046+ the reiser4(filenameA/..packing<=some_number) system call.
22047+
22048+ In reiser4, the creat() syscall creates a directory
22049+
22050+ whose default flow (that which is referred to if the directory is
22051+ read as a file) is the traditional unix file body.
22052+
22053+ whose directory plugin is the 'filedir'
22054+
22055+ whose major packing locality is that of the parent of the object created.
22056+
22057+ The static_stat item is a particular commonly used directory
22058+ compression (the one for normal unix files).
22059+
22060+ The filedir plugin checks to see if the static_stat item exists.
22061+ There is a unique key for static_stat. If yes, then it uses the
22062+ static_stat item for all of the values that it contains. The
22063+ static_stat item contains a flag for each stat it contains which
22064+ indicates whether one should look outside the static_stat item for its
22065+ contents.
22066+*/
22067+
22068+/* offset of fields in reiser4_key. Value of each element of this enum
22069+ is index within key (thought as array of __u64's) where this field
22070+ is. */
22071+typedef enum {
22072+ /* major "locale", aka dirid. Sits in 1st element */
22073+ KEY_LOCALITY_INDEX = 0,
22074+ /* minor "locale", aka item type. Sits in 1st element */
22075+ KEY_TYPE_INDEX = 0,
22076+ ON_LARGE_KEY(KEY_ORDERING_INDEX,)
22077+ /* "object band". Sits in 2nd element */
22078+ KEY_BAND_INDEX,
22079+ /* objectid. Sits in 2nd element */
22080+ KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
22081+ /* full objectid. Sits in 2nd element */
22082+ KEY_FULLOID_INDEX = KEY_BAND_INDEX,
22083+ /* Offset. Sits in 3rd element */
22084+ KEY_OFFSET_INDEX,
22085+ /* Name hash. Sits in 3rd element */
22086+ KEY_HASH_INDEX = KEY_OFFSET_INDEX,
22087+ KEY_CACHELINE_END = KEY_OFFSET_INDEX,
22088+ KEY_LAST_INDEX
22089+} reiser4_key_field_index;
22090+
22091+/* key in reiser4 internal "balanced" tree. It is just an array of three
22092+ (four, with large keys) 64bit integers in disk byte order (little-endian
22093+ by default). This array is actually indexed by reiser4_key_field. Each
22094+ __u64 within this array is called an "element". Logical key components
22095+ encoded within elements are called "fields".
22096+
22097+ We declare this as a union with a dummy second component to suppress
22098+ inconvenient array<->pointer casts implied in C. */
22099+union reiser4_key {
22100+ __le64 el[KEY_LAST_INDEX];
22101+ int pad;
22102+};
22103+
22104+/* bitmasks showing where within reiser4_key a particular field is stored. */
22105+/* major locality occupies higher 60 bits of the first element */
22106+#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
22107+
22108+/* minor locality occupies lower 4 bits of the first element */
22109+#define KEY_TYPE_MASK 0xfull
22110+
22111+/* controversial band occupies higher 4 bits of the 2nd element */
22112+#define KEY_BAND_MASK 0xf000000000000000ull
22113+
22114+/* objectid occupies lower 60 bits of the 2nd element */
22115+#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
22116+
22117+/* full 64bit objectid*/
22118+#define KEY_FULLOID_MASK 0xffffffffffffffffull
22119+
22120+/* offset is just the 3rd element itself */
22121+#define KEY_OFFSET_MASK 0xffffffffffffffffull
22122+
22123+/* ordering is whole second element */
22124+#define KEY_ORDERING_MASK 0xffffffffffffffffull
22125+
22126+/* how many bits a key element is shifted to pack or extract a particular field */
22127+typedef enum {
22128+ KEY_LOCALITY_SHIFT = 4,
22129+ KEY_TYPE_SHIFT = 0,
22130+ KEY_BAND_SHIFT = 60,
22131+ KEY_OBJECTID_SHIFT = 0,
22132+ KEY_FULLOID_SHIFT = 0,
22133+ KEY_OFFSET_SHIFT = 0,
22134+ KEY_ORDERING_SHIFT = 0,
22135+} reiser4_key_field_shift;
22136+
22137+static inline __u64
22138+get_key_el(const reiser4_key * key, reiser4_key_field_index off)
22139+{
22140+ assert("nikita-753", key != NULL);
22141+ assert("nikita-754", off < KEY_LAST_INDEX);
22142+ return le64_to_cpu(get_unaligned(&key->el[off]));
22143+}
22144+
22145+static inline void
22146+set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
22147+{
22148+ assert("nikita-755", key != NULL);
22149+ assert("nikita-756", off < KEY_LAST_INDEX);
22150+ put_unaligned(cpu_to_le64(value), &key->el[off]);
22151+}
22152+
22153+/* macro to define getter and setter functions for field F with type T */
22154+#define DEFINE_KEY_FIELD( L, U, T ) \
22155+static inline T get_key_ ## L ( const reiser4_key *key ) \
22156+{ \
22157+ assert( "nikita-750", key != NULL ); \
22158+ return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) & \
22159+ KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT; \
22160+} \
22161+ \
22162+static inline void set_key_ ## L ( reiser4_key *key, T loc ) \
22163+{ \
22164+ __u64 el; \
22165+ \
22166+ assert( "nikita-752", key != NULL ); \
22167+ \
22168+ el = get_key_el( key, KEY_ ## U ## _INDEX ); \
22169+ /* clear field bits in the key */ \
22170+ el &= ~KEY_ ## U ## _MASK; \
22171+ /* actually it should be \
22172+ \
22173+ el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \
22174+ \
22175+ but we trust user to never pass values that wouldn't fit \
22176+ into field. Clearing extra bits is one operation, but this \
22177+ function is time-critical. \
22178+ But check this in assertion. */ \
22179+ assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) & \
22180+ ~KEY_ ## U ## _MASK ) == 0 ); \
22181+ el |= ( loc << KEY_ ## U ## _SHIFT ); \
22182+ set_key_el( key, KEY_ ## U ## _INDEX, el ); \
22183+}
22184+
22185+typedef __u64 oid_t;
22186+
22187+/* define get_key_locality(), set_key_locality() */
22188+DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22189+/* define get_key_type(), set_key_type() */
22190+DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22191+/* define get_key_band(), set_key_band() */
22192+DEFINE_KEY_FIELD(band, BAND, __u64);
22193+/* define get_key_objectid(), set_key_objectid() */
22194+DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22195+/* define get_key_fulloid(), set_key_fulloid() */
22196+DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22197+/* define get_key_offset(), set_key_offset() */
22198+DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22199+#if (REISER4_LARGE_KEY)
22200+/* define get_key_ordering(), set_key_ordering() */
22201+DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22202+#else
22203+static inline __u64 get_key_ordering(const reiser4_key * key)
22204+{
22205+ return 0;
22206+}
22207+
22208+static inline void set_key_ordering(reiser4_key * key, __u64 val)
22209+{
22210+}
22211+#endif
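+
+#if 0
+/* Illustrative only: locality and type share element 0, split 60/4 by the
+   masks and shifts above. Values and assertion labels are made up. */
+static void key_layout_sketch(void)
+{
+	reiser4_key key;
+
+	reiser4_key_init(&key);
+	set_key_locality(&key, 42ull);	  /* upper 60 bits of element 0 */
+	set_key_type(&key, KEY_SD_MINOR); /* lower 4 bits of element 0 */
+	/* element 0 now holds (42 << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR,
+	   stored little-endian on disk */
+	assert("edit-0005", get_key_locality(&key) == 42ull);
+	assert("edit-0006", get_key_type(&key) == KEY_SD_MINOR);
+}
+#endif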
22212+
22213+/* key comparison result */
22214+typedef enum { LESS_THAN = -1, /* if first key is less than second */
22215+ EQUAL_TO = 0, /* if keys are equal */
22216+ GREATER_THAN = +1 /* if first key is greater than second */
22217+} cmp_t;
22218+
22219+void reiser4_key_init(reiser4_key * key);
22220+
22221+/* minimal possible key in the tree. Return pointer to the static storage. */
22222+extern const reiser4_key *reiser4_min_key(void);
22223+extern const reiser4_key *reiser4_max_key(void);
22224+
22225+/* helper macro for keycmp() */
22226+#define KEY_DIFF(k1, k2, field) \
22227+({ \
22228+ typeof (get_key_ ## field (k1)) f1; \
22229+ typeof (get_key_ ## field (k2)) f2; \
22230+ \
22231+ f1 = get_key_ ## field (k1); \
22232+ f2 = get_key_ ## field (k2); \
22233+ \
22234+ (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
22235+})
22236+
22237+/* helper macro for keycmp() */
22238+#define KEY_DIFF_EL(k1, k2, off) \
22239+({ \
22240+ __u64 e1; \
22241+ __u64 e2; \
22242+ \
22243+ e1 = get_key_el(k1, off); \
22244+ e2 = get_key_el(k2, off); \
22245+ \
22246+ (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
22247+})
22248+
22249+/* compare `k1' and `k2'. This function is a heart of "key allocation
22250+ policy". All you need to implement new policy is to add yet another
22251+ clause here. */
22252+static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22253+ const reiser4_key * k2 /* second key to compare */ )
22254+{
22255+ cmp_t result;
22256+
22257+ /*
22258+ * This function is the heart of reiser4 tree-routines. Key comparison
22259+ * is among most heavily used operations in the file system.
22260+ */
22261+
22262+ assert("nikita-439", k1 != NULL);
22263+ assert("nikita-440", k2 != NULL);
22264+
22265+ /* there is no actual branch here: condition is compile time constant
22266+ * and constant folding and propagation ensures that only one branch
22267+ * is actually compiled in. */
22268+
22269+ if (REISER4_PLANA_KEY_ALLOCATION) {
22270+ /* if physical order of fields in a key is identical
22271+ with logical order, we can implement key comparison
22272+ as three 64bit comparisons. */
22273+ /* logical order of fields in plan-a:
22274+ locality->type->objectid->offset. */
22275+ /* compare locality and type at once */
22276+ result = KEY_DIFF_EL(k1, k2, 0);
22277+ if (result == EQUAL_TO) {
22278+ /* compare objectid (and band if it's there) */
22279+ result = KEY_DIFF_EL(k1, k2, 1);
22280+ /* compare offset */
22281+ if (result == EQUAL_TO) {
22282+ result = KEY_DIFF_EL(k1, k2, 2);
22283+ if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22284+ result = KEY_DIFF_EL(k1, k2, 3);
22285+ }
22286+ }
22287+ }
22288+ } else if (REISER4_3_5_KEY_ALLOCATION) {
22289+ result = KEY_DIFF(k1, k2, locality);
22290+ if (result == EQUAL_TO) {
22291+ result = KEY_DIFF(k1, k2, objectid);
22292+ if (result == EQUAL_TO) {
22293+ result = KEY_DIFF(k1, k2, type);
22294+ if (result == EQUAL_TO)
22295+ result = KEY_DIFF(k1, k2, offset);
22296+ }
22297+ }
22298+ } else
22299+ impossible("nikita-441", "Unknown key allocation scheme!");
22300+ return result;
22301+}
22302+
22303+/* true if @k1 equals @k2 */
22304+static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
22305+ const reiser4_key * k2 /* second key to compare */ )
22306+{
22307+ assert("nikita-1879", k1 != NULL);
22308+ assert("nikita-1880", k2 != NULL);
22309+ return !memcmp(k1, k2, sizeof *k1);
22310+}
22311+
22312+/* true if @k1 is less than @k2 */
22313+static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
22314+ const reiser4_key * k2 /* second key to compare */ )
22315+{
22316+ assert("nikita-1952", k1 != NULL);
22317+ assert("nikita-1953", k2 != NULL);
22318+ return keycmp(k1, k2) == LESS_THAN;
22319+}
22320+
22321+/* true if @k1 is less than or equal to @k2 */
22322+static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
22323+ const reiser4_key * k2 /* second key to compare */ )
22324+{
22325+ assert("nikita-1954", k1 != NULL);
22326+ assert("nikita-1955", k2 != NULL);
22327+ return keycmp(k1, k2) != GREATER_THAN;
22328+}
22329+
22330+/* true if @k1 is greater than @k2 */
22331+static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
22332+ const reiser4_key * k2 /* second key to compare */ )
22333+{
22334+ assert("nikita-1959", k1 != NULL);
22335+ assert("nikita-1960", k2 != NULL);
22336+ return keycmp(k1, k2) == GREATER_THAN;
22337+}
22338+
22339+/* true if @k1 is greater than or equal to @k2 */
22340+static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
22341+ const reiser4_key * k2 /* second key to compare */ )
22342+{
22343+ assert("nikita-1956", k1 != NULL);
22344+ assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched
22345+ * November 3: Laika */
22346+ return keycmp(k1, k2) != LESS_THAN;
22347+}
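+
+#if 0
+/* Illustrative only: under plan-a allocation keys order elementwise, so
+   keys differing only in objectid order by objectid. Labels made up. */
+static void keycmp_sketch(void)
+{
+	reiser4_key a, b;
+
+	reiser4_key_init(&a);
+	set_key_locality(&a, 7ull);
+	set_key_type(&a, KEY_SD_MINOR);
+	set_key_objectid(&a, 100ull);
+	b = a;
+	set_key_objectid(&b, 200ull);
+	assert("edit-0007", keycmp(&a, &b) == LESS_THAN);
+	assert("edit-0008", keylt(&a, &b) && keygt(&b, &a));
+}
+#endif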
22348+
22349+static inline void prefetchkey(reiser4_key * key)
22350+{
22351+ prefetch(key);
22352+ prefetch(&key->el[KEY_CACHELINE_END]);
22353+}
22354+
22355+/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22356+ 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22357+/* size of a buffer suitable to hold human readable key representation */
22358+#define KEY_BUF_LEN (80)
22359+
22360+#if REISER4_DEBUG
22361+extern void reiser4_print_key(const char *prefix, const reiser4_key * key);
22362+#else
22363+#define reiser4_print_key(p,k) noop
22364+#endif
22365+
22366+/* __FS_REISERFS_KEY_H__ */
22367+#endif
22368+
22369+/* Make Linus happy.
22370+ Local variables:
22371+ c-indentation-style: "K&R"
22372+ mode-name: "LC"
22373+ c-basic-offset: 8
22374+ tab-width: 8
22375+ fill-column: 120
22376+ End:
22377+*/
22378diff -urN linux-2.6.24.orig/fs/reiser4/ktxnmgrd.c linux-2.6.24/fs/reiser4/ktxnmgrd.c
22379--- linux-2.6.24.orig/fs/reiser4/ktxnmgrd.c 1970-01-01 03:00:00.000000000 +0300
22380+++ linux-2.6.24/fs/reiser4/ktxnmgrd.c 2008-01-25 11:39:06.944209750 +0300
22381@@ -0,0 +1,214 @@
22382+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22383+/* Transaction manager daemon. */
22384+
22385+/*
22386+ * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22387+ * needed/important for the following reasons:
22388+ *
22389+ * 1. in reiser4 atom is not committed immediately when last transaction
22390+ * handle closes, unless atom is either too old or too large (see
22391+ * atom_should_commit()). This is done to avoid committing too frequently.
22392+ * because:
22393+ *
22394+ * 2. sometimes we don't want to commit atom when closing last transaction
22395+ * handle even if it is old and fat enough. For example, because we are at
22396+ * this point under directory semaphore, and committing would stall all
22397+ * accesses to this directory.
22398+ *
22399+ * ktxnmgrd bides its time sleeping on a wait queue. When it awakes,
22400+ * either due to a (tunable) timeout or because it was explicitly woken up
22401+ * by a call to ktxnmgrd_kick(), it scans the list of all atoms and commits
22402+ * the eligible ones.
22403+ *
22404+ */
22405+
22406+#include "debug.h"
22407+#include "txnmgr.h"
22408+#include "tree.h"
22409+#include "ktxnmgrd.h"
22410+#include "super.h"
22411+#include "reiser4.h"
22412+
22413+#include <linux/sched.h> /* for struct task_struct */
22414+#include <linux/wait.h>
22415+#include <linux/suspend.h>
22416+#include <linux/kernel.h>
22417+#include <linux/writeback.h>
22418+#include <linux/kthread.h>
22419+#include <linux/freezer.h>
22420+
22421+static int scan_mgr(struct super_block *);
22422+
22423+/*
22424+ * change current->comm so that ps, top, and friends will see changed
22425+ * state. This serves no useful purpose whatsoever, but also costs nothing.
22426+ * Maybe it will make a lonely system administrator feel less alone at 3 A.M.
22427+ */
22428+#define set_comm( state ) \
22429+ snprintf( current -> comm, sizeof( current -> comm ), \
22430+ "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
22431+
22432+/**
22433+ * ktxnmgrd - kernel txnmgr daemon
22434+ * @arg: pointer to super block
22435+ *
22436+ * The background transaction manager daemon, started as a kernel thread during
22437+ * reiser4 initialization.
22438+ */
22439+static int ktxnmgrd(void *arg)
22440+{
22441+ struct super_block *super;
22442+ ktxnmgrd_context *ctx;
22443+ txn_mgr *mgr;
22444+ int done = 0;
22445+
22446+ super = arg;
22447+ mgr = &get_super_private(super)->tmgr;
22448+
22449+ /*
22450+ * do_fork() just copies task_struct into the new thread. ->fs_context
22451+ * shouldn't be copied of course. This shouldn't be a problem for the
22452+ * rest of the code though.
22453+ */
22454+ current->journal_info = NULL;
22455+ ctx = mgr->daemon;
22456+ while (1) {
22457+ try_to_freeze();
22458+ set_comm("wait");
22459+ {
22460+ DEFINE_WAIT(__wait);
22461+
22462+ prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE);
22463+ if (kthread_should_stop()) {
22464+ done = 1;
22465+ } else
22466+ schedule_timeout(ctx->timeout);
22467+ finish_wait(&ctx->wait, &__wait);
22468+ }
22469+ if (done)
22470+ break;
22471+ set_comm("run");
22472+ spin_lock(&ctx->guard);
22473+ /*
22474+ * wait timed out or ktxnmgrd was woken up by explicit request
22475+ * to commit something. Scan list of atoms in txnmgr and look
22476+ * for too old atoms.
22477+ */
22478+ do {
22479+ ctx->rescan = 0;
+ /* release the guard while scanning: scan_mgr() enters reiser4
+ context and may sleep; ->rescan flags concurrent list changes */
+ spin_unlock(&ctx->guard);
22480+ scan_mgr(super);
22481+ spin_lock(&ctx->guard);
22482+ if (ctx->rescan) {
22483+ /*
22484+ * the list could be modified while ctx
22485+ * spinlock was released, we have to repeat
22486+ * scanning from the beginning
22487+ */
22488+ continue;
22489+ }
22490+ } while (ctx->rescan);
22491+ spin_unlock(&ctx->guard);
22492+ }
22493+ return 0;
22494+}
22495+
22496+#undef set_comm
22497+
22498+/**
22499+ * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22500+ * @super: pointer to super block
22501+ *
22502+ * Allocates and initializes ktxnmgrd_context, attaches it to transaction
22503+ * manager. Starts kernel txnmgr daemon. This is called on mount.
22504+ */
22505+int reiser4_init_ktxnmgrd(struct super_block *super)
22506+{
22507+ txn_mgr *mgr;
22508+ ktxnmgrd_context *ctx;
22509+
22510+ mgr = &get_super_private(super)->tmgr;
22511+
22512+ assert("zam-1014", mgr->daemon == NULL);
22513+
22514+ ctx = kzalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get());
22515+ if (!ctx)
22516+ return RETERR(-ENOMEM);
22517+
22518+ assert("nikita-2442", ctx != NULL);
22519+
22520+ init_waitqueue_head(&ctx->wait);
22521+
22522+ /*kcond_init(&ctx->startup);*/
22523+ spin_lock_init(&ctx->guard);
22524+ ctx->timeout = REISER4_TXNMGR_TIMEOUT;
22525+ ctx->rescan = 1;
22526+ mgr->daemon = ctx;
22527+
22528+ ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22529+ if (IS_ERR(ctx->tsk)) {
22530+ int ret = PTR_ERR(ctx->tsk);
22531+ mgr->daemon = NULL;
22532+ kfree(ctx);
22533+ return RETERR(ret);
22534+ }
22535+ return 0;
22536+}
22537+
22538+void ktxnmgrd_kick(txn_mgr *mgr)
22539+{
22540+ assert("nikita-3234", mgr != NULL);
22541+ assert("nikita-3235", mgr->daemon != NULL);
22542+ wake_up(&mgr->daemon->wait);
22543+}
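+
+/* Typical use (illustrative): code that has just made an atom eligible for
+ * commit can nudge the daemon instead of waiting for the timeout:
+ *
+ *	ktxnmgrd_kick(&get_super_private(super)->tmgr);
+ */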
22544+
22545+int is_current_ktxnmgrd(void)
22546+{
22547+ return (get_current_super_private()->tmgr.daemon->tsk == current);
22548+}
22549+
22550+/**
22551+ * scan_mgr - commit atoms which are to be committed
22552+ * @super: super block to commit atoms of
22553+ *
22554+ * Commits old atoms.
22555+ */
22556+static int scan_mgr(struct super_block *super)
22557+{
22558+ int ret;
22559+ reiser4_context ctx;
22560+
22561+ init_stack_context(&ctx, super);
22562+
22563+ ret = commit_some_atoms(&get_super_private(super)->tmgr);
22564+
22565+ reiser4_exit_context(&ctx);
22566+ return ret;
22567+}
22568+
22569+/**
22570+ * reiser4_done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
22571+ * @super: super block of the file system being unmounted
22572+ *
22573+ * This is called on umount. Stops ktxnmgrd and frees its context.
22574+ */
22575+void reiser4_done_ktxnmgrd(struct super_block *super)
22576+{
22577+ txn_mgr *mgr;
22578+
22579+ mgr = &get_super_private(super)->tmgr;
22580+ assert("zam-1012", mgr->daemon != NULL);
22581+
22582+ kthread_stop(mgr->daemon->tsk);
22583+ kfree(mgr->daemon);
22584+ mgr->daemon = NULL;
22585+}
22586+
22587+/*
22588+ * Local variables:
22589+ * c-indentation-style: "K&R"
22590+ * mode-name: "LC"
22591+ * c-basic-offset: 8
22592+ * tab-width: 8
22593+ * fill-column: 120
22594+ * End:
22595+ */
22596diff -urN linux-2.6.24.orig/fs/reiser4/ktxnmgrd.h linux-2.6.24/fs/reiser4/ktxnmgrd.h
22597--- linux-2.6.24.orig/fs/reiser4/ktxnmgrd.h 1970-01-01 03:00:00.000000000 +0300
22598+++ linux-2.6.24/fs/reiser4/ktxnmgrd.h 2008-01-25 11:39:06.944209750 +0300
22599@@ -0,0 +1,52 @@
22600+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22601+ * reiser4/README */
22602+
22603+/* Transaction manager daemon. See ktxnmgrd.c for comments. */
22604+
22605+#ifndef __KTXNMGRD_H__
22606+#define __KTXNMGRD_H__
22607+
22608+#include "txnmgr.h"
22609+
22610+#include <linux/fs.h>
22611+#include <linux/wait.h>
22612+#include <linux/completion.h>
22613+#include <linux/spinlock.h>
22614+#include <asm/atomic.h>
22615+#include <linux/sched.h> /* for struct task_struct */
22616+
22617+/* in this structure all data necessary to start up, shut down and communicate
22618+ * with ktxnmgrd are kept. */
22619+struct ktxnmgrd_context {
22620+ /* wait queue head on which ktxnmgrd sleeps */
22621+ wait_queue_head_t wait;
22622+ /* spin lock protecting all fields of this structure */
22623+ spinlock_t guard;
22624+ /* timeout of sleeping on ->wait */
22625+ signed long timeout;
22626+ /* kernel thread running ktxnmgrd */
22627+ struct task_struct *tsk;
22628+ /* list of all file systems served by this ktxnmgrd */
22629+ struct list_head queue;
22630+ /* should ktxnmgrd repeat scanning of atoms? */
22631+ unsigned int rescan:1;
22632+};
22633+
22634+extern int reiser4_init_ktxnmgrd(struct super_block *);
22635+extern void reiser4_done_ktxnmgrd(struct super_block *);
22636+
22637+extern void ktxnmgrd_kick(txn_mgr * mgr);
22638+extern int is_current_ktxnmgrd(void);
22639+
22640+/* __KTXNMGRD_H__ */
22641+#endif
22642+
22643+/* Make Linus happy.
22644+ Local variables:
22645+ c-indentation-style: "K&R"
22646+ mode-name: "LC"
22647+ c-basic-offset: 8
22648+ tab-width: 8
22649+ fill-column: 120
22650+ End:
22651+*/
22652diff -urN linux-2.6.24.orig/fs/reiser4/lock.c linux-2.6.24/fs/reiser4/lock.c
22653--- linux-2.6.24.orig/fs/reiser4/lock.c 1970-01-01 03:00:00.000000000 +0300
22654+++ linux-2.6.24/fs/reiser4/lock.c 2008-01-25 11:39:06.948210780 +0300
22655@@ -0,0 +1,1232 @@
22656+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22657+ * reiser4/README */
22658+
22659+/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
22660+ order. V4 balances the tree from the bottom up, and searches the tree from
22661+ the top down, and that is really the way we want it, so tradition won't work
22662+ for us.
22663+
22664+ Instead we have two lock orderings, a high priority lock ordering, and a low
22665+ priority lock ordering. Each node in the tree has a lock in its znode.
22666+
22667+ Suppose we have a set of processes which lock (R/W) tree nodes. Each process
22668+ has a set (maybe empty) of already locked nodes ("process locked set"). Each
22669+ process may have a pending lock request to a node locked by another process.
22670+ Note: we lock and unlock, but do not transfer locks: it is possible
22671+ transferring locks instead would save some bus locking....
22672+
22673+ Deadlock occurs when we have a loop constructed from process locked sets and
22674+ lock request vectors.
22675+
22676+ NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
22677+ memory is extended with "znodes" with which we connect nodes with their left
22678+ and right neighbors using sibling pointers stored in the znodes. When we
22679+ perform balancing operations we often go from left to right and from right to
22680+ left.
22681+
22682+ +-P1-+ +-P3-+
22683+ |+--+| V1 |+--+|
22684+ ||N1|| -------> ||N3||
22685+ |+--+| |+--+|
22686+ +----+ +----+
22687+ ^ |
22688+ |V2 |V3
22689+ | v
22690+ +---------P2---------+
22691+ |+--+ +--+|
22692+ ||N2| -------- |N4||
22693+ |+--+ +--+|
22694+ +--------------------+
22695+
22696+ We solve this by ensuring that only low priority processes lock in top to
22697+ bottom order and from right to left, and high priority processes lock from
22698+ bottom to top and left to right.
22699+
22700+ ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
22701+ kill those damn busy loops.
22702+ ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
22703+ stage) cannot be ordered that way. There are no rules about which nodes can
22704+ belong to the atom and which cannot. We cannot define what is right or left
22705+ direction, what is top or bottom. We can take immediate parent or side
22706+ neighbor of one node, but nobody guarantees that, say, left neighbor node is
22707+ not a far right neighbor for other nodes from the same atom. It breaks
22708+ deadlock avoidance rules and hi-low priority locking cannot be applied for
22709+ atom locks.
22710+
22711+ How does it help to avoid deadlocks?
22712+
22713+ Suppose we have a deadlock with n processes. Processes from one priority
22714+ class never deadlock because they take locks in one consistent
22715+ order.
22716+
22717+ So, any possible deadlock loop must have low priority as well as high
22718+ priority processes. There are no other lock priority levels except low and
22719+ high. We know that any deadlock loop contains at least one node locked by a
22720+ low priority process and requested by a high priority process. If this
22721+ situation is caught and resolved it is sufficient to avoid deadlocks.
22722+
22723+ V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
22724+
22725+ The deadlock prevention algorithm is based on comparing
22726+ priorities of node owners (processes which keep znode locked) and
22727+ requesters (processes which want to acquire a lock on znode). We
22728+ implement a scheme where low-priority owners yield locks to
22729+ high-priority requesters. We created a signal passing system that
22730+ is used to ask low-priority processes to yield one or more locked
22731+ znodes.
22732+
22733+ The condition when a znode needs to change its owners is described by the
22734+ following formula:
22735+
22736+ #############################################
22737+ # #
22738+ # (number of high-priority requesters) > 0 #
22739+ # AND #
22740+ # (numbers of high-priority owners) == 0 #
22741+ # #
22742+ #############################################
22743+
22744+ Note that a low-priority process delays node releasing if another
22745+ high-priority process owns this node. So, slightly more strictly speaking,
22746+ to have a deadlock capable cycle you must have a loop in which a high
22747+ priority process is waiting on a low priority process to yield a node, which
22748+ is slightly different from saying a high priority process is waiting on a
22749+ node owned by a low priority process.
22750+
22751+ It is enough to avoid deadlocks if we prevent any low-priority process from
22752+ falling asleep if its locked set contains a node which satisfies the
22753+ deadlock condition.
22754+
22755+ That condition is implicitly or explicitly checked in all places where new
22756+ high-priority requests may be added or removed from node request queue or
22757+ high-priority process takes or releases a lock on node. The main
22758+ goal of these checks is to never miss the moment when a node gets "wrong
22759+ owners", and to send "must-yield-this-lock" signals to its low-pri owners
22760+ at that time.
22761+
22762+ The information about received signals is stored in the per-process
22763+ structure (lock stack) and analyzed before a low-priority process goes to
22764+ sleep but after a "fast" attempt to lock a node fails. Any signal wakes
22765+ the sleeping process up and forces it to re-check lock status and received
22766+ signal info. If "must-yield-this-lock" signals were received the locking
22767+ primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
22768+
22769+ V4 LOCKING DRAWBACKS
22770+
22771+ If we have already balanced on one level, and we are propagating our changes
22772+ upward to a higher level, it could be very messy to surrender all locks on
22773+ the lower level because we put so much computational work into it, and
22774+ reverting them to their state before they were locked might be very complex.
22775+ We also don't want to acquire all locks before performing balancing because
22776+ that would either be almost as much work as the balancing, or it would be
22777+ too conservative and lock too much. We want balancing to be done only at
22778+ high priority. Yet, we might want to go to the left one node and use some
22779+ of its empty space... So we make one attempt at getting the node to the left
22780+ using try_lock, and if it fails we do without it, because we didn't really
22781+ need it, it was only a nice to have.
22782+
22783+ LOCK STRUCTURES DESCRIPTION
22784+
22785+ The following data structures are used in the reiser4 locking
22786+ implementation:
22787+
22788+ All fields related to long-term locking are stored in znode->lock.
22789+
22790+ The lock stack is a per thread object. It owns all znodes locked by the
22791+ thread. One znode may be locked by several threads in case of read lock or
22792+ one znode may be write locked by one thread several times. The special link
22793+ objects (lock handles) support n<->m relation between znodes and lock
22794+ owners.
22795+
22796+ <Thread 1> <Thread 2>
22797+
22798+ +---------+ +---------+
22799+ | LS1 | | LS2 |
22800+ +---------+ +---------+
22801+ ^ ^
22802+ |---------------+ +----------+
22803+ v v v v
22804+ +---------+ +---------+ +---------+ +---------+
22805+ | LH1 | | LH2 | | LH3 | | LH4 |
22806+ +---------+ +---------+ +---------+ +---------+
22807+ ^ ^ ^ ^
22808+ | +------------+ |
22809+ v v v
22810+ +---------+ +---------+ +---------+
22811+ | Z1 | | Z2 | | Z3 |
22812+ +---------+ +---------+ +---------+
22813+
22814+ Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
22815+ picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
22816+ LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode
22817+ Z1 is locked by only one thread, so the znode has only one lock handle LH1
22818+ on its list; the situation is similar for Z3, which is locked by thread 2 only. Z2
22819+ is locked (for read) twice by different threads and two lock handles are on
22820+ its list. Each lock handle represents a single locking relation between a
22821+ thread and a znode. Locking a znode establishes such a relation between the
22822+ lock stack and the znode by adding a new lock handle to both lists. The
22823+ lock stack links all lock handles for all znodes locked by that lock
22824+ stack. The znode list groups all lock handles for all lock stacks which
22825+ locked the znode.
22826+
22827+ Yet another relation may exist between znode and lock owners. If lock
22828+ procedure cannot immediately take lock on an object it adds the lock owner
22829+ on a special `requestors' list that belongs to the znode. That list
22830+ represents a queue of pending lock requests. Because one lock owner may
22831+ request only one lock object at a time, it is a 1->n relation between
22832+ lock objects and lock owners, implemented as described above. Full information
22833+ (priority, pointers to lock and link objects) about each lock request is
22834+ stored in lock owner structure in `request' field.
22835+
22836+ SHORT_TERM LOCKING
22837+
22838+ This is a list of primitive operations over lock stacks / lock handles /
22839+ znodes and locking descriptions for them.
22840+
22841+ 1. locking / unlocking, which is done by two list insertions/deletions: one
22842+ to/from the znode's list of lock handles, the other to/from the lock stack's
22843+ list of lock handles. The first insertion is protected by
22844+ znode->lock.guard spinlock. The list owned by the lock stack can be
22845+ modified only by thread who owns the lock stack and nobody else can
22846+ modify/read it. There is nothing to be protected by a spinlock or
22847+ something else.
22848+
22849+ 2. adding/removing a lock request to/from znode requesters list. The rule is
22850+ that znode->lock.guard spinlock should be taken for this.
22851+
22852+ 3. we can traverse the list of lock handles and use references to lock stacks
22853+ that locked a given znode if the znode->lock.guard spinlock is taken.
22854+
22855+ 4. If a lock stack is associated with a znode as a lock requestor or lock
22856+ owner, its existence is guaranteed by the znode->lock.guard spinlock. Some
22857+ of its (the lock stack's) fields should be protected from being accessed in
22858+ parallel by two or more threads. Please look at the lock_stack structure
22859+ definition for info on how those fields are protected. */
22860+
22861+/* Znode lock and capturing intertwining. */
22862+/* In current implementation we capture formatted nodes before locking
22863+ them. Take a look at longterm_lock_znode(): the reiser4_try_capture() request
22864+ precedes locking requests. The longterm_lock_znode function unconditionally
22865+ captures znode before even checking of locking conditions.
22866+
22867+ Another variant is to capture znode after locking it. It was not tested, but
22868+ at least one deadlock condition is supposed to be there. One thread has
22869+ locked a znode (Node-1) and calls reiser4_try_capture() for it.
22870+ reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state.
22871+ Second thread is a flushing thread, its current atom is the atom Node-1
22872+ belongs to. Second thread wants to lock Node-1 and sleeps because Node-1
22873+ is locked by the first thread. The described situation is a deadlock. */
22874+
22875+#include "debug.h"
22876+#include "txnmgr.h"
22877+#include "znode.h"
22878+#include "jnode.h"
22879+#include "tree.h"
22880+#include "plugin/node/node.h"
22881+#include "super.h"
22882+
22883+#include <linux/spinlock.h>
22884+
22885+#if REISER4_DEBUG
22886+static int request_is_deadlock_safe(znode *, znode_lock_mode,
22887+ znode_lock_request);
22888+#endif
22889+
22890+/* Returns a lock owner associated with current thread */
22891+lock_stack *get_current_lock_stack(void)
22892+{
22893+ return &get_current_context()->stack;
22894+}
22895+
22896+/* Wakes up all low priority owners informing them about possible deadlock */
22897+static void wake_up_all_lopri_owners(znode * node)
22898+{
22899+ lock_handle *handle;
22900+
22901+ assert_spin_locked(&(node->lock.guard));
22902+ list_for_each_entry(handle, &node->lock.owners, owners_link) {
22903+ assert("nikita-1832", handle->node == node);
22904+ /* count this signal in owner->nr_signaled */
22905+ if (!handle->signaled) {
22906+ handle->signaled = 1;
22907+ atomic_inc(&handle->owner->nr_signaled);
22908+ /* Wake up a single process */
22909+ reiser4_wake_up(handle->owner);
22910+ }
22911+ }
22912+}
22913+
22914+/* Adds a lock to a lock owner, which means creating a link to the lock and
22915+ putting the link into the two lists all links are on (the doubly linked list
22916+ that forms the lock_stack, and the doubly linked list of links attached
22917+ to a lock.
22918+*/
22919+static inline void
22920+link_object(lock_handle * handle, lock_stack * owner, znode * node)
22921+{
22922+ assert("jmacd-810", handle->owner == NULL);
22923+ assert_spin_locked(&(node->lock.guard));
22924+
22925+ handle->owner = owner;
22926+ handle->node = node;
22927+
22928+ assert("reiser4-4",
22929+ ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
22930+
22931+ /* add lock handle to the end of lock_stack's list of locks */
22932+ list_add_tail(&handle->locks_link, &owner->locks);
22933+ ON_DEBUG(owner->nr_locks++);
22934+ reiser4_ctx_gfp_mask_set();
22935+
22936+ /* add lock handle to the head of znode's list of owners */
22937+ list_add(&handle->owners_link, &node->lock.owners);
22938+ handle->signaled = 0;
22939+}
22940+
22941+/* Breaks a relation between a lock and its owner */
22942+static inline void unlink_object(lock_handle * handle)
22943+{
22944+ assert("zam-354", handle->owner != NULL);
22945+ assert("nikita-1608", handle->node != NULL);
22946+ assert_spin_locked(&(handle->node->lock.guard));
22947+ assert("nikita-1829", handle->owner == get_current_lock_stack());
22948+ assert("reiser4-5", handle->owner->nr_locks > 0);
22949+
22950+ /* remove lock handle from lock_stack's list of locks */
22951+ list_del(&handle->locks_link);
22952+ ON_DEBUG(handle->owner->nr_locks--);
22953+ reiser4_ctx_gfp_mask_set();
22954+ assert("reiser4-6",
22955+ ergo(list_empty_careful(&handle->owner->locks),
22956+ handle->owner->nr_locks == 0));
22957+ /* remove lock handle from znode's list of owners */
22958+ list_del(&handle->owners_link);
22959+ /* indicates that lock handle is free now */
22960+ handle->node = NULL;
22961+#if REISER4_DEBUG
22962+ INIT_LIST_HEAD(&handle->locks_link);
22963+ INIT_LIST_HEAD(&handle->owners_link);
22964+ handle->owner = NULL;
22965+#endif
22966+}
22967+
22968+/* Actually locks an object knowing that we are able to do this */
22969+static void lock_object(lock_stack * owner)
22970+{
22971+ struct lock_request *request;
22972+ znode *node;
22973+
22974+ request = &owner->request;
22975+ node = request->node;
22976+ assert_spin_locked(&(node->lock.guard));
22977+ if (request->mode == ZNODE_READ_LOCK) {
22978+ node->lock.nr_readers++;
22979+ } else {
22980+ /* check that we didn't switch from read to write lock */
22981+ assert("nikita-1840", node->lock.nr_readers <= 0);
22982+ /* We allow recursive locking; a node can be locked several
22983+ times for write by the same process */
22984+ node->lock.nr_readers--;
22985+ }
22986+
22987+ link_object(request->handle, owner, node);
22988+
22989+ if (owner->curpri) {
22990+ node->lock.nr_hipri_owners++;
22991+ }
22992+}
22993+
22994+/* Check for recursive write locking */
22995+static int recursive(lock_stack * owner)
22996+{
22997+ int ret;
22998+ znode *node;
22999+ lock_handle *lh;
23000+
23001+ node = owner->request.node;
23002+
23003+ /* Owners list is not empty for a locked node */
23004+ assert("zam-314", !list_empty_careful(&node->lock.owners));
23005+ assert("nikita-1841", owner == get_current_lock_stack());
23006+ assert_spin_locked(&(node->lock.guard));
23007+
23008+ lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
23009+ ret = (lh->owner == owner);
23010+
23011+ /* Recursive read locking should be done the usual way */
23012+ assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
23013+ /* mixing of read/write locks is not allowed */
23014+ assert("zam-341", !ret || znode_is_wlocked(node));
23015+
23016+ return ret;
23017+}
23018+
23019+#if REISER4_DEBUG
23020+/* Returns true if the lock is held by the calling thread. */
23021+int znode_is_any_locked(const znode * node)
23022+{
23023+ lock_handle *handle;
23024+ lock_stack *stack;
23025+ int ret;
23026+
23027+ if (!znode_is_locked(node)) {
23028+ return 0;
23029+ }
23030+
23031+ stack = get_current_lock_stack();
23032+
23033+ spin_lock_stack(stack);
23034+
23035+ ret = 0;
23036+
23037+ list_for_each_entry(handle, &stack->locks, locks_link) {
23038+ if (handle->node == node) {
23039+ ret = 1;
23040+ break;
23041+ }
23042+ }
23043+
23044+ spin_unlock_stack(stack);
23045+
23046+ return ret;
23047+}
23048+
23049+#endif
23050+
23051+/* Returns true if a write lock is held by the calling thread. */
23052+int znode_is_write_locked(const znode * node)
23053+{
23054+ lock_stack *stack;
23055+ lock_handle *handle;
23056+
23057+ assert("jmacd-8765", node != NULL);
23058+
23059+ if (!znode_is_wlocked(node)) {
23060+ return 0;
23061+ }
23062+
23063+ stack = get_current_lock_stack();
23064+
23065+ /*
23066+ * When znode is write locked, all owner handles point to the same lock
23067+ * stack. Get pointer to lock stack from the first lock handle from
23068+ * znode's owner list
23069+ */
23070+ handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
23071+
23072+ return (handle->owner == stack);
23073+}
23074+
23075+/* This "deadlock" condition is the essential part of reiser4 locking
23076+ implementation. This condition is checked explicitly by calling
23077+ check_deadlock_condition() or implicitly in all places where znode lock
23078+ state (set of owners and request queue) is changed. Locking code is
23079+ designed to use this condition to trigger the procedure of passing an
23080+ object from low priority owner(s) to high priority one(s).
23081+
23082+ The procedure consists of passing an event (setting the
23083+ lock_handle->signaled flag), counting this event in the nr_signaled field
23084+ of the owner's lock stack object, and waking up the owner's process.
23085+*/
23086+static inline int check_deadlock_condition(znode * node)
23087+{
23088+ assert_spin_locked(&(node->lock.guard));
23089+ return node->lock.nr_hipri_requests > 0
23090+ && node->lock.nr_hipri_owners == 0;
23091+}
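/* Editor's note, an illustrative walk-through of the condition above:
   suppose thread L holds a read lock at low priority, so
   nr_hipri_owners == 0. When thread H posts a high priority request,
   nr_hipri_requests becomes 1 and check_deadlock_condition() turns true;
   L's lock handle is then signaled (see set_low_priority() and
   wake_up_all_lopri_owners()), and L is expected to back out with
   -E_DEADLOCK from reiser4_prepare_to_sleep(). */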
23092+
23093+static int check_livelock_condition(znode * node, znode_lock_mode mode)
23094+{
23095+ zlock * lock = &node->lock;
23096+
23097+ return mode == ZNODE_READ_LOCK &&
23098+ lock -> nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
23099+}
23100+
23101+/* checks lock/request compatibility */
23102+static int can_lock_object(lock_stack * owner)
23103+{
23104+ znode *node = owner->request.node;
23105+
23106+ assert_spin_locked(&(node->lock.guard));
23107+
23108+ /* See if the node is disconnected. */
23109+ if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
23110+ return RETERR(-EINVAL);
23111+
23112+ /* Do not ever try to take a lock if we are going in the low priority
23113+ direction and the node has a high priority request without high
23114+ priority owners. */
23115+ if (unlikely(!owner->curpri && check_deadlock_condition(node)))
23116+ return RETERR(-E_REPEAT);
23117+ if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode)))
23118+ return RETERR(-E_REPEAT);
23119+ if (unlikely(!is_lock_compatible(node, owner->request.mode)))
23120+ return RETERR(-E_REPEAT);
23121+ return 0;
23122+}
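/* Editor's sketch of the calling convention (illustrative, not part of
   the original source); the zlock guard must be held, as asserted above:

	spin_lock_zlock(&node->lock);
	ret = can_lock_object(owner);
	if (ret == 0)
		lock_object(owner);     - grant the lock right away
	else if (ret == -E_REPEAT)
		enqueue and sleep       - see longterm_lock_znode()
	else
		give up                 - -EINVAL: the node is dying
	spin_unlock_zlock(&node->lock);
*/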
23123+
23124+/* Sets a high priority for the process. It clears "signaled" flags
23125+ because a znode locked by a high-priority process can't satisfy our
23126+ "deadlock condition". */
23127+static void set_high_priority(lock_stack * owner)
23128+{
23129+ assert("nikita-1846", owner == get_current_lock_stack());
23130+ /* Do nothing if current priority is already high */
23131+ if (!owner->curpri) {
23132+ /* We don't need locking for owner->locks list, because, this
23133+ * function is only called with the lock stack of the current
23134+ * thread, and no other thread can play with owner->locks list
23135+ * and/or change ->node pointers of lock handles in this list.
23136+ *
23137+ * (Interrupts also are not involved.)
23138+ */
23139+ lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link);
23140+ while (&owner->locks != &item->locks_link) {
23141+ znode *node = item->node;
23142+
23143+ spin_lock_zlock(&node->lock);
23144+
23145+ node->lock.nr_hipri_owners++;
23146+
23147+ /* we can safely set signaled to zero, because the
23148+ previous statement (nr_hipri_owners++) guarantees
23149+ that signaled will never be set again. */
23150+ item->signaled = 0;
23151+ spin_unlock_zlock(&node->lock);
23152+
23153+ item = list_entry(item->locks_link.next, lock_handle, locks_link);
23154+ }
23155+ owner->curpri = 1;
23156+ atomic_set(&owner->nr_signaled, 0);
23157+ }
23158+}
23159+
23160+/* Sets a low priority to the process. */
23161+static void set_low_priority(lock_stack * owner)
23162+{
23163+ assert("nikita-3075", owner == get_current_lock_stack());
23164+ /* Do nothing if current priority is already low */
23165+ if (owner->curpri) {
23166+ /* scan all locks (lock handles) held by @owner, which is
23167+ actually the current thread, and check whether we create a
23168+ deadlock possibility anywhere.
23169+ */
23170+ lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link);
23171+ while (&owner->locks != &handle->locks_link) {
23172+ znode *node = handle->node;
23173+ spin_lock_zlock(&node->lock);
23174+ /* this thread was just a hipri owner of @node, so
23175+ nr_hipri_owners has to be greater than zero. */
23176+ assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23177+ node->lock.nr_hipri_owners--;
23178+ /* If we have a deadlock condition, adjust the nr_signaled
23179+ field. It is enough to set the "signaled" flag only for the
23180+ current process; other low-pri owners will be
23181+ signaled and woken up after the current process unlocks
23182+ this object and a high-priority requestor takes
23183+ control. */
23184+ if (check_deadlock_condition(node)
23185+ && !handle->signaled) {
23186+ handle->signaled = 1;
23187+ atomic_inc(&owner->nr_signaled);
23188+ }
23189+ spin_unlock_zlock(&node->lock);
23190+ handle = list_entry(handle->locks_link.next, lock_handle, locks_link);
23191+ }
23192+ owner->curpri = 0;
23193+ }
23194+}
23195+
23196+static void remove_lock_request(lock_stack * requestor)
23197+{
23198+ zlock * lock = &requestor->request.node->lock;
23199+
23200+ if (requestor->curpri) {
23201+ assert("nikita-1838", lock->nr_hipri_requests > 0);
23202+ lock->nr_hipri_requests--;
23203+ if (requestor->request.mode == ZNODE_WRITE_LOCK)
23204+ lock->nr_hipri_write_requests --;
23205+ }
23206+ list_del(&requestor->requestors_link);
23207+}
23208+
23209+static void invalidate_all_lock_requests(znode * node)
23210+{
23211+ lock_stack *requestor, *tmp;
23212+
23213+ assert_spin_locked(&(node->lock.guard));
23214+
23215+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23216+ remove_lock_request(requestor);
23217+ requestor->request.ret_code = -EINVAL;
23218+ reiser4_wake_up(requestor);
23219+ requestor->request.mode = ZNODE_NO_LOCK;
23220+ }
23221+}
23222+
23223+static void dispatch_lock_requests(znode * node)
23224+{
23225+ lock_stack *requestor, *tmp;
23226+
23227+ assert_spin_locked(&(node->lock.guard));
23228+
23229+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23230+ if (znode_is_write_locked(node))
23231+ break;
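 /* can_lock_object() returns 0 when the request can be granted
    immediately */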
23232+ if (!can_lock_object(requestor)) {
23233+ lock_object(requestor);
23234+ remove_lock_request(requestor);
23235+ requestor->request.ret_code = 0;
23236+ reiser4_wake_up(requestor);
23237+ requestor->request.mode = ZNODE_NO_LOCK;
23238+ }
23239+ }
23240+}
23241+
23242+/* release long-term lock, acquired by longterm_lock_znode() */
23243+void longterm_unlock_znode(lock_handle * handle)
23244+{
23245+ znode *node = handle->node;
23246+ lock_stack *oldowner = handle->owner;
23247+ int hipri;
23248+ int readers;
23249+ int rdelta;
23250+ int youdie;
23251+
23252+ /*
23253+ * this is time-critical and highly optimized code. Modify carefully.
23254+ */
23255+
23256+ assert("jmacd-1021", handle != NULL);
23257+ assert("jmacd-1022", handle->owner != NULL);
23258+ assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23259+
23260+ assert("zam-130", oldowner == get_current_lock_stack());
23261+
23262+ LOCK_CNT_DEC(long_term_locked_znode);
23263+
23264+ /*
23265+ * to minimize amount of operations performed under lock, pre-compute
23266+ * all variables used within critical section. This makes code
23267+ * obscure.
23268+ */
23269+
23270+ /* was this lock of hi or lo priority */
23271+ hipri = oldowner->curpri ? 1 : 0;
23272+ /* number of readers */
23273+ readers = node->lock.nr_readers;
23274+ /* +1 if write lock, -1 if read lock */
23275+ rdelta = (readers > 0) ? -1 : +1;
23276+ /* true if node is to die and write lock is released */
23277+ youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23278+
23279+ spin_lock_zlock(&node->lock);
23280+
23281+ assert("zam-101", znode_is_locked(node));
23282+
23283+ /* Adjust a number of high priority owners of this lock */
23284+ assert("nikita-1836", node->lock.nr_hipri_owners >= hipri);
23285+ node->lock.nr_hipri_owners -= hipri;
23286+
23287+ /* Handle znode deallocation on last write-lock release. */
23288+ if (znode_is_wlocked_once(node)) {
23289+ if (youdie) {
23290+ forget_znode(handle);
23291+ assert("nikita-2191", znode_invariant(node));
23292+ zput(node);
23293+ return;
23294+ }
23295+ }
23296+
23297+ if (handle->signaled)
23298+ atomic_dec(&oldowner->nr_signaled);
23299+
23300+ /* Unlocking means owner<->object link deletion */
23301+ unlink_object(handle);
23302+
23303+ /* This is enough to be sure whether an object is completely
23304+ unlocked. */
23305+ node->lock.nr_readers += rdelta;
23306+
23307+ /* If the node is locked it must have an owners list. Likewise, if
23308+ the node is unlocked it must have an empty owners list. */
23309+ assert("zam-319", equi(znode_is_locked(node),
23310+ !list_empty_careful(&node->lock.owners)));
23311+
23312+#if REISER4_DEBUG
23313+ if (!znode_is_locked(node))
23314+ ++node->times_locked;
23315+#endif
23316+
23317+ /* If there are pending lock requests we wake up a requestor */
23318+ if (!znode_is_wlocked(node))
23319+ dispatch_lock_requests(node);
23320+ if (check_deadlock_condition(node))
23321+ wake_up_all_lopri_owners(node);
23322+ spin_unlock_zlock(&node->lock);
23323+
23324+ /* minus one reference from handle->node */
23325+ assert("nikita-2190", znode_invariant(node));
23326+ ON_DEBUG(check_lock_data());
23327+ ON_DEBUG(check_lock_node_data(node));
23328+ zput(node);
23329+}
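/* Editor's note on the arithmetic above (illustrative): nr_readers
   encodes the lock state as readers-positive/writers-negative, so one
   unlock is always "+= rdelta": a read unlock takes nr_readers from,
   say, 3 to 2 (rdelta == -1), while releasing one level of a recursive
   write lock takes it from -2 to -1 (rdelta == +1). */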
23330+
23331+/* final portion of longterm-lock */
23332+static int
23333+lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23334+{
23335+ znode *node = owner->request.node;
23336+
23337+ assert_spin_locked(&(node->lock.guard));
23338+
23339+ /* If we broke with (ok == 0) it means we can_lock, now do it. */
23340+ if (ok == 0) {
23341+ lock_object(owner);
23342+ owner->request.mode = 0;
23343+ /* count a reference from lock_handle->node
23344+
23345+ znode was already referenced at the entry to this function,
23346+ hence taking the spin-lock here is not necessary (see the
23347+ comment in zref()).
23348+ */
23349+ zref(node);
23350+
23351+ LOCK_CNT_INC(long_term_locked_znode);
23352+ }
23353+ spin_unlock_zlock(&node->lock);
23354+ ON_DEBUG(check_lock_data());
23355+ ON_DEBUG(check_lock_node_data(node));
23356+ return ok;
23357+}
23358+
23359+/*
23360+ * version of longterm_lock_znode() optimized for the most common case: a read
23361+ * lock without any special flags - the kind of lock that any tree traversal
23362+ * takes on the root node, which is very frequent. Returns <= 0 as a final
23363+ * result, or 1 if the caller must fall back to the slow path. */
23364+static int longterm_lock_tryfast(lock_stack * owner)
23365+{
23366+ int result;
23367+ znode *node;
23368+ zlock *lock;
23369+
23370+ node = owner->request.node;
23371+ lock = &node->lock;
23372+
23373+ assert("nikita-3340", reiser4_schedulable());
23374+ assert("nikita-3341", request_is_deadlock_safe(node,
23375+ ZNODE_READ_LOCK,
23376+ ZNODE_LOCK_LOPRI));
23377+ spin_lock_zlock(lock);
23378+ result = can_lock_object(owner);
23379+ spin_unlock_zlock(lock);
23380+
23381+ if (likely(result != -EINVAL)) {
23382+ spin_lock_znode(node);
23383+ result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23384+ spin_unlock_znode(node);
23385+ spin_lock_zlock(lock);
23386+ if (unlikely(result != 0)) {
23387+ owner->request.mode = 0;
23388+ } else {
23389+ result = can_lock_object(owner);
23390+ if (unlikely(result == -E_REPEAT)) {
23391+ /* fall back to longterm_lock_znode() */
23392+ spin_unlock_zlock(lock);
23393+ return 1;
23394+ }
23395+ }
23396+ return lock_tail(owner, result, ZNODE_READ_LOCK);
23397+ } else
23398+ return 1;
23399+}
23400+
23401+/* locks given lock object */
23402+int longterm_lock_znode(
23403+ /* local link object (allocated by lock owner thread, usually on its own
23404+ * stack) */
23405+ lock_handle * handle,
23406+ /* znode we want to lock. */
23407+ znode * node,
23408+ /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23409+ znode_lock_mode mode,
23410+ /* lock request flags (ZNODE_LOCK_*); returns {0, -EINVAL, -E_DEADLOCK}, see return codes description. */
23411+ znode_lock_request request) {
23412+ int ret;
23413+ int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23414+ int non_blocking = 0;
23415+ int has_atom;
23416+ txn_capture cap_flags;
23417+ zlock *lock;
23418+ txn_handle *txnh;
23419+ tree_level level;
23420+
23421+ /* Get current process context */
23422+ lock_stack *owner = get_current_lock_stack();
23423+
23424+ /* Check that the lock handle is initialized and isn't already being
23425+ * used. */
23426+ assert("jmacd-808", handle->owner == NULL);
23427+ assert("nikita-3026", reiser4_schedulable());
23428+ assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23429+ assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23430+ /* long term locks are not allowed in the VM contexts (->writepage(),
23431+ * prune_{d,i}cache()).
23432+ *
23433+ * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23434+ * bug caused by d_splice_alias() only working for directories.
23435+ */
23436+ assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23437+ assert ("zam-1055", mode != ZNODE_NO_LOCK);
23438+
23439+ cap_flags = 0;
23440+ if (request & ZNODE_LOCK_NONBLOCK) {
23441+ cap_flags |= TXN_CAPTURE_NONBLOCKING;
23442+ non_blocking = 1;
23443+ }
23444+
23445+ if (request & ZNODE_LOCK_DONT_FUSE)
23446+ cap_flags |= TXN_CAPTURE_DONT_FUSE;
23447+
23448+ /* If we are changing our process priority we must adjust the number
23449+ of high priority owners for each znode that we already hold locked */
23450+ if (hipri) {
23451+ set_high_priority(owner);
23452+ } else {
23453+ set_low_priority(owner);
23454+ }
23455+
23456+ level = znode_get_level(node);
23457+
23458+ /* Fill request structure with our values. */
23459+ owner->request.mode = mode;
23460+ owner->request.handle = handle;
23461+ owner->request.node = node;
23462+
23463+ txnh = get_current_context()->trans;
23464+ lock = &node->lock;
23465+
23466+ if (mode == ZNODE_READ_LOCK && request == 0) {
23467+ ret = longterm_lock_tryfast(owner);
23468+ if (ret <= 0)
23469+ return ret;
23470+ }
23471+
23472+ has_atom = (txnh->atom != NULL);
23473+
23474+ /* Synchronize on node's zlock guard lock. */
23475+ spin_lock_zlock(lock);
23476+
23477+ if (znode_is_locked(node) &&
23478+ mode == ZNODE_WRITE_LOCK && recursive(owner))
23479+ return lock_tail(owner, 0, mode);
23480+
23481+ for (;;) {
23482+ /* Check the lock's availability: if it is unavailable we get
23483+ -E_REPEAT, 0 indicates "can_lock", otherwise the node is
23484+ invalid. */
23485+ ret = can_lock_object(owner);
23486+
23487+ if (unlikely(ret == -EINVAL)) {
23488+ /* @node is dying. Leave it alone. */
23489+ break;
23490+ }
23491+
23492+ if (unlikely(ret == -E_REPEAT && non_blocking)) {
23493+ /* either locking of @node by the current thread will
23494+ * lead to the deadlock, or lock modes are
23495+ * incompatible. */
23496+ break;
23497+ }
23498+
23499+ assert("nikita-1844", (ret == 0)
23500+ || ((ret == -E_REPEAT) && !non_blocking));
23501+ /* If we can get the lock... Try to capture first before
23502+ taking the lock. */
23503+
23504+ /* first handle commonest case where node and txnh are already
23505+ * in the same atom. */
23506+ /* safe to do without taking locks, because:
23507+ *
23508+ * 1. read of aligned word is atomic with respect to writes to
23509+ * this word
23510+ *
23511+ * 2. false negatives are handled in reiser4_try_capture().
23512+ *
23513+ * 3. false positives are impossible.
23514+ *
23515+ * PROOF: left as an exercise to the curious reader.
23516+ *
23517+ * Just kidding. Here is one:
23518+ *
23519+ * At the time T0 txnh->atom is stored in txnh_atom.
23520+ *
23521+ * At the time T1 node->atom is stored in node_atom.
23522+ *
23523+ * At the time T2 we observe that
23524+ *
23525+ * txnh_atom != NULL && node_atom == txnh_atom.
23526+ *
23527+ * Imagine that at this moment we acquire node and txnh spin
23528+ * lock in this order. Suppose that under spin lock we have
23529+ *
23530+ * node->atom != txnh->atom, (S1)
23531+ *
23532+ * at the time T3.
23533+ *
23534+ * txnh->atom != NULL still, because txnh is open by the
23535+ * current thread.
23536+ *
23537+ * Suppose node->atom == NULL, that is, node was un-captured
23538+ * between T1, and T3. But un-capturing of formatted node is
23539+ * always preceded by the call to reiser4_invalidate_lock(),
23540+ * which marks znode as JNODE_IS_DYING under zlock spin
23541+ * lock. Contradiction, because can_lock_object() above checks
23542+ * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23543+ *
23544+ * Suppose that node->atom != node_atom, that is, atom, node
23545+ * belongs to was fused into another atom: node_atom was fused
23546+ * into node->atom. Atom of txnh was equal to node_atom at T2,
23547+ * which means that under spin lock, txnh->atom == node->atom,
23548+ * because txnh->atom can only follow fusion
23549+ * chain. Contradicts S1.
23550+ *
23551+ * The same for hypothesis txnh->atom != txnh_atom. Hence,
23552+ * node->atom == node_atom == txnh_atom == txnh->atom. Again
23553+ * contradicts S1. Hence S1 is false. QED.
23554+ *
23555+ */
23556+
23557+ if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23558+ ;
23559+ } else {
23560+ /*
23561+ * unlock zlock spin lock here. It is possible for
23562+ * longterm_unlock_znode() to sneak in here, but there
23563+ * is no harm: reiser4_invalidate_lock() will mark znode
23564+ * as JNODE_IS_DYING and this will be noted by
23565+ * can_lock_object() below.
23566+ */
23567+ spin_unlock_zlock(lock);
23568+ spin_lock_znode(node);
23569+ ret = reiser4_try_capture(ZJNODE(node), mode, cap_flags);
23570+ spin_unlock_znode(node);
23571+ spin_lock_zlock(lock);
23572+ if (unlikely(ret != 0)) {
23573+ /* In the failure case, the txnmgr releases
23574+ the znode's lock (or in some cases, it was
23575+ released a while ago). There's no need to
23576+ reacquire it, so we should return here and
23577+ avoid releasing the lock. */
23578+ owner->request.mode = 0;
23579+ break;
23580+ }
23581+
23582+ /* Check the lock's availability again -- this is
23583+ because under some circumstances the capture code
23584+ has to release and reacquire the znode spinlock. */
23585+ ret = can_lock_object(owner);
23586+ }
23587+
23588+ /* This time, a return of (ret == 0) means we can lock, so we
23589+ should break out of the loop. */
23590+ if (likely(ret != -E_REPEAT || non_blocking))
23591+ break;
23592+
23593+ /* Lock is unavailable, we have to wait. */
23594+ ret = reiser4_prepare_to_sleep(owner);
23595+ if (unlikely(ret != 0))
23596+ break;
23597+
23598+ assert_spin_locked(&(node->lock.guard));
23599+ if (hipri) {
23600+ /* If we are going in high priority direction then
23601+ increase high priority requests counter for the
23602+ node */
23603+ lock->nr_hipri_requests++;
23604+ if (mode == ZNODE_WRITE_LOCK)
23605+ lock->nr_hipri_write_requests ++;
23606+ /* If there are no high priority owners for a node,
23607+ then immediately wake up low priority owners, so
23608+ they can detect possible deadlock */
23609+ if (lock->nr_hipri_owners == 0)
23610+ wake_up_all_lopri_owners(node);
23611+ }
23612+ list_add_tail(&owner->requestors_link, &lock->requestors);
23613+
23614+ /* Ok, here we have prepared a lock request, so unlock
23615+ a znode ... */
23616+ spin_unlock_zlock(lock);
23617+ /* ... and sleep */
23618+ reiser4_go_to_sleep(owner);
23619+ if (owner->request.mode == ZNODE_NO_LOCK)
23620+ goto request_is_done;
23621+ spin_lock_zlock(lock);
23622+ if (owner->request.mode == ZNODE_NO_LOCK) {
23623+ spin_unlock_zlock(lock);
23624+ request_is_done:
23625+ if (owner->request.ret_code == 0) {
23626+ LOCK_CNT_INC(long_term_locked_znode);
23627+ zref(node);
23628+ }
23629+ return owner->request.ret_code;
23630+ }
23631+ remove_lock_request(owner);
23632+ }
23633+
23634+ return lock_tail(owner, ret, mode);
23635+}
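/* Editor's sketch of typical usage (illustrative; init_lh(), done_lh()
   and the error codes are declared in lock.h, while the surrounding
   logic is made up for the example):

	lock_handle lh;
	int ret;

	init_lh(&lh);
	ret = longterm_lock_znode(&lh, node, ZNODE_WRITE_LOCK,
				  ZNODE_LOCK_HIPRI);
	if (ret == 0) {
		... operate on the locked node ...
		done_lh(&lh);          - calls longterm_unlock_znode()
	} else if (ret == -E_DEADLOCK) {
		... drop all held locks and restart the tree traversal ...
	}
*/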
23636+
23637+/* Lock object invalidation means changing the lock object's state to
23638+ `INVALID' and waiting for all other processes to cancel their lock requests. */
23639+void reiser4_invalidate_lock(lock_handle * handle /* path to lock
23640+ * owner and lock
23641+ * object is being
23642+ * invalidated. */ )
23643+{
23644+ znode *node = handle->node;
23645+ lock_stack *owner = handle->owner;
23646+
23647+ assert("zam-325", owner == get_current_lock_stack());
23648+ assert("zam-103", znode_is_write_locked(node));
23649+ assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
23650+ assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
23651+ assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
23652+ assert("nikita-3097", znode_is_wlocked_once(node));
23653+ assert_spin_locked(&(node->lock.guard));
23654+
23655+ if (handle->signaled)
23656+ atomic_dec(&owner->nr_signaled);
23657+
23658+ ZF_SET(node, JNODE_IS_DYING);
23659+ unlink_object(handle);
23660+ node->lock.nr_readers = 0;
23661+
23662+ invalidate_all_lock_requests(node);
23663+ spin_unlock_zlock(&node->lock);
23664+}
23665+
23666+/* Initializes lock_stack. */
23667+void init_lock_stack(lock_stack * owner /* pointer to
23668+ * allocated
23669+ * structure. */ )
23670+{
23671+ INIT_LIST_HEAD(&owner->locks);
23672+ INIT_LIST_HEAD(&owner->requestors_link);
23673+ spin_lock_init(&owner->sguard);
23674+ owner->curpri = 1;
23675+ init_waitqueue_head(&owner->wait);
23676+}
23677+
23678+/* Initializes lock object. */
23679+void reiser4_init_lock(zlock * lock /* pointer on allocated
23680+ * uninitialized lock object
23681+ * structure. */ )
23682+{
23683+ memset(lock, 0, sizeof(zlock));
23684+ spin_lock_init(&lock->guard);
23685+ INIT_LIST_HEAD(&lock->requestors);
23686+ INIT_LIST_HEAD(&lock->owners);
23687+}
23688+
23689+/* Transfer a lock handle (presumably so that variables can be moved between stack and
23690+ heap locations). */
23691+static void
23692+move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
23693+{
23694+ znode *node = old->node;
23695+ lock_stack *owner = old->owner;
23696+ int signaled;
23697+
23698+ /* locks_list, modified by link_object(), is not protected by
23699+ anything. This is valid because only the current thread ever
23700+ modifies the locks_list of its lock_stack.
23701+ */
23702+ assert("nikita-1827", owner == get_current_lock_stack());
23703+ assert("nikita-1831", new->owner == NULL);
23704+
23705+ spin_lock_zlock(&node->lock);
23706+
23707+ signaled = old->signaled;
23708+ if (unlink_old) {
23709+ unlink_object(old);
23710+ } else {
23711+ if (node->lock.nr_readers > 0) {
23712+ node->lock.nr_readers += 1;
23713+ } else {
23714+ node->lock.nr_readers -= 1;
23715+ }
23716+ if (signaled) {
23717+ atomic_inc(&owner->nr_signaled);
23718+ }
23719+ if (owner->curpri) {
23720+ node->lock.nr_hipri_owners += 1;
23721+ }
23722+ LOCK_CNT_INC(long_term_locked_znode);
23723+
23724+ zref(node);
23725+ }
23726+ link_object(new, owner, node);
23727+ new->signaled = signaled;
23728+
23729+ spin_unlock_zlock(&node->lock);
23730+}
23731+
23732+void move_lh(lock_handle * new, lock_handle * old)
23733+{
23734+ move_lh_internal(new, old, /*unlink_old */ 1);
23735+}
23736+
23737+void copy_lh(lock_handle * new, lock_handle * old)
23738+{
23739+ move_lh_internal(new, old, /*unlink_old */ 0);
23740+}
23741+
23742+/* after getting -E_DEADLOCK we unlock znodes until this function returns false */
23743+int reiser4_check_deadlock(void)
23744+{
23745+ lock_stack *owner = get_current_lock_stack();
23746+ return atomic_read(&owner->nr_signaled) != 0;
23747+}
23748+
23749+/* Before going to sleep we re-check "release lock" requests which might have
23750+ come from high-priority threads. */
23751+int reiser4_prepare_to_sleep(lock_stack * owner)
23752+{
23753+ assert("nikita-1847", owner == get_current_lock_stack());
23754+
23755+ /* We return -E_DEADLOCK if one or more "give me the lock" messages are
23756+ * counted in nr_signaled */
23757+ if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
23758+ assert("zam-959", !owner->curpri);
23759+ return RETERR(-E_DEADLOCK);
23760+ }
23761+ return 0;
23762+}
23763+
23764+/* Wakes up a single thread */
23765+void __reiser4_wake_up(lock_stack * owner)
23766+{
23767+ atomic_set(&owner->wakeup, 1);
23768+ wake_up(&owner->wait);
23769+}
23770+
23771+/* Puts a thread to sleep */
23772+void reiser4_go_to_sleep(lock_stack * owner)
23773+{
23774+ /* Well, we might sleep here, so holding of any spinlocks is no-no */
23775+ assert("nikita-3027", reiser4_schedulable());
23776+
23777+ wait_event(owner->wait, atomic_read(&owner->wakeup));
23778+ atomic_set(&owner->wakeup, 0);
23779+}
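/* Editor's note (illustrative): the sleep/wakeup handshake is a plain
   flag plus waitqueue pair. A blocked requestor runs, in order:

	reiser4_prepare_to_sleep(owner);   - may bail out with -E_DEADLOCK
	list_add_tail(&owner->requestors_link, &lock->requestors);
	spin_unlock_zlock(lock);
	reiser4_go_to_sleep(owner);        - wait_event() on ->wakeup

   while the granting side sets ->wakeup and calls wake_up() through
   __reiser4_wake_up(), taken under the stack spinlock when entered via
   reiser4_wake_up() (see lock.h). */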
23780+
23781+int lock_stack_isclean(lock_stack * owner)
23782+{
23783+ if (list_empty_careful(&owner->locks)) {
23784+ assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
23785+ return 1;
23786+ }
23787+
23788+ return 0;
23789+}
23790+
23791+#if REISER4_DEBUG
23792+
23793+/*
23794+ * debugging functions
23795+ */
23796+
23797+static void list_check(struct list_head *head)
23798+{
23799+ struct list_head *pos;
23800+
23801+ list_for_each(pos, head)
23802+ assert("", (pos->prev != NULL && pos->next != NULL &&
23803+ pos->prev->next == pos && pos->next->prev == pos));
23804+}
23805+
23806+/* check consistency of locking data-structures hanging off the @stack */
23807+static void check_lock_stack(lock_stack * stack)
23808+{
23809+ spin_lock_stack(stack);
23810+ /* check that stack->locks is not corrupted */
23811+ list_check(&stack->locks);
23812+ spin_unlock_stack(stack);
23813+}
23814+
23815+/* check consistency of locking data structures */
23816+void check_lock_data(void)
23817+{
23818+ check_lock_stack(&get_current_context()->stack);
23819+}
23820+
23821+/* check consistency of locking data structures for @node */
23822+void check_lock_node_data(znode * node)
23823+{
23824+ spin_lock_zlock(&node->lock);
23825+ list_check(&node->lock.owners);
23826+ list_check(&node->lock.requestors);
23827+ spin_unlock_zlock(&node->lock);
23828+}
23829+
23830+/* check that given lock request is deadlock-safe. This check is, of course,
23831+ * not exhaustive. */
23832+static int
23833+request_is_deadlock_safe(znode * node, znode_lock_mode mode,
23834+ znode_lock_request request)
23835+{
23836+ lock_stack *owner;
23837+
23838+ owner = get_current_lock_stack();
23839+ /*
23840+ * check that hipri lock request is not issued when there are locked
23841+ * nodes at the higher levels.
23842+ */
23843+ if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
23844+ znode_get_level(node) != 0) {
23845+ lock_handle *item;
23846+
23847+ list_for_each_entry(item, &owner->locks, locks_link) {
23848+ znode *other;
23849+
23850+ other = item->node;
23851+
23852+ if (znode_get_level(other) == 0)
23853+ continue;
23854+ if (znode_get_level(other) > znode_get_level(node))
23855+ return 0;
23856+ }
23857+ }
23858+ return 1;
23859+}
23860+
23861+#endif
23862+
23863+/* return pointer to static storage with name of lock_mode. For
23864+ debugging */
23865+const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ )
23866+{
23867+ if (lock == ZNODE_READ_LOCK)
23868+ return "read";
23869+ else if (lock == ZNODE_WRITE_LOCK)
23870+ return "write";
23871+ else {
23872+ static char buf[30];
23873+
23874+ sprintf(buf, "unknown: %i", lock);
23875+ return buf;
23876+ }
23877+}
23878+
23879+/* Make Linus happy.
23880+ Local variables:
23881+ c-indentation-style: "K&R"
23882+ mode-name: "LC"
23883+ c-basic-offset: 8
23884+ tab-width: 8
23885+ fill-column: 79
23886+ End:
23887+*/
23888diff -urN linux-2.6.24.orig/fs/reiser4/lock.h linux-2.6.24/fs/reiser4/lock.h
23889--- linux-2.6.24.orig/fs/reiser4/lock.h 1970-01-01 03:00:00.000000000 +0300
23890+++ linux-2.6.24/fs/reiser4/lock.h 2008-01-25 11:39:06.948210780 +0300
23891@@ -0,0 +1,249 @@
23892+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
23893+
23894+/* Long term locking data structures. See lock.c for details. */
23895+
23896+#ifndef __LOCK_H__
23897+#define __LOCK_H__
23898+
23899+#include "forward.h"
23900+#include "debug.h"
23901+#include "dformat.h"
23902+#include "key.h"
23903+#include "coord.h"
23904+#include "plugin/node/node.h"
23905+#include "txnmgr.h"
23906+#include "readahead.h"
23907+
23908+#include <linux/types.h>
23909+#include <linux/spinlock.h>
23910+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
23911+#include <asm/atomic.h>
23912+#include <linux/wait.h>
23913+
23914+/* Per-znode lock object */
23915+struct zlock {
23916+ spinlock_t guard;
23917+ /* The number of readers if positive; the number of recursively taken
23918+ write locks if negative. Protected by zlock spin lock. */
23919+ int nr_readers;
23920+ /* A number of processes (lock_stacks) that have this object
23921+ locked with high priority */
23922+ unsigned nr_hipri_owners;
23923+ /* A number of attempts to lock znode in high priority direction */
23924+ unsigned nr_hipri_requests;
23925+ /* A number of those attempts that ask for a write lock */
23926+ unsigned nr_hipri_write_requests;
23927+ /* A list of lock_handle objects with pointers to all lock_stacks which have this lock object locked */
23928+ struct list_head owners;
23929+ /* A linked list of lock_stacks that wait for this lock */
23930+ struct list_head requestors;
23931+};
23932+
23933+static inline void spin_lock_zlock(zlock *lock)
23934+{
23935+ /* check that this thread does not already hold a zlock spin lock */
23936+ assert("", LOCK_CNT_NIL(spin_locked_zlock));
23937+ /* check that spinlocks of lower priorities are not held */
23938+ assert("", LOCK_CNT_NIL(spin_locked_stack));
23939+
23940+ spin_lock(&lock->guard);
23941+
23942+ LOCK_CNT_INC(spin_locked_zlock);
23943+ LOCK_CNT_INC(spin_locked);
23944+}
23945+
23946+static inline void spin_unlock_zlock(zlock *lock)
23947+{
23948+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
23949+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
23950+
23951+ LOCK_CNT_DEC(spin_locked_zlock);
23952+ LOCK_CNT_DEC(spin_locked);
23953+
23954+ spin_unlock(&lock->guard);
23955+}
23956+
23957+#define lock_is_locked(lock) ((lock)->nr_readers != 0)
23958+#define lock_is_rlocked(lock) ((lock)->nr_readers > 0)
23959+#define lock_is_wlocked(lock) ((lock)->nr_readers < 0)
23960+#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1)
23961+#define lock_can_be_rlocked(lock) ((lock)->nr_readers >= 0)
23962+#define lock_mode_compatible(lock, mode) \
23963+ (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
23964+ ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
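/* Editor's note, example nr_readers states (illustrative): 0 - unlocked;
   3 - three concurrent read locks; -1 - one write lock; -2 - the write
   lock taken recursively twice by the same thread. Hence
   lock_is_wlocked() holds for any negative value, while
   lock_is_wlocked_once() holds only for -1. */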
23965+
23966+/* Since we have R/W znode locks we need additional bidirectional `link'
23967+ objects to implement n<->m relationship between lock owners and lock
23968+ objects. We call them `lock handles'.
23969+
23970+ Locking: see lock.c/"SHORT-TERM LOCKING"
23971+*/
23972+struct lock_handle {
23973+ /* This flag indicates that a signal to yield a lock was passed to
23974+ the lock owner and counted in owner->nr_signaled
23975+
23976+ Locking: this is accessed under spin lock on ->node.
23977+ */
23978+ int signaled;
23979+ /* A link to owner of a lock */
23980+ lock_stack *owner;
23981+ /* A link to znode locked */
23982+ znode *node;
23983+ /* A list of all locks for a process */
23984+ struct list_head locks_link;
23985+ /* A list of all owners for a znode */
23986+ struct list_head owners_link;
23987+};
23988+
23989+struct lock_request {
23990+ /* A pointer to uninitialized link object */
23991+ lock_handle *handle;
23992+ /* A pointer to the object we want to lock */
23993+ znode *node;
23994+ /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
23995+ znode_lock_mode mode;
23996+ /* the field through which dispatch_lock_requests() returns the lock request result code */
23997+ int ret_code;
23998+};
23999+
24000+/* A lock stack structure for accumulating locks owned by a process */
24001+struct lock_stack {
24002+ /* A guard lock protecting a lock stack */
24003+ spinlock_t sguard;
24004+ /* number of "yield the lock" signals posted to this process by high priority processes */
24005+ atomic_t nr_signaled;
24006+ /* Current priority of a process
24007+
24008+ This is only accessed by the current thread and thus requires no
24009+ locking.
24010+ */
24011+ int curpri;
24012+ /* A list of all locks owned by this process. Elements can be added to
24013+ * this list only by the current thread. ->node pointers in this list
24014+ * can be only changed by the current thread. */
24015+ struct list_head locks;
24016+ /* When lock_stack waits for the lock, it puts itself on double-linked
24017+ requestors list of that lock */
24018+ struct list_head requestors_link;
24019+ /* Current lock request info.
24020+
24021+ This is only accessed by the current thread and thus requires no
24022+ locking.
24023+ */
24024+ struct lock_request request;
24025+ /* the following two fields are the lock stack's
24026+ * synchronization object to use with the standard linux/wait.h
24027+ * interface. See reiser4_go_to_sleep and __reiser4_wake_up for
24028+ * usage details. */
24029+ wait_queue_head_t wait;
24030+ atomic_t wakeup;
24031+#if REISER4_DEBUG
24032+ int nr_locks; /* number of lock handles in the above list */
24033+#endif
24034+};
24035+
24036+/*
24037+ User-visible znode locking functions
24038+*/
24039+
24040+extern int longterm_lock_znode(lock_handle * handle,
24041+ znode * node,
24042+ znode_lock_mode mode,
24043+ znode_lock_request request);
24044+
24045+extern void longterm_unlock_znode(lock_handle * handle);
24046+
24047+extern int reiser4_check_deadlock(void);
24048+
24049+extern lock_stack *get_current_lock_stack(void);
24050+
24051+extern void init_lock_stack(lock_stack * owner);
24052+extern void reiser4_init_lock(zlock * lock);
24053+
24054+static inline void init_lh(lock_handle *lh)
24055+{
24056+#if REISER4_DEBUG
24057+ memset(lh, 0, sizeof *lh);
24058+ INIT_LIST_HEAD(&lh->locks_link);
24059+ INIT_LIST_HEAD(&lh->owners_link);
24060+#else
24061+ lh->node = NULL;
24062+#endif
24063+}
24064+
24065+static inline void done_lh(lock_handle *lh)
24066+{
24067+ assert("zam-342", lh != NULL);
24068+ if (lh->node != NULL)
24069+ longterm_unlock_znode(lh);
24070+}
24071+
24072+extern void move_lh(lock_handle * new, lock_handle * old);
24073+extern void copy_lh(lock_handle * new, lock_handle * old);
24074+
24075+extern int reiser4_prepare_to_sleep(lock_stack * owner);
24076+extern void reiser4_go_to_sleep(lock_stack * owner);
24077+extern void __reiser4_wake_up(lock_stack * owner);
24078+
24079+extern int lock_stack_isclean(lock_stack * owner);
24080+
24081+/* zlock object state check macros: only used in assertions. Both forms imply that the
24082+ lock is held by the current thread. */
24083+extern int znode_is_write_locked(const znode *);
24084+extern void reiser4_invalidate_lock(lock_handle *);
24085+
24086+/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
24087+#define spin_ordering_pred_stack(stack) \
24088+ (LOCK_CNT_NIL(spin_locked_stack) && \
24089+ LOCK_CNT_NIL(spin_locked_txnmgr) && \
24090+ LOCK_CNT_NIL(spin_locked_inode) && \
24091+ LOCK_CNT_NIL(rw_locked_cbk_cache) && \
24092+ LOCK_CNT_NIL(spin_locked_super_eflush) )
24093+
24094+static inline void spin_lock_stack(lock_stack *stack)
24095+{
24096+ assert("", spin_ordering_pred_stack(stack));
24097+ spin_lock(&(stack->sguard));
24098+ LOCK_CNT_INC(spin_locked_stack);
24099+ LOCK_CNT_INC(spin_locked);
24100+}
24101+
24102+static inline void spin_unlock_stack(lock_stack *stack)
24103+{
24104+ assert_spin_locked(&(stack->sguard));
24105+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
24106+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24107+ LOCK_CNT_DEC(spin_locked_stack);
24108+ LOCK_CNT_DEC(spin_locked);
24109+ spin_unlock(&(stack->sguard));
24110+}
24111+
24112+static inline void reiser4_wake_up(lock_stack * owner)
24113+{
24114+ spin_lock_stack(owner);
24115+ __reiser4_wake_up(owner);
24116+ spin_unlock_stack(owner);
24117+}
24118+
24119+const char *lock_mode_name(znode_lock_mode lock);
24120+
24121+#if REISER4_DEBUG
24122+extern void check_lock_data(void);
24123+extern void check_lock_node_data(znode * node);
24124+#else
24125+#define check_lock_data() noop
24126+#define check_lock_node_data(node) noop
24127+#endif
24128+
24129+/* __LOCK_H__ */
24130+#endif
24131+
24132+/* Make Linus happy.
24133+ Local variables:
24134+ c-indentation-style: "K&R"
24135+ mode-name: "LC"
24136+ c-basic-offset: 8
24137+ tab-width: 8
24138+ fill-column: 120
24139+ End:
24140+*/
24141diff -urN linux-2.6.24.orig/fs/reiser4/Makefile linux-2.6.24/fs/reiser4/Makefile
24142--- linux-2.6.24.orig/fs/reiser4/Makefile 1970-01-01 03:00:00.000000000 +0300
24143+++ linux-2.6.24/fs/reiser4/Makefile 2008-01-25 11:39:06.948210780 +0300
24144@@ -0,0 +1,98 @@
24145+#
24146+# reiser4/Makefile
24147+#
24148+
24149+obj-$(CONFIG_REISER4_FS) += reiser4.o
24150+
24151+reiser4-y := \
24152+ debug.o \
24153+ jnode.o \
24154+ znode.o \
24155+ key.o \
24156+ pool.o \
24157+ tree_mod.o \
24158+ estimate.o \
24159+ carry.o \
24160+ carry_ops.o \
24161+ lock.o \
24162+ tree.o \
24163+ context.o \
24164+ tap.o \
24165+ coord.o \
24166+ block_alloc.o \
24167+ txnmgr.o \
24168+ kassign.o \
24169+ flush.o \
24170+ wander.o \
24171+ eottl.o \
24172+ search.o \
24173+ page_cache.o \
24174+ seal.o \
24175+ dscale.o \
24176+ flush_queue.o \
24177+ ktxnmgrd.o \
24178+ blocknrset.o \
24179+ super.o \
24180+ super_ops.o \
24181+ fsdata.o \
24182+ export_ops.o \
24183+ oid.o \
24184+ tree_walk.o \
24185+ inode.o \
24186+ vfs_ops.o \
24187+ as_ops.o \
24188+ entd.o\
24189+ readahead.o \
24190+ status_flags.o \
24191+ init_super.o \
24192+ safe_link.o \
24193+ \
24194+ plugin/plugin.o \
24195+ plugin/plugin_set.o \
24196+ plugin/node/node.o \
24197+ plugin/object.o \
24198+ plugin/cluster.o \
24199+ plugin/inode_ops.o \
24200+ plugin/inode_ops_rename.o \
24201+ plugin/file_ops.o \
24202+ plugin/file_ops_readdir.o \
24203+ plugin/file_plugin_common.o \
24204+ plugin/file/file.o \
24205+ plugin/file/tail_conversion.o \
24206+ plugin/file/file_conversion.o \
24207+ plugin/file/symlink.o \
24208+ plugin/file/cryptcompress.o \
24209+ plugin/dir_plugin_common.o \
24210+ plugin/dir/hashed_dir.o \
24211+ plugin/dir/seekable_dir.o \
24212+ plugin/node/node40.o \
24213+ \
24214+ plugin/crypto/cipher.o \
24215+ plugin/crypto/digest.o \
24216+ \
24217+ plugin/compress/compress.o \
24218+ plugin/compress/compress_mode.o \
24219+ \
24220+ plugin/item/static_stat.o \
24221+ plugin/item/sde.o \
24222+ plugin/item/cde.o \
24223+ plugin/item/blackbox.o \
24224+ plugin/item/internal.o \
24225+ plugin/item/tail.o \
24226+ plugin/item/ctail.o \
24227+ plugin/item/extent.o \
24228+ plugin/item/extent_item_ops.o \
24229+ plugin/item/extent_file_ops.o \
24230+ plugin/item/extent_flush_ops.o \
24231+ \
24232+ plugin/hash.o \
24233+ plugin/fibration.o \
24234+ plugin/tail_policy.o \
24235+ plugin/item/item.o \
24236+ \
24237+ plugin/security/perm.o \
24238+ plugin/space/bitmap.o \
24239+ \
24240+ plugin/disk_format/disk_format40.o \
24241+ plugin/disk_format/disk_format.o
24242+
24243diff -urN linux-2.6.24.orig/fs/reiser4/oid.c linux-2.6.24/fs/reiser4/oid.c
24244--- linux-2.6.24.orig/fs/reiser4/oid.c 1970-01-01 03:00:00.000000000 +0300
24245+++ linux-2.6.24/fs/reiser4/oid.c 2008-01-25 11:39:06.952211810 +0300
24246@@ -0,0 +1,141 @@
24247+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24248+
24249+#include "debug.h"
24250+#include "super.h"
24251+#include "txnmgr.h"
24252+
24253+/* We used to have an oid allocation plugin. It was removed because it
24254+ was recognized as providing an unneeded level of abstraction. If anyone
24255+ ever finds it useful, look at yet_unneeded_abstractions/oid
24256+*/
24257+
24258+/*
24259+ * initialize in-memory data for oid allocator at @super. @nr_files and @next
24260+ * are provided by disk format plugin that reads them from the disk during
24261+ * mount.
24262+ */
24263+int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24264+{
24265+ reiser4_super_info_data *sbinfo;
24266+
24267+ sbinfo = get_super_private(super);
24268+
24269+ sbinfo->next_to_use = next;
24270+ sbinfo->oids_in_use = nr_files;
24271+ return 0;
24272+}
24273+
24274+/*
24275+ * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
24276+ * runs out of oids.
24277+ */
24278+oid_t oid_allocate(struct super_block * super)
24279+{
24280+ reiser4_super_info_data *sbinfo;
24281+ oid_t oid;
24282+
24283+ sbinfo = get_super_private(super);
24284+
24285+ spin_lock_reiser4_super(sbinfo);
24286+ if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
24287+ oid = sbinfo->next_to_use++;
24288+ sbinfo->oids_in_use++;
24289+ } else
24290+ oid = ABSOLUTE_MAX_OID;
24291+ spin_unlock_reiser4_super(sbinfo);
24292+ return oid;
24293+}
24294+
24295+/*
24296+ * Tell oid allocator that @oid is now free.
24297+ */
24298+int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24299+{
24300+ reiser4_super_info_data *sbinfo;
24301+
24302+ sbinfo = get_super_private(super);
24303+
24304+ spin_lock_reiser4_super(sbinfo);
24305+ sbinfo->oids_in_use--;
24306+ spin_unlock_reiser4_super(sbinfo);
24307+ return 0;
24308+}
24309+
24310+/*
24311+ * return next @oid that would be allocated (i.e., returned by oid_allocate())
24312+ * without actually allocating it. This is used by disk format plugin to save
24313+ * oid allocator state on the disk.
24314+ */
24315+oid_t oid_next(const struct super_block * super)
24316+{
24317+ reiser4_super_info_data *sbinfo;
24318+ oid_t oid;
24319+
24320+ sbinfo = get_super_private(super);
24321+
24322+ spin_lock_reiser4_super(sbinfo);
24323+ oid = sbinfo->next_to_use;
24324+ spin_unlock_reiser4_super(sbinfo);
24325+ return oid;
24326+}
24327+
24328+/*
24329+ * returns number of currently used oids. This is used by statfs(2) to report
24330+ * number of "inodes" and by disk format plugin to save oid allocator state on
24331+ * the disk.
24332+ */
24333+long oids_used(const struct super_block *super)
24334+{
24335+ reiser4_super_info_data *sbinfo;
24336+ oid_t used;
24337+
24338+ sbinfo = get_super_private(super);
24339+
24340+ spin_lock_reiser4_super(sbinfo);
24341+ used = sbinfo->oids_in_use;
24342+ spin_unlock_reiser4_super(sbinfo);
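 /* statfs(2) reports the count as a long; clamp values that do not
    fit into a long and return -1 as an overflow indicator */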
24343+ if (used < (__u64) ((long)~0) >> 1)
24344+ return (long)used;
24345+ else
24346+ return (long)-1;
24347+}
24348+
24349+/*
24350+ * Count oid as allocated in atom. This is done after call to oid_allocate()
24351+ * at the point when we are irrevocably committed to creation of the new file
24352+ * (i.e., when oid allocation cannot be any longer rolled back due to some
24353+ * error).
24354+ */
24355+void oid_count_allocated(void)
24356+{
24357+ txn_atom *atom;
24358+
24359+ atom = get_current_atom_locked();
24360+ atom->nr_objects_created++;
24361+ spin_unlock_atom(atom);
24362+}
24363+
24364+/*
24365+ * Count oid as free in atom. This is done after call to oid_release() at the
24366+ * point when we are irrevocably committed to the deletion of the file (i.e.,
24367+ * when oid release cannot be any longer rolled back due to some error).
24368+ */
24369+void oid_count_released(void)
24370+{
24371+ txn_atom *atom;
24372+
24373+ atom = get_current_atom_locked();
24374+ atom->nr_objects_deleted++;
24375+ spin_unlock_atom(atom);
24376+}
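/* Editor's sketch of the intended pairing on file creation, per the
   comments above (illustrative; error handling and the actual inode
   construction are omitted):

	oid_t oid = oid_allocate(super);
	if (oid == ABSOLUTE_MAX_OID)
		fail: the allocator has run out of object ids;
	... create the file, reach the point of no return ...
	oid_count_allocated();

   Deletion mirrors this with oid_release() followed by
   oid_count_released(). */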
24377+
24378+/*
24379+ Local variables:
24380+ c-indentation-style: "K&R"
24381+ mode-name: "LC"
24382+ c-basic-offset: 8
24383+ tab-width: 8
24384+ fill-column: 120
24385+ scroll-step: 1
24386+ End:
24387+*/
24388diff -urN linux-2.6.24.orig/fs/reiser4/page_cache.c linux-2.6.24/fs/reiser4/page_cache.c
24389--- linux-2.6.24.orig/fs/reiser4/page_cache.c 1970-01-01 03:00:00.000000000 +0300
24390+++ linux-2.6.24/fs/reiser4/page_cache.c 2008-01-25 11:54:46.665843146 +0300
24391@@ -0,0 +1,714 @@
24392+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24393+ * reiser4/README */
24394+
24395+/* Memory pressure hooks. Fake inodes handling. */
24396+
24397+/* GLOSSARY
24398+
24399+ . Formatted and unformatted nodes.
24400+ Elements of reiser4 balanced tree to store data and metadata.
24401+ Unformatted nodes are pointed to by extent pointers. Such nodes
24402+ are used to store data of large objects. Unlike unformatted nodes,
24403+ formatted ones have associated format described by node4X plugin.
24404+
24405+ . Jnode (or journal node)
24406+ The in-memory header which is used to track formatted and unformatted
24407+ nodes, bitmap nodes, etc. In particular, jnodes are used to track
24408+ transactional information associated with each block(see reiser4/jnode.c
24409+ for details).
24410+
24411+ . Znode
24412+ The in-memory header which is used to track formatted nodes. Contains
24413+ embedded jnode (see reiser4/znode.c for details).
24414+*/
24415+
24416+/* We store all file system meta data (and data, of course) in the page cache.
24417+
24418+ What does this mean? Instead of using bread/brelse we create a special
24419+ "fake" inode (one per super block) and store the content of formatted nodes
24420+ in pages bound to this inode in the page cache. In newer kernels bread()
24421+ already uses an inode attached to the block device (bd_inode). The advantage
24422+ of having our own fake inode is that we can install appropriate methods in
24423+ its address_space operations. Such methods are called by the VM on memory
24424+ pressure (or during background page flushing) and we can use them to react
24425+ appropriately.
24426+
24427+ In initial version we only support one block per page. Support for multiple
24428+ blocks per page is complicated by relocation.
24429+
24430+ To each page, used by reiser4, jnode is attached. jnode is analogous to
24431+ buffer head. Difference is that jnode is bound to the page permanently:
24432+ jnode cannot be removed from memory until its backing page is.
24433+
24434+ A jnode contains a pointer to its page (->pg field) and the page contains a
24435+ pointer back to the jnode in its ->private field. The pointer from jnode to
24436+ page is protected by the jnode's spinlock and the pointer from page to jnode
24437+ is protected by the page lock (PG_locked bit). Lock ordering is: first take
24438+ the page lock, then the jnode spin lock. To go in the reverse direction use
24439+ the jnode_lock_page() function, built on the standard try-lock-and-release device.
24440+
24441+ Properties:
24442+
24443+ 1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24444+ reference counter is increased.
24445+
24446+ 2. when jnode-to-page mapping is destroyed (by page_clear_jnode()), page
24447+ reference counter is decreased.
24448+
24449+ 3. on jload() reference counter on jnode page is increased, page is
24450+ kmapped and `referenced'.
24451+
24452+ 4. on jrelse() inverse operations are performed.
24453+
24454+ 5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24455+
24456+ DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24457+ historically.]
24458+
24459+ [In the following discussion, `lock' invariably means long term lock on
24460+ znode.] (What about page locks?)
24461+
24462+ There is some special class of deadlock possibilities related to memory
24463+ pressure. Locks acquired by other reiser4 threads are accounted for in the
24464+ deadlock prevention mechanism (lock.c), but when ->vm_writeback() is invoked
24465+ an additional hidden arc is added to the locking graph: the thread that
24466+ tries to allocate memory waits for ->vm_writeback() to finish. If this
24467+ thread holds a lock and ->vm_writeback() tries to acquire this lock, deadlock
24468+ prevention is useless.
24469+
24470+ Another related problem is the possibility for ->vm_writeback() to run out
24471+ of memory itself. This is not a problem for ext2 and friends, because their
24472+ ->vm_writeback() doesn't allocate much memory, but reiser4 flush is
24473+ definitely able to allocate huge amounts of memory.
24474+
24475+ It seems that there is no reliable way to cope with the problems above.
24476+ Instead it was decided that ->vm_writeback() (as invoked in the kswapd
24477+ context) wouldn't perform any flushing itself, but rather should just wake
24478+ up some auxiliary thread dedicated to this purpose (or the same thread
24479+ that does periodic commit of old atoms (ktxnmgrd.c)).
24480+
24481+ Details:
24482+
24483+ 1. Page is called `reclaimable' against particular reiser4 mount F if this
24484+ page can be ultimately released by try_to_free_pages() under presumptions
24485+ that:
24486+
24487+ a. ->vm_writeback() for F is no-op, and
24488+
24489+ b. none of the threads accessing F are making any progress, and
24490+
24491+ c. other reiser4 mounts obey the same memory reservation protocol as F
24492+ (described below).
24493+
24494+ For example, clean un-pinned page, or page occupied by ext2 data are
24495+ reclaimable against any reiser4 mount.
24496+
24497+ When there is more than one reiser4 mount in a system, condition (c) makes
24498+ reclaim-ability not easily verifiable beyond trivial cases mentioned above.
24499+
24500+ THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24501+
24502+ The fake inode is used to bind formatted nodes and each node is indexed
24503+ within the fake inode by its block number. If the block size is smaller than
24504+ the page size, it may so happen that a block mapped to the page with a
24505+ formatted node is occupied by an unformatted node or is unallocated. This
24506+ leads to some complications, because flushing the whole page can lead to an
24507+ incorrect overwrite of an unformatted node that, moreover, can be cached in
24508+ some other place as part of the file body. To avoid this, buffers for
24509+ unformatted nodes are never marked dirty. Also pages in the fake inode are
24510+ never marked dirty. This rules out usage of ->writepage() as a memory
24511+ pressure hook. Instead ->releasepage() is used.
24512+
24513+ Josh is concerned that page->buffer is going to die. This should not pose a
24514+ significant problem though, because we need to add some data structures to
24515+ the page anyway (jnode) and all necessary bookkeeping can be put there.
24516+
24517+*/
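/* Editor's sketch of properties 1-5 above in call form (illustrative;
   only the function names come from the text above):

	jnode_attach_page(node, page);   - page gets a reference (1)
	jload(node);                     - page referenced and kmapped (3)
	... use node data ...
	jrelse(node);                    - inverse of jload() (4)
	page_clear_jnode(page, node);    - reference from (1) dropped (2)
*/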
24518+
24519+/* Life cycle of pages/nodes.
24520+
24521+ jnode contains reference to page and page contains reference back to
24522+ jnode. This reference is counted in page ->count. Thus, page bound to jnode
24523+ cannot be released back into free pool.
24524+
24525+ 1. Formatted nodes.
24526+
24527+ 1. formatted node is represented by znode. When new znode is created its
24528+ ->pg pointer is NULL initially.
24529+
24530+ 2. when node content is loaded into znode (by a call to zload()) for the
24531+ first time, the following happens (in a call to ->read_node() or
24532+ ->allocate_node()):
24533+
24534+ 1. new page is added to the page cache.
24535+
24536+ 2. this page is attached to znode and its ->count is increased.
24537+
24538+ 3. page is kmapped.
24539+
24540+ 3. if more calls to zload() follow (without corresponding zrelses), page
24541+ counter is left intact and in its stead ->d_count is increased in znode.
24542+
24543+ 4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
24544+ ->release_node() is called and page is kunmapped as result.
24545+
24546+ 5. at some moment node can be captured by a transaction. Its ->x_count
24547+ is then increased by transaction manager.
24548+
24549+ 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
24550+ bit set) following will happen (also see comment at the top of znode.c):
24551+
24552+ 1. when the last lock is released, the node will be uncaptured from the
24553+ transaction. This releases the reference that the transaction manager
24554+ acquired at step 5.
24555+
24556+ 2. when last reference is released, zput() detects that node is
24557+ actually deleted and calls ->delete_node()
24558+ operation. page_cache_delete_node() implementation detaches jnode from
24559+ page and releases page.
24560+
24561+ 7. otherwise (node wasn't removed from the tree), the last reference to
24562+ the znode will be released after the transaction manager has committed
24563+ the transaction the node was in. This implies squallocing of this node
24564+ (see flush.c). Nothing special happens at this point. The znode is still
24565+ in the hash table and the page is still attached to it.
24566+
24567+ 8. the znode is actually removed from memory because of memory
24568+ pressure, or during umount (znodes_tree_done()). Either way, the znode
24569+ is removed by a call to zdrop(). At this moment, the page is detached
24570+ from the znode and removed from the inode address space.
24571+
24572+*/
24573+
24574+#include "debug.h"
24575+#include "dformat.h"
24576+#include "key.h"
24577+#include "txnmgr.h"
24578+#include "jnode.h"
24579+#include "znode.h"
24580+#include "block_alloc.h"
24581+#include "tree.h"
24582+#include "vfs_ops.h"
24583+#include "inode.h"
24584+#include "super.h"
24585+#include "entd.h"
24586+#include "page_cache.h"
24587+#include "ktxnmgrd.h"
24588+
24589+#include <linux/types.h>
24590+#include <linux/fs.h>
24591+#include <linux/mm.h> /* for struct page */
24592+#include <linux/swap.h> /* for struct page */
24593+#include <linux/pagemap.h>
24594+#include <linux/bio.h>
24595+#include <linux/writeback.h>
24596+#include <linux/blkdev.h>
24597+
24598+static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp);
24599+
24600+static struct address_space_operations formatted_fake_as_ops;
24601+
24602+static const oid_t fake_ino = 0x1;
24603+static const oid_t bitmap_ino = 0x2;
24604+static const oid_t cc_ino = 0x3;
24605+
24606+static void
24607+init_fake_inode(struct super_block *super, struct inode *fake,
24608+ struct inode **pfake)
24609+{
24610+ assert("nikita-2168", fake->i_state & I_NEW);
24611+ fake->i_mapping->a_ops = &formatted_fake_as_ops;
24612+ *pfake = fake;
24613+ /* NOTE-NIKITA something else? */
24614+ unlock_new_inode(fake);
24615+}
24616+
24617+/**
24618+ * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps
24619+ * @super: super block to init fake inode for
24620+ *
24621+ * Initializes fake inode to which formatted nodes are bound in the page cache
24622+ * and inode for bitmaps.
24623+ */
24624+int reiser4_init_formatted_fake(struct super_block *super)
24625+{
24626+ struct inode *fake;
24627+ struct inode *bitmap;
24628+ struct inode *cc;
24629+ reiser4_super_info_data *sinfo;
24630+
24631+ assert("nikita-1703", super != NULL);
24632+
24633+ sinfo = get_super_private_nocheck(super);
24634+ fake = iget_locked(super, oid_to_ino(fake_ino));
24635+
24636+ if (fake != NULL) {
24637+ init_fake_inode(super, fake, &sinfo->fake);
24638+
24639+ bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
24640+ if (bitmap != NULL) {
24641+ init_fake_inode(super, bitmap, &sinfo->bitmap);
24642+
24643+ cc = iget_locked(super, oid_to_ino(cc_ino));
24644+ if (cc != NULL) {
24645+ init_fake_inode(super, cc, &sinfo->cc);
24646+ return 0;
24647+ } else {
24648+ iput(sinfo->fake);
24649+ iput(sinfo->bitmap);
24650+ sinfo->fake = NULL;
24651+ sinfo->bitmap = NULL;
24652+ }
24653+ } else {
24654+ iput(sinfo->fake);
24655+ sinfo->fake = NULL;
24656+ }
24657+ }
24658+ return RETERR(-ENOMEM);
24659+}
24660+
24661+/**
24662+ * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps
24663+ * @super: super block to init fake inode for
24664+ *
24665+ * Releases inodes which were used as address spaces of bitmap and formatted
24666+ * nodes.
24667+ */
24668+void reiser4_done_formatted_fake(struct super_block *super)
24669+{
24670+ reiser4_super_info_data *sinfo;
24671+
24672+ sinfo = get_super_private_nocheck(super);
24673+
24674+ if (sinfo->fake != NULL) {
24675+ iput(sinfo->fake);
24676+ sinfo->fake = NULL;
24677+ }
24678+
24679+ if (sinfo->bitmap != NULL) {
24680+ iput(sinfo->bitmap);
24681+ sinfo->bitmap = NULL;
24682+ }
24683+
24684+ if (sinfo->cc != NULL) {
24685+ iput(sinfo->cc);
24686+ sinfo->cc = NULL;
24687+ }
24688+ return;
24689+}
24690+
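+/* Wait until writeback of @page completes. The page must be locked on
+ entry; the lock is dropped while sleeping so that the writeback
+ completion path can make progress, and PageWriteback() is re-checked
+ after re-locking in case writeback was restarted in the meantime. */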
24691+void reiser4_wait_page_writeback(struct page *page)
24692+{
24693+ assert("zam-783", PageLocked(page));
24694+
24695+ do {
24696+ unlock_page(page);
24697+ wait_on_page_writeback(page);
24698+ lock_page(page);
24699+ } while (PageWriteback(page));
24700+}
24701+
24702+/* return tree @page is in */
24703+reiser4_tree *reiser4_tree_by_page(const struct page *page /* page to query */ )
24704+{
24705+ assert("nikita-2461", page != NULL);
24706+ return &get_super_private(page->mapping->host->i_sb)->tree;
24707+}
24708+
+/* completion handler for a single-page bio-based read.
+
+ mpage_end_io_read() would also do, but it is static.
+*/
24714+static void
24715+end_bio_single_page_read(struct bio *bio, int err UNUSED_ARG)
24716+{
24717+ struct page *page;
24718+
24719+ page = bio->bi_io_vec[0].bv_page;
24720+
24721+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
24722+ SetPageUptodate(page);
24723+ } else {
24724+ ClearPageUptodate(page);
24725+ SetPageError(page);
24726+ }
24727+ unlock_page(page);
24728+ bio_put(bio);
24729+}
24730+
+/* completion handler for a single-page bio-based write.
+
+ mpage_end_io_write() would also do, but it is static.
+*/
24736+static void
24737+end_bio_single_page_write(struct bio *bio, int err UNUSED_ARG)
24738+{
24739+ struct page *page;
24740+
24741+ page = bio->bi_io_vec[0].bv_page;
24742+
24743+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
24744+ SetPageError(page);
24745+ end_page_writeback(page);
24746+ bio_put(bio);
24747+}
24748+
24749+/* ->readpage() method for formatted nodes */
24750+static int formatted_readpage(struct file *f UNUSED_ARG,
24751+ struct page *page /* page to read */ )
24752+{
24753+ assert("nikita-2412", PagePrivate(page) && jprivate(page));
24754+ return reiser4_page_io(page, jprivate(page), READ,
24755+ reiser4_ctx_gfp_mask_get());
24756+}
24757+
24758+/**
24759+ * reiser4_page_io - submit single-page bio request
24760+ * @page: page to perform io for
24761+ * @node: jnode of page
24762+ * @rw: read or write
24763+ * @gfp: gfp mask for bio allocation
24764+ *
24765+ * Submits single page read or write.
24766+ */
24767+int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
24768+{
24769+ struct bio *bio;
24770+ int result;
24771+
24772+ assert("nikita-2094", page != NULL);
24773+ assert("nikita-2226", PageLocked(page));
24774+ assert("nikita-2634", node != NULL);
24775+ assert("nikita-2893", rw == READ || rw == WRITE);
24776+
+ if (rw == WRITE) {
+ /* writes to a read-only filesystem are silently skipped */
+ if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
+ unlock_page(page);
+ return 0;
+ }
+ }
24783+
24784+ bio = page_bio(page, node, rw, gfp);
24785+ if (!IS_ERR(bio)) {
24786+ if (rw == WRITE) {
24787+ set_page_writeback(page);
24788+ unlock_page(page);
24789+ }
24790+ reiser4_submit_bio(rw, bio);
24791+ result = 0;
24792+ } else {
24793+ unlock_page(page);
24794+ result = PTR_ERR(bio);
24795+ }
24796+
24797+ return result;
24798+}
24799+
24800+/* helper function to construct bio for page */
24801+static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
24802+{
24803+ struct bio *bio;
24804+ assert("nikita-2092", page != NULL);
24805+ assert("nikita-2633", node != NULL);
24806+
+ /* Simple implementation under the assumption that blocksize == pagesize.
+
+ We only have to submit one block, but submit_bh() would allocate a
+ bio anyway, so let's use all the bells and whistles of the bio code.
+ */
24812+
24813+ bio = bio_alloc(gfp, 1);
24814+ if (bio != NULL) {
24815+ int blksz;
24816+ struct super_block *super;
24817+ reiser4_block_nr blocknr;
24818+
24819+ super = page->mapping->host->i_sb;
24820+ assert("nikita-2029", super != NULL);
24821+ blksz = super->s_blocksize;
24822+ assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
24823+
24824+ spin_lock_jnode(node);
24825+ blocknr = *jnode_get_io_block(node);
24826+ spin_unlock_jnode(node);
24827+
24828+ assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
24829+ assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr));
24830+
24831+ bio->bi_bdev = super->s_bdev;
+ /* fill bio->bi_sector before calling bio_add_page(), because
+ * q->merge_bvec_fn may want to inspect it (see
+ * drivers/md/linear.c:linear_mergeable_bvec() for example). */
24835+ bio->bi_sector = blocknr * (blksz >> 9);
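+ /* e.g. with 4096-byte blocks and 512-byte sectors blksz >> 9 == 8,
+ * so block 100 starts at sector 800 */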
24836+
24837+ if (!bio_add_page(bio, page, blksz, 0)) {
24838+ warning("nikita-3452",
24839+ "Single page bio cannot be constructed");
24840+ return ERR_PTR(RETERR(-EINVAL));
24841+ }
24842+
24843+ /* bio -> bi_idx is filled by bio_init() */
24844+ bio->bi_end_io = (rw == READ) ?
24845+ end_bio_single_page_read : end_bio_single_page_write;
24846+
24847+ return bio;
24848+ } else
24849+ return ERR_PTR(RETERR(-ENOMEM));
24850+}
24851+
24852+/* this function is internally called by jnode_make_dirty() */
24853+int reiser4_set_page_dirty_internal(struct page *page)
24854+{
24855+ struct address_space *mapping;
24856+
24857+ mapping = page->mapping;
24858+ BUG_ON(mapping == NULL);
24859+
24860+ if (!TestSetPageDirty(page)) {
24861+ if (mapping_cap_account_dirty(mapping))
24862+ inc_zone_page_state(page, NR_FILE_DIRTY);
24863+
24864+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
24865+ }
24866+
+ /* a page of the fake inode backs a formatted node, whose jnode
+ must already be dirty at this point */
24868+ if (mapping->host == reiser4_get_super_fake(mapping->host->i_sb))
24869+ assert("", JF_ISSET(jprivate(page), JNODE_DIRTY));
24870+ return 0;
24871+}
24872+
24873+#if 0
24874+static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
24875+{
24876+ if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
24877+ return 1;
24878+ if (ctx->super != s)
24879+ return 1;
24880+ if (get_super_private(s)->entd.tsk == current)
24881+ return 0;
24882+ if (!lock_stack_isclean(&ctx->stack))
24883+ return 0;
24884+ if (ctx->trans->atom != NULL)
24885+ return 0;
24886+ return 1;
24887+}
24888+#endif
24889+
+/**
+ * reiser4_writepage - ->writepage() of struct address_space_operations
+ * @page: page to write
+ * @wbc: writeback control
+ *
+ * Common memory pressure notification: the page is handed over to the
+ * entd daemon via write_page_by_ent().
+ */
24898+int reiser4_writepage(struct page *page,
24899+ struct writeback_control *wbc)
24900+{
24901+ struct super_block *s;
24902+ reiser4_context *ctx;
24903+
24904+ assert("vs-828", PageLocked(page));
24905+
24906+ s = page->mapping->host->i_sb;
24907+ ctx = get_current_context_check();
24908+
24909+ //assert("", can_hit_entd(ctx, s));
24910+ return write_page_by_ent(page, wbc);
24911+}
24912+
24913+/* ->set_page_dirty() method of formatted address_space */
24914+static int formatted_set_page_dirty(struct page *page)
24915+{
24916+ assert("nikita-2173", page != NULL);
24917+ BUG();
24918+ return __set_page_dirty_nobuffers(page);
24919+}
24920+
+/* The ->writepages() method of reiser4 address space operations is used to
+ pull pages dirtied via mmap into transactions. Only regular files can
+ have such pages. The fake inode is used to access formatted nodes via the
+ page cache. As formatted nodes can never be mmapped, the fake inode's
+ ->writepages() has nothing to do */
24926+static int
24927+writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
24928+{
24929+ return 0;
24930+}
24931+
24932+/* address space operations for the fake inode */
24933+static struct address_space_operations formatted_fake_as_ops = {
24934+ /* Perform a writeback of a single page as a memory-freeing
24935+ * operation. */
24936+ .writepage = reiser4_writepage,
24937+ /* this is called to read formatted node */
24938+ .readpage = formatted_readpage,
+ /* ->sync_page() method of the fake inode address space operations.
+
+ This is a most annoyingly misnamed method: it is actually called
+ from wait_on_page_bit() and lock_page(), and its purpose is to
+ start io by jabbing the device drivers.
+ */
24946+ .sync_page = block_sync_page,
+ /* Write back some dirty pages from this mapping; called during sync
+ (pdflush) */
24949+ .writepages = writepages_fake,
24950+ /* Set a page dirty */
24951+ .set_page_dirty = formatted_set_page_dirty,
24952+ /* used for read-ahead. Not applicable */
24953+ .readpages = NULL,
24954+ .prepare_write = NULL,
24955+ .commit_write = NULL,
24956+ .bmap = NULL,
24957+ /* called just before page is being detached from inode mapping and
24958+ removed from memory. Called on truncate, cut/squeeze, and
24959+ umount. */
24960+ .invalidatepage = reiser4_invalidatepage,
24961+ /* this is called by shrink_cache() so that file system can try to
24962+ release objects (jnodes, buffers, journal heads) attached to page
24963+ and, may be made page itself free-able.
24964+ */
24965+ .releasepage = reiser4_releasepage,
24966+ .direct_IO = NULL
24967+};
24968+
24969+/* called just before page is released (no longer used by reiser4). Callers:
24970+ jdelete() and extent2tail(). */
24971+void reiser4_drop_page(struct page *page)
24972+{
24973+ assert("nikita-2181", PageLocked(page));
24974+ clear_page_dirty_for_io(page);
24975+ ClearPageUptodate(page);
24976+#if defined(PG_skipped)
24977+ ClearPageSkipped(page);
24978+#endif
24979+ unlock_page(page);
24980+}
24981+
24982+#define JNODE_GANG_SIZE (16)
24983+
24984+/* find all jnodes from range specified and invalidate them */
24985+static int
24986+truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
24987+{
24988+ reiser4_inode *info;
24989+ int truncated_jnodes;
24990+ reiser4_tree *tree;
24991+ unsigned long index;
24992+ unsigned long end;
24993+
24994+ if (inode_file_plugin(inode) ==
24995+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
+ /*
+ * No need to get rid of jnodes here: if the single jnode of the
+ * page cluster had no page, then it was already found and killed
+ * in truncate_complete_page_cluster()->jput()->jput_final();
+ * otherwise it will be dropped by reiser4_invalidatepage()
+ */
25003+ return 0;
25004+ truncated_jnodes = 0;
25005+
25006+ info = reiser4_inode_data(inode);
25007+ tree = reiser4_tree_by_inode(inode);
25008+
25009+ index = from;
25010+ end = from + count;
25011+
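+ /*
+ * Gang-lookup loop: under the tree read lock, grab up to
+ * JNODE_GANG_SIZE jnodes starting at @index and take a reference
+ * on those still below @end; then, with the lock dropped,
+ * uncapture and unhash each referenced jnode. Stop as soon as a
+ * lookup returns a jnode past @end or finds nothing at all.
+ */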
25012+ while (1) {
25013+ jnode *gang[JNODE_GANG_SIZE];
25014+ int taken;
25015+ int i;
25016+ jnode *node;
25017+
25018+ assert("nikita-3466", index <= end);
25019+
25020+ read_lock_tree(tree);
25021+ taken =
25022+ radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
25023+ (void **)gang, index,
25024+ JNODE_GANG_SIZE);
25025+ for (i = 0; i < taken; ++i) {
25026+ node = gang[i];
25027+ if (index_jnode(node) < end)
25028+ jref(node);
25029+ else
25030+ gang[i] = NULL;
25031+ }
25032+ read_unlock_tree(tree);
25033+
25034+ for (i = 0; i < taken; ++i) {
25035+ node = gang[i];
25036+ if (node != NULL) {
25037+ index = max(index, index_jnode(node));
25038+ spin_lock_jnode(node);
25039+ assert("edward-1457", node->pg == NULL);
+ /* this is always called after
+ truncate_inode_pages_range(), therefore the
+ jnode can not have a page here. New pages can
+ not be created either, because
+ truncate_jnodes_range() runs under exclusive
+ access to the file, whereas new page creation
+ requires non-exclusive access */
25047+ JF_SET(node, JNODE_HEARD_BANSHEE);
25048+ reiser4_uncapture_jnode(node);
25049+ unhash_unformatted_jnode(node);
25050+ truncated_jnodes++;
25051+ jput(node);
25052+ } else
25053+ break;
25054+ }
25055+ if (i != taken || taken == 0)
25056+ break;
25057+ }
25058+ return truncated_jnodes;
25059+}
25060+
+/* Truncating files in reiser4: problems and solutions.
+
+ VFS calls the fs's truncate after it has called truncate_inode_pages()
+ to get rid of the pages covering the part of the file being truncated.
+ In reiser4 this may leave unallocated extents that have no jnodes,
+ which the flush code does not expect. The solution is straightforward:
+ since vfs's truncate is implemented via the setattr operation, it seems
+ reasonable to have a ->setattr() that cuts the file body. However, the
+ flush code also does not expect dirty pages without parent items, so it
+ is impossible to first cut all items and then truncate all pages in two
+ separate steps. We resolve this by cutting items one-by-one. Each such
+ fine-grained step, performed under a long-term znode lock, finishes by
+ calling the ->kill_hook() method of the killed item to remove the pages
+ and jnodes bound to it.
+
+ The following function is the common part of the mentioned kill hooks.
+ It is also called before tail-to-extent conversion (to avoid managing
+ several copies of the data).
+*/
25080+void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
25081+ unsigned long count, int even_cows)
25082+{
25083+ loff_t from_bytes, count_bytes;
25084+
25085+ if (count == 0)
25086+ return;
25087+ from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
25088+ count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
25089+
25090+ unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
25091+ truncate_inode_pages_range(mapping, from_bytes,
25092+ from_bytes + count_bytes - 1);
25093+ truncate_jnodes_range(mapping->host, from, count);
25094+}
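+
+/* A minimal usage sketch, not part of the original code: roughly how an
+ item plugin's kill hook would drop the pages and jnodes backing a range
+ of file body; example_kill_hook() and its arguments are illustrative
+ names only. */
+#if 0
+static void example_kill_hook(struct inode *inode, pgoff_t from,
+ pgoff_t count)
+{
+ /* unmap and truncate pages, then kill their jnodes; the last
+ argument 0 means: leave COWed pages alone */
+ reiser4_invalidate_pages(inode->i_mapping, from, count, 0);
+}
+#endif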
25095+
25096+/*
25097+ * Local variables:
25098+ * c-indentation-style: "K&R"
25099+ * mode-name: "LC"
25100+ * c-basic-offset: 8
25101+ * tab-width: 8
25102+ * fill-column: 120
25103+ * scroll-step: 1
25104+ * End:
25105+ */
25106diff -urN linux-2.6.24.orig/fs/reiser4/page_cache.h linux-2.6.24/fs/reiser4/page_cache.h
25107--- linux-2.6.24.orig/fs/reiser4/page_cache.h 1970-01-01 03:00:00.000000000 +0300
25108+++ linux-2.6.24/fs/reiser4/page_cache.h 2008-01-25 11:39:06.952211810 +0300
25109@@ -0,0 +1,68 @@
25110+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25111+ * reiser4/README */
25112+/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
25113+
25114+#if !defined( __REISER4_PAGE_CACHE_H__ )
25115+#define __REISER4_PAGE_CACHE_H__
25116+
25117+#include "forward.h"
25118+#include "context.h" /* for reiser4_ctx_gfp_mask_get() */
25119+
25120+#include <linux/fs.h> /* for struct super_block, address_space */
25121+#include <linux/mm.h> /* for struct page */
25122+#include <linux/pagemap.h> /* for lock_page() */
25123+#include <linux/vmalloc.h> /* for __vmalloc() */
25124+
25125+extern int reiser4_init_formatted_fake(struct super_block *);
25126+extern void reiser4_done_formatted_fake(struct super_block *);
25127+
25128+extern reiser4_tree *reiser4_tree_by_page(const struct page *);
25129+
25130+extern int reiser4_set_page_dirty_internal(struct page *);
25131+
25132+#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25133+
25134+extern void reiser4_wait_page_writeback(struct page *);
25135+static inline void lock_and_wait_page_writeback(struct page *page)
25136+{
25137+ lock_page(page);
25138+ if (unlikely(PageWriteback(page)))
25139+ reiser4_wait_page_writeback(page);
25140+}
25141+
25142+#define jprivate(page) ((jnode *)page_private(page))
25143+
25144+extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
25145+extern void reiser4_drop_page(struct page *);
25146+extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25147+ unsigned long count, int even_cows);
25148+extern void capture_reiser4_inodes(struct super_block *,
25149+ struct writeback_control *);
25150+static inline void * reiser4_vmalloc (unsigned long size)
25151+{
25152+ return __vmalloc(size,
25153+ reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM,
25154+ PAGE_KERNEL);
25155+}
25156+
25157+#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25158+
25159+#if REISER4_DEBUG
25160+extern void print_page(const char *prefix, struct page *page);
25161+#else
25162+#define print_page(prf, p) noop
25163+#endif
25164+
25165+/* __REISER4_PAGE_CACHE_H__ */
25166+#endif
25167+
25168+/* Make Linus happy.
25169+ Local variables:
25170+ c-indentation-style: "K&R"
25171+ mode-name: "LC"
25172+ c-basic-offset: 8
25173+ tab-width: 8
25174+ fill-column: 120
25175+ scroll-step: 1
25176+ End:
25177+*/
25178diff -urN linux-2.6.24.orig/fs/reiser4/plugin/cluster.c linux-2.6.24/fs/reiser4/plugin/cluster.c
25179--- linux-2.6.24.orig/fs/reiser4/plugin/cluster.c 1970-01-01 03:00:00.000000000 +0300
25180+++ linux-2.6.24/fs/reiser4/plugin/cluster.c 2008-01-25 11:39:06.952211810 +0300
25181@@ -0,0 +1,71 @@
25182+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25183+
25184+/* Contains reiser4 cluster plugins (see
25185+ http://www.namesys.com/cryptcompress_design.html
25186+ "Concepts of clustering" for details). */
25187+
25188+#include "plugin_header.h"
25189+#include "plugin.h"
25190+#include "../inode.h"
25191+
25192+static int change_cluster(struct inode *inode,
25193+ reiser4_plugin * plugin,
25194+ pset_member memb)
25195+{
25196+ assert("edward-1324", inode != NULL);
25197+ assert("edward-1325", plugin != NULL);
25198+ assert("edward-1326", is_reiser4_inode(inode));
25199+ assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25200+
+ /* Can't change the cluster plugin of an already existing regular file. */
+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
+ return RETERR(-EINVAL);
+
+ /* If the new plugin matches the current one, nothing to change. */
+ if (inode_cluster_plugin(inode) != NULL &&
+ inode_cluster_plugin(inode)->h.id == plugin->h.id)
+ return 0;
25209+
25210+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25211+ PSET_CLUSTER, plugin);
25212+}
25213+
25214+static reiser4_plugin_ops cluster_plugin_ops = {
25215+ .init = NULL,
25216+ .load = NULL,
25217+ .save_len = NULL,
25218+ .save = NULL,
25219+ .change = &change_cluster
25220+};
25221+
25222+#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \
25223+ [CLUSTER_ ## ID ## _ID] = { \
25224+ .h = { \
25225+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25226+ .id = CLUSTER_ ## ID ## _ID, \
25227+ .pops = &cluster_plugin_ops, \
25228+ .label = LABEL, \
25229+ .desc = DESC, \
25230+ .linkage = {NULL, NULL} \
25231+ }, \
25232+ .shift = SHIFT \
25233+ }
25234+
25235+cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25236+ SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25237+ SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25238+ SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25239+ SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25240+ SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25241+};
25242+
25243+/*
25244+ Local variables:
25245+ c-indentation-style: "K&R"
25246+ mode-name: "LC"
25247+ c-basic-offset: 8
25248+ tab-width: 8
25249+ fill-column: 120
25250+ scroll-step: 1
25251+ End:
25252+*/
25253diff -urN linux-2.6.24.orig/fs/reiser4/plugin/cluster.h linux-2.6.24/fs/reiser4/plugin/cluster.h
25254--- linux-2.6.24.orig/fs/reiser4/plugin/cluster.h 1970-01-01 03:00:00.000000000 +0300
25255+++ linux-2.6.24/fs/reiser4/plugin/cluster.h 2008-01-25 11:39:06.956212841 +0300
25256@@ -0,0 +1,409 @@
25257+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25258+
25259+/* This file contains size/offset translators, modulators
25260+ and other helper functions. */
25261+
25262+#if !defined( __FS_REISER4_CLUSTER_H__ )
25263+#define __FS_REISER4_CLUSTER_H__
25264+
25265+#include "../inode.h"
25266+
25267+static inline int inode_cluster_shift(struct inode *inode)
25268+{
25269+ assert("edward-92", inode != NULL);
25270+ assert("edward-93", reiser4_inode_data(inode) != NULL);
25271+
25272+ return inode_cluster_plugin(inode)->shift;
25273+}
25274+
25275+static inline unsigned cluster_nrpages_shift(struct inode *inode)
25276+{
25277+ return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25278+}
25279+
25280+/* cluster size in page units */
25281+static inline unsigned cluster_nrpages(struct inode *inode)
25282+{
25283+ return 1U << cluster_nrpages_shift(inode);
25284+}
25285+
25286+static inline size_t inode_cluster_size(struct inode *inode)
25287+{
25288+ assert("edward-96", inode != NULL);
25289+
25290+ return 1U << inode_cluster_shift(inode);
25291+}
25292+
25293+static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25294+{
25295+ return idx >> cluster_nrpages_shift(inode);
25296+}
25297+
25298+static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25299+{
25300+ return idx << cluster_nrpages_shift(inode);
25301+}
25302+
25303+static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25304+{
25305+ return clust_to_pg(pg_to_clust(idx, inode), inode);
25306+}
25307+
25308+static inline pgoff_t off_to_pg(loff_t off)
25309+{
25310+ return (off >> PAGE_CACHE_SHIFT);
25311+}
25312+
25313+static inline loff_t pg_to_off(pgoff_t idx)
25314+{
25315+ return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25316+}
25317+
25318+static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25319+{
25320+ return off >> inode_cluster_shift(inode);
25321+}
25322+
25323+static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25324+{
25325+ return (loff_t) idx << inode_cluster_shift(inode);
25326+}
25327+
25328+static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25329+{
25330+ return clust_to_off(off_to_clust(off, inode), inode);
25331+}
25332+
25333+static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25334+{
25335+ return clust_to_pg(off_to_clust(off, inode), inode);
25336+}
25337+
25338+static inline unsigned off_to_pgoff(loff_t off)
25339+{
25340+ return off & (PAGE_CACHE_SIZE - 1);
25341+}
25342+
25343+static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25344+{
25345+ return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25346+}
25347+
25348+static inline pgoff_t offset_in_clust(struct page * page)
25349+{
25350+ assert("edward-1488", page != NULL);
25351+ assert("edward-1489", page->mapping != NULL);
25352+
25353+ return page_index(page) & ((cluster_nrpages(page->mapping->host)) - 1);
25354+}
25355+
25356+static inline int first_page_in_cluster(struct page * page)
25357+{
25358+ return offset_in_clust(page) == 0;
25359+}
25360+
25361+static inline int last_page_in_cluster(struct page * page)
25362+{
25363+ return offset_in_clust(page) ==
25364+ cluster_nrpages(page->mapping->host) - 1;
25365+}
25366+
25367+static inline unsigned
25368+pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25369+{
25370+ return off_to_cloff(pg_to_off(idx), inode);
25371+}
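+
+/* Worked example (illustrative): with 4K pages (PAGE_CACHE_SHIFT == 12)
+ and a 64K cluster plugin (shift == 16), cluster_nrpages_shift() == 4
+ and cluster_nrpages() == 16. Page index 37 then belongs to logical
+ cluster pg_to_clust(37) == 37 >> 4 == 2, whose first page is
+ clust_to_pg(2) == 2 << 4 == 32, so pg_to_clust_to_pg(37) == 32. */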
25372+
25373+/*********************** Size translators **************************/
25374+
+/* Translate a linear size.
+ * New units are (1 << @blkbits) times larger than the old ones.
+ * In other words, calculate the number of logical blocks occupied
+ * by @count elements.
+ */
25380+static inline unsigned long size_in_blocks(loff_t count, unsigned blkbits)
25381+{
25382+ return (count + (1UL << blkbits) - 1) >> blkbits;
25383+}
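+
+/* Example: size_in_blocks(5000, 12) == (5000 + 4095) >> 12 == 2, i.e.
+ 5000 bytes occupy two 4K blocks (a ceiling division). */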
25384+
25385+/* size in pages */
25386+static inline pgoff_t size_in_pages(loff_t size)
25387+{
25388+ return size_in_blocks(size, PAGE_CACHE_SHIFT);
25389+}
25390+
25391+/* size in logical clusters */
25392+static inline cloff_t size_in_lc(loff_t size, struct inode *inode)
25393+{
25394+ return size_in_blocks(size, inode_cluster_shift(inode));
25395+}
25396+
25397+/* size in pages to the size in page clusters */
25398+static inline cloff_t sp_to_spcl(pgoff_t size, struct inode *inode)
25399+{
25400+ return size_in_blocks(size, cluster_nrpages_shift(inode));
25401+}
25402+
25403+/*********************** Size modulators ***************************/
25404+
25405+/*
+ Modulate a linear size by the nominated block size and offset.
+
+ This is a "finite" function (zero almost everywhere): it returns the
+ height of the figure at position @pos when one tries to build a
+ rectangle of height (1 << @blkbits) and of total area @size.
25412+
25413+ ******
25414+ *******
25415+ *******
25416+ *******
25417+ ----------> pos
25418+*/
25419+static inline unsigned __mbb(loff_t size, unsigned long pos, int blkbits)
25420+{
25421+ unsigned end = size >> blkbits;
25422+ if (pos < end)
25423+ return 1U << blkbits;
25424+ if (unlikely(pos > end))
25425+ return 0;
25426+ return size & ~(~0ull << blkbits);
25427+}
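+
+/* Example: for size == 10000 and blkbits == 12 (4K blocks) end == 2, so
+ __mbb() returns 4096 for pos 0 and 1 (full blocks),
+ 10000 & 4095 == 1808 for pos == 2 (the partial tail) and 0 for
+ pos > 2. */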
25428+
25429+/* the same as above, but block size is page size */
25430+static inline unsigned __mbp(loff_t size, pgoff_t pos)
25431+{
25432+ return __mbb(size, pos, PAGE_CACHE_SHIFT);
25433+}
25434+
25435+/* number of file's bytes in the nominated logical cluster */
25436+static inline unsigned lbytes(cloff_t index, struct inode * inode)
25437+{
25438+ return __mbb(i_size_read(inode), index, inode_cluster_shift(inode));
25439+}
25440+
25441+/* number of file's bytes in the nominated page */
25442+static inline unsigned pbytes(pgoff_t index, struct inode * inode)
25443+{
25444+ return __mbp(i_size_read(inode), index);
25445+}
25446+
+/**
+ * number of pages occupied by @win->count bytes starting from
+ * @win->off in the logical cluster defined by @win. This is exactly
+ * the number of pages to be modified and dirtied in any cluster operation.
+ */
25452+static inline pgoff_t win_count_to_nrpages(struct reiser4_slide * win)
25453+{
25454+ return ((win->off + win->count +
25455+ (1UL << PAGE_CACHE_SHIFT) - 1) >> PAGE_CACHE_SHIFT) -
25456+ off_to_pg(win->off);
25457+}
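+
+/* Example: with 4K pages a window of @win->off == 3000 and
+ @win->count == 6000 covers bytes 3000..8999, i.e. pages 0..2, and
+ indeed ((3000 + 6000 + 4095) >> 12) - (3000 >> 12) == 3 - 0 == 3. */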
25458+
25459+/* return true, if logical cluster is not occupied by the file */
25460+static inline int new_logical_cluster(struct cluster_handle * clust,
25461+ struct inode *inode)
25462+{
25463+ return clust_to_off(clust->index, inode) >= i_size_read(inode);
25464+}
25465+
25466+/* return true, if pages @p1 and @p2 are of the same page cluster */
25467+static inline int same_page_cluster(struct page * p1, struct page * p2)
25468+{
25469+ assert("edward-1490", p1 != NULL);
25470+ assert("edward-1491", p2 != NULL);
25471+ assert("edward-1492", p1->mapping != NULL);
25472+ assert("edward-1493", p2->mapping != NULL);
25473+
25474+ return (pg_to_clust(page_index(p1), p1->mapping->host) ==
25475+ pg_to_clust(page_index(p2), p2->mapping->host));
25476+}
25477+
25478+static inline int cluster_is_complete(struct cluster_handle * clust,
25479+ struct inode * inode)
25480+{
25481+ return clust->tc.lsize == inode_cluster_size(inode);
25482+}
25483+
25484+static inline void reiser4_slide_init(struct reiser4_slide * win)
25485+{
25486+ assert("edward-1084", win != NULL);
25487+ memset(win, 0, sizeof *win);
25488+}
25489+
25490+static inline tfm_action
25491+cluster_get_tfm_act(struct tfm_cluster * tc)
25492+{
25493+ assert("edward-1356", tc != NULL);
25494+ return tc->act;
25495+}
25496+
25497+static inline void
25498+cluster_set_tfm_act(struct tfm_cluster * tc, tfm_action act)
25499+{
25500+ assert("edward-1356", tc != NULL);
25501+ tc->act = act;
25502+}
25503+
25504+static inline void cluster_init_act(struct cluster_handle * clust,
25505+ tfm_action act,
25506+ struct reiser4_slide * window)
25507+{
25508+ assert("edward-84", clust != NULL);
25509+ memset(clust, 0, sizeof *clust);
25510+ cluster_set_tfm_act(&clust->tc, act);
25511+ clust->dstat = INVAL_DISK_CLUSTER;
25512+ clust->win = window;
25513+}
25514+
25515+static inline void cluster_init_read(struct cluster_handle * clust,
25516+ struct reiser4_slide * window)
25517+{
25518+ cluster_init_act (clust, TFMA_READ, window);
25519+}
25520+
25521+static inline void cluster_init_write(struct cluster_handle * clust,
25522+ struct reiser4_slide * window)
25523+{
25524+ cluster_init_act (clust, TFMA_WRITE, window);
25525+}
25526+
25527+/* true if @p1 and @p2 are items of the same disk cluster */
25528+static inline int same_disk_cluster(const coord_t * p1, const coord_t * p2)
25529+{
25530+ /* drop this if you have other items to aggregate */
25531+ assert("edward-1494", item_id_by_coord(p1) == CTAIL_ID);
25532+
25533+ return item_plugin_by_coord(p1)->b.mergeable(p1, p2);
25534+}
25535+
25536+static inline int dclust_get_extension_dsize(hint_t * hint)
25537+{
25538+ return hint->ext_coord.extension.ctail.dsize;
25539+}
25540+
25541+static inline void dclust_set_extension_dsize(hint_t * hint, int dsize)
25542+{
25543+ hint->ext_coord.extension.ctail.dsize = dsize;
25544+}
25545+
25546+static inline int dclust_get_extension_shift(hint_t * hint)
25547+{
25548+ return hint->ext_coord.extension.ctail.shift;
25549+}
25550+
25551+static inline int dclust_get_extension_ncount(hint_t * hint)
25552+{
25553+ return hint->ext_coord.extension.ctail.ncount;
25554+}
25555+
25556+static inline void dclust_inc_extension_ncount(hint_t * hint)
25557+{
25558+ hint->ext_coord.extension.ctail.ncount ++;
25559+}
25560+
25561+static inline void dclust_init_extension(hint_t * hint)
25562+{
25563+ memset(&hint->ext_coord.extension.ctail, 0,
25564+ sizeof(hint->ext_coord.extension.ctail));
25565+}
25566+
25567+static inline int hint_is_unprepped_dclust(hint_t * hint)
25568+{
25569+ assert("edward-1451", hint_is_valid(hint));
25570+ return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT;
25571+}
25572+
25573+static inline void coord_set_between_clusters(coord_t * coord)
25574+{
25575+#if REISER4_DEBUG
25576+ int result;
25577+ result = zload(coord->node);
25578+ assert("edward-1296", !result);
25579+#endif
25580+ if (!coord_is_between_items(coord)) {
25581+ coord->between = AFTER_ITEM;
25582+ coord->unit_pos = 0;
25583+ }
25584+#if REISER4_DEBUG
25585+ zrelse(coord->node);
25586+#endif
25587+}
25588+
25589+int reiser4_inflate_cluster(struct cluster_handle *, struct inode *);
25590+int find_disk_cluster(struct cluster_handle *, struct inode *, int read,
25591+ znode_lock_mode mode);
25592+int checkout_logical_cluster(struct cluster_handle *, jnode *, struct inode *);
25593+int reiser4_deflate_cluster(struct cluster_handle *, struct inode *);
25594+void truncate_complete_page_cluster(struct inode *inode, cloff_t start,
25595+ int even_cows);
25596+void invalidate_hint_cluster(struct cluster_handle * clust);
25597+int get_disk_cluster_locked(struct cluster_handle * clust, struct inode * inode,
25598+ znode_lock_mode lock_mode);
25599+void reset_cluster_params(struct cluster_handle * clust);
25600+int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
25601+ int count);
25602+int prepare_page_cluster(struct inode *inode, struct cluster_handle * clust,
25603+ rw_op rw);
25604+void __put_page_cluster(int from, int count,
25605+ struct page ** pages, struct inode * inode);
25606+void put_page_cluster(struct cluster_handle * clust,
25607+ struct inode * inode, rw_op rw);
25608+void put_cluster_handle(struct cluster_handle * clust);
25609+int grab_tfm_stream(struct inode *inode, struct tfm_cluster * tc, tfm_stream_id id);
25610+int tfm_cluster_is_uptodate(struct tfm_cluster * tc);
25611+void tfm_cluster_set_uptodate(struct tfm_cluster * tc);
25612+void tfm_cluster_clr_uptodate(struct tfm_cluster * tc);
25613+
25614+/* move cluster handle to the target position
25615+ specified by the page of index @pgidx */
25616+static inline void move_cluster_forward(struct cluster_handle * clust,
25617+ struct inode *inode,
25618+ pgoff_t pgidx)
25619+{
25620+ assert("edward-1297", clust != NULL);
25621+ assert("edward-1298", inode != NULL);
25622+
25623+ reset_cluster_params(clust);
25624+ if (clust->index_valid &&
+ /* Hole in the indices: the hint became invalid and can not be
+ used by find_cluster_item() even if the seal/node versions
+ coincide */
25628+ pg_to_clust(pgidx, inode) != clust->index + 1) {
25629+ reiser4_unset_hint(clust->hint);
25630+ invalidate_hint_cluster(clust);
25631+ }
25632+ clust->index = pg_to_clust(pgidx, inode);
25633+ clust->index_valid = 1;
25634+}
25635+
25636+static inline int alloc_clust_pages(struct cluster_handle * clust,
25637+ struct inode *inode)
25638+{
25639+ assert("edward-791", clust != NULL);
25640+ assert("edward-792", inode != NULL);
25641+ clust->pages =
25642+ kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode),
25643+ reiser4_ctx_gfp_mask_get());
25644+ if (!clust->pages)
25645+ return -ENOMEM;
25646+ return 0;
25647+}
25648+
25649+static inline void free_clust_pages(struct cluster_handle * clust)
25650+{
25651+ kfree(clust->pages);
25652+}
25653+
25654+#endif /* __FS_REISER4_CLUSTER_H__ */
25655+
25656+/* Make Linus happy.
25657+ Local variables:
25658+ c-indentation-style: "K&R"
25659+ mode-name: "LC"
25660+ c-basic-offset: 8
25661+ tab-width: 8
25662+ fill-column: 120
25663+ scroll-step: 1
25664+ End:
25665+*/
25666diff -urN linux-2.6.24.orig/fs/reiser4/plugin/compress/compress.c linux-2.6.24/fs/reiser4/plugin/compress/compress.c
25667--- linux-2.6.24.orig/fs/reiser4/plugin/compress/compress.c 1970-01-01 03:00:00.000000000 +0300
25668+++ linux-2.6.24/fs/reiser4/plugin/compress/compress.c 2008-01-25 11:39:06.956212841 +0300
25669@@ -0,0 +1,367 @@
25670+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25671+/* reiser4 compression transform plugins */
25672+
25673+#include "../../debug.h"
25674+#include "../../inode.h"
25675+#include "../plugin.h"
25676+
25677+#include <linux/lzo.h>
25678+#include <linux/zlib.h>
25679+#include <linux/types.h>
25680+#include <linux/hardirq.h>
25681+
25682+static int change_compression(struct inode *inode,
25683+ reiser4_plugin * plugin,
25684+ pset_member memb)
25685+{
25686+ assert("edward-1316", inode != NULL);
25687+ assert("edward-1317", plugin != NULL);
25688+ assert("edward-1318", is_reiser4_inode(inode));
25689+ assert("edward-1319",
25690+ plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
25691+
+ /* cannot change the compression plugin of an already existing regular object */
+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
+ return RETERR(-EINVAL);
+
+ /* If the new plugin matches the current one, nothing to change. */
+ if (inode_compression_plugin(inode) != NULL &&
+ inode_compression_plugin(inode)->h.id == plugin->h.id)
+ return 0;
25700+
25701+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25702+ PSET_COMPRESSION, plugin);
25703+}
25704+
25705+static reiser4_plugin_ops compression_plugin_ops = {
25706+ .init = NULL,
25707+ .load = NULL,
25708+ .save_len = NULL,
25709+ .save = NULL,
25710+ .change = &change_compression
25711+};
25712+
25713+/******************************************************************************/
25714+/* gzip1 compression */
25715+/******************************************************************************/
25716+
25717+#define GZIP1_DEF_LEVEL Z_BEST_SPEED
25718+#define GZIP1_DEF_WINBITS 15
25719+#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL
25720+
25721+static int gzip1_init(void)
25722+{
25723+ int ret = -EINVAL;
25724+#if REISER4_ZLIB
25725+ ret = 0;
25726+#endif
25727+ if (ret == -EINVAL)
25728+ warning("edward-1337", "Zlib not compiled into kernel");
25729+ return ret;
25730+}
25731+
25732+static int gzip1_overrun(unsigned src_len UNUSED_ARG)
25733+{
25734+ return 0;
25735+}
25736+
25737+static coa_t gzip1_alloc(tfm_action act)
25738+{
25739+ coa_t coa = NULL;
25740+#if REISER4_ZLIB
25741+ int ret = 0;
25742+ switch (act) {
25743+ case TFMA_WRITE: /* compress */
25744+ coa = reiser4_vmalloc(zlib_deflate_workspacesize());
25745+ if (!coa) {
25746+ ret = -ENOMEM;
25747+ break;
25748+ }
25749+ break;
25750+ case TFMA_READ: /* decompress */
25751+ coa = reiser4_vmalloc(zlib_inflate_workspacesize());
25752+ if (!coa) {
25753+ ret = -ENOMEM;
25754+ break;
25755+ }
25756+ break;
25757+ default:
25758+ impossible("edward-767",
25759+ "trying to alloc workspace for unknown tfm action");
25760+ }
25761+ if (ret) {
25762+ warning("edward-768",
25763+ "alloc workspace for gzip1 (tfm action = %d) failed\n",
25764+ act);
25765+ return ERR_PTR(ret);
25766+ }
25767+#endif
25768+ return coa;
25769+}
25770+
25771+static void gzip1_free(coa_t coa, tfm_action act)
25772+{
25773+ assert("edward-769", coa != NULL);
25774+
25775+ switch (act) {
25776+ case TFMA_WRITE: /* compress */
25777+ vfree(coa);
25778+ break;
25779+ case TFMA_READ: /* decompress */
25780+ vfree(coa);
25781+ break;
25782+ default:
25783+ impossible("edward-770", "unknown tfm action");
25784+ }
25785+ return;
25786+}
25787+
25788+static int gzip1_min_size_deflate(void)
25789+{
25790+ return 64;
25791+}
25792+
25793+static void
25794+gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25795+ __u8 * dst_first, unsigned *dst_len)
25796+{
25797+#if REISER4_ZLIB
25798+ int ret = 0;
25799+ struct z_stream_s stream;
25800+
25801+ assert("edward-842", coa != NULL);
25802+ assert("edward-875", src_len != 0);
25803+
25804+ stream.workspace = coa;
25805+ ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
25806+ -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
25807+ Z_DEFAULT_STRATEGY);
25808+ if (ret != Z_OK) {
25809+ warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
25810+ goto rollback;
25811+ }
25812+ ret = zlib_deflateReset(&stream);
25813+ if (ret != Z_OK) {
25814+ warning("edward-772", "zlib_deflateReset returned %d\n", ret);
25815+ goto rollback;
25816+ }
25817+ stream.next_in = src_first;
25818+ stream.avail_in = src_len;
25819+ stream.next_out = dst_first;
25820+ stream.avail_out = *dst_len;
25821+
25822+ ret = zlib_deflate(&stream, Z_FINISH);
25823+ if (ret != Z_STREAM_END) {
25824+ if (ret != Z_OK)
25825+ warning("edward-773",
25826+ "zlib_deflate returned %d\n", ret);
25827+ goto rollback;
25828+ }
25829+ *dst_len = stream.total_out;
25830+ return;
25831+ rollback:
25832+ *dst_len = src_len;
25833+#endif
25834+ return;
25835+}
25836+
25837+static void
25838+gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25839+ __u8 * dst_first, unsigned *dst_len)
25840+{
25841+#if REISER4_ZLIB
25842+ int ret = 0;
25843+ struct z_stream_s stream;
25844+
25845+ assert("edward-843", coa != NULL);
25846+ assert("edward-876", src_len != 0);
25847+
25848+ stream.workspace = coa;
25849+ ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
25850+ if (ret != Z_OK) {
25851+ warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
25852+ return;
25853+ }
25854+ ret = zlib_inflateReset(&stream);
25855+ if (ret != Z_OK) {
25856+ warning("edward-775", "zlib_inflateReset returned %d\n", ret);
25857+ return;
25858+ }
25859+
25860+ stream.next_in = src_first;
25861+ stream.avail_in = src_len;
25862+ stream.next_out = dst_first;
25863+ stream.avail_out = *dst_len;
25864+
25865+ ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
25866+ /*
25867+ * Work around a bug in zlib, which sometimes wants to taste an extra
25868+ * byte when being used in the (undocumented) raw deflate mode.
25869+ * (From USAGI).
25870+ */
25871+ if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
25872+ u8 zerostuff = 0;
25873+ stream.next_in = &zerostuff;
25874+ stream.avail_in = 1;
25875+ ret = zlib_inflate(&stream, Z_FINISH);
25876+ }
25877+ if (ret != Z_STREAM_END) {
25878+ warning("edward-776", "zlib_inflate returned %d\n", ret);
25879+ return;
25880+ }
25881+ *dst_len = stream.total_out;
25882+#endif
25883+ return;
25884+}
25885+
25886+/******************************************************************************/
25887+/* lzo1 compression */
25888+/******************************************************************************/
25889+
25890+static int lzo1_init(void)
25891+{
25892+ return 0;
25893+}
25894+
25895+static int lzo1_overrun(unsigned in_len)
25896+{
25897+ return in_len / 64 + 16 + 3;
25898+}
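+
+/* e.g. for a 4096-byte input this reserves 4096/64 + 16 + 3 == 83 extra
+ bytes in the destination buffer for possible expansion */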
25899+
25900+static coa_t lzo1_alloc(tfm_action act)
25901+{
25902+ int ret = 0;
25903+ coa_t coa = NULL;
25904+
25905+ switch (act) {
25906+ case TFMA_WRITE: /* compress */
25907+ coa = reiser4_vmalloc(LZO1X_1_MEM_COMPRESS);
25908+ if (!coa) {
25909+ ret = -ENOMEM;
25910+ break;
+ }
+ /* fall through: decompression needs no workspace */
+ case TFMA_READ: /* decompress */
25913+ break;
25914+ default:
25915+ impossible("edward-877",
25916+ "trying to alloc workspace for unknown tfm action");
25917+ }
25918+ if (ret) {
25919+ warning("edward-878",
25920+ "alloc workspace for lzo1 (tfm action = %d) failed\n",
25921+ act);
25922+ return ERR_PTR(ret);
25923+ }
25924+ return coa;
25925+}
25926+
25927+static void lzo1_free(coa_t coa, tfm_action act)
25928+{
25929+ assert("edward-879", coa != NULL);
25930+
25931+ switch (act) {
25932+ case TFMA_WRITE: /* compress */
25933+ vfree(coa);
25934+ break;
+ case TFMA_READ: /* decompress */
+ impossible("edward-1304",
+ "trying to free non-allocated workspace");
+ break;
+ default:
25939+ impossible("edward-880", "unknown tfm action");
25940+ }
25941+ return;
25942+}
25943+
25944+static int lzo1_min_size_deflate(void)
25945+{
25946+ return 256;
25947+}
25948+
+static void
+lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
+ __u8 * dst_first, unsigned *dst_len)
+{
+ int result;
+ size_t out_len = *dst_len;
+
+ assert("edward-846", coa != NULL);
+ assert("edward-847", src_len != 0);
+
+ /* lzo1x_1_compress() takes a size_t * output length */
+ result = lzo1x_1_compress(src_first, src_len, dst_first, &out_len, coa);
+ if (unlikely(result != LZO_E_OK)) {
+ warning("edward-849", "lzo1x_1_compress failed\n");
+ goto out;
+ }
+ if (out_len >= src_len) {
+ /* incompressible data */
+ goto out;
+ }
+ *dst_len = out_len;
+ return;
+ out:
+ *dst_len = src_len;
+ return;
+}
25972+
+static void
+lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
+ __u8 * dst_first, unsigned *dst_len)
+{
+ int result;
+ size_t out_len = *dst_len;
+
+ assert("edward-851", coa == NULL);
+ assert("edward-852", src_len != 0);
+
+ result = lzo1x_decompress_safe(src_first, src_len, dst_first, &out_len);
+ if (result != LZO_E_OK)
+ warning("edward-853", "lzo1x_decompress_safe failed\n");
+ else
+ *dst_len = out_len;
+ return;
+}
25987+
25988+compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
25989+ [LZO1_COMPRESSION_ID] = {
25990+ .h = {
25991+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25992+ .id = LZO1_COMPRESSION_ID,
25993+ .pops = &compression_plugin_ops,
25994+ .label = "lzo1",
25995+ .desc = "lzo1 compression transform",
25996+ .linkage = {NULL, NULL}
25997+ },
25998+ .init = lzo1_init,
25999+ .overrun = lzo1_overrun,
26000+ .alloc = lzo1_alloc,
26001+ .free = lzo1_free,
26002+ .min_size_deflate = lzo1_min_size_deflate,
26003+ .checksum = reiser4_adler32,
26004+ .compress = lzo1_compress,
26005+ .decompress = lzo1_decompress
26006+ },
26007+ [GZIP1_COMPRESSION_ID] = {
26008+ .h = {
26009+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26010+ .id = GZIP1_COMPRESSION_ID,
26011+ .pops = &compression_plugin_ops,
26012+ .label = "gzip1",
26013+ .desc = "gzip1 compression transform",
26014+ .linkage = {NULL, NULL}
26015+ },
26016+ .init = gzip1_init,
26017+ .overrun = gzip1_overrun,
26018+ .alloc = gzip1_alloc,
26019+ .free = gzip1_free,
26020+ .min_size_deflate = gzip1_min_size_deflate,
26021+ .checksum = reiser4_adler32,
26022+ .compress = gzip1_compress,
26023+ .decompress = gzip1_decompress
26024+ }
26025+};
26026+
26027+/*
26028+ Local variables:
26029+ c-indentation-style: "K&R"
26030+ mode-name: "LC"
26031+ c-basic-offset: 8
26032+ tab-width: 8
26033+ fill-column: 120
26034+ scroll-step: 1
26035+ End:
26036+*/
26037diff -urN linux-2.6.24.orig/fs/reiser4/plugin/compress/compress.h linux-2.6.24/fs/reiser4/plugin/compress/compress.h
26038--- linux-2.6.24.orig/fs/reiser4/plugin/compress/compress.h 1970-01-01 03:00:00.000000000 +0300
26039+++ linux-2.6.24/fs/reiser4/plugin/compress/compress.h 2008-01-25 11:39:06.956212841 +0300
26040@@ -0,0 +1,43 @@
26041+#if !defined( __FS_REISER4_COMPRESS_H__ )
26042+#define __FS_REISER4_COMPRESS_H__
26043+
26044+#include <linux/types.h>
26045+#include <linux/string.h>
26046+
26047+/* transform direction */
26048+typedef enum {
26049+ TFMA_READ, /* decrypt, decompress */
26050+ TFMA_WRITE, /* encrypt, compress */
26051+ TFMA_LAST
26052+} tfm_action;
26053+
26054+/* supported compression algorithms */
26055+typedef enum {
26056+ LZO1_COMPRESSION_ID,
26057+ GZIP1_COMPRESSION_ID,
26058+ LAST_COMPRESSION_ID,
26059+} reiser4_compression_id;
26060+
+/* the same as pgoff_t, but units are page clusters */
26062+typedef unsigned long cloff_t;
26063+
26064+/* working data of a (de)compression algorithm */
26065+typedef void *coa_t;
26066+
26067+/* table for all supported (de)compression algorithms */
26068+typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST];
26069+
26070+__u32 reiser4_adler32(char *data, __u32 len);
26071+
26072+#endif /* __FS_REISER4_COMPRESS_H__ */
26073+
26074+/* Make Linus happy.
26075+ Local variables:
26076+ c-indentation-style: "K&R"
26077+ mode-name: "LC"
26078+ c-basic-offset: 8
26079+ tab-width: 8
26080+ fill-column: 120
26081+ scroll-step: 1
26082+ End:
26083+*/
26084diff -urN linux-2.6.24.orig/fs/reiser4/plugin/compress/compress_mode.c linux-2.6.24/fs/reiser4/plugin/compress/compress_mode.c
26085--- linux-2.6.24.orig/fs/reiser4/plugin/compress/compress_mode.c 1970-01-01 03:00:00.000000000 +0300
26086+++ linux-2.6.24/fs/reiser4/plugin/compress/compress_mode.c 2008-01-25 11:39:06.956212841 +0300
26087@@ -0,0 +1,162 @@
26088+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+/* This file contains Reiser4 compression mode plugins.
+
+ A compression mode plugin is a set of handlers called by the compressor
+ at flush time. They represent various heuristics, including ones meant
+ to avoid compressing incompressible data; see
+ http://www.namesys.com/cryptcompress_design.html for more details.
+*/
26096+#include "../../inode.h"
26097+#include "../plugin.h"
26098+
26099+static int should_deflate_none(struct inode * inode, cloff_t index)
26100+{
26101+ return 0;
26102+}
26103+
26104+static int should_deflate_common(struct inode * inode, cloff_t index)
26105+{
26106+ return compression_is_on(cryptcompress_inode_data(inode));
26107+}
26108+
26109+static int discard_hook_ultim(struct inode *inode, cloff_t index)
26110+{
26111+ turn_off_compression(cryptcompress_inode_data(inode));
26112+ return 0;
26113+}
26114+
26115+static int discard_hook_lattd(struct inode *inode, cloff_t index)
26116+{
26117+ struct cryptcompress_info * info = cryptcompress_inode_data(inode);
26118+
26119+ assert("edward-1462",
26120+ get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
26121+ get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
26122+
26123+ turn_off_compression(info);
26124+ if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
26125+ set_lattice_factor(info, get_lattice_factor(info) << 1);
26126+ return 0;
26127+}
26128+
26129+static int accept_hook_lattd(struct inode *inode, cloff_t index)
26130+{
26131+ turn_on_compression(cryptcompress_inode_data(inode));
26132+ set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR);
26133+ return 0;
26134+}
26135+
+/* "Check on dynamic lattice" is the adaptive compression mode which
+ defines the following behavior:
+
+ Compression is on: try to compress everything, and turn it
+ off as soon as a cluster turns out to be incompressible.
+
+ Compression is off: try to compress clusters with indexes
+ k * FACTOR (k = 0, 1, 2, ...) and turn it back on if some of
+ them are compressible; if they are incompressible, increase FACTOR */
+
+/* check if @index belongs to the one-dimensional lattice
+ of sparse factor @factor */
26148+static int is_on_lattice(cloff_t index, int factor)
26149+{
26150+ return (factor ? index % factor == 0: index == 0);
26151+}
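+
+/* Example: with factor == 4 the lattice is {0, 4, 8, 12, ...}; with
+ factor == 0 it degenerates to {0} alone. */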
26152+
26153+static int should_deflate_lattd(struct inode * inode, cloff_t index)
26154+{
26155+ return should_deflate_common(inode, index) ||
26156+ is_on_lattice(index,
26157+ get_lattice_factor
26158+ (cryptcompress_inode_data(inode)));
26159+}
26160+
26161+/* compression mode_plugins */
26162+compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26163+ [NONE_COMPRESSION_MODE_ID] = {
26164+ .h = {
26165+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26166+ .id = NONE_COMPRESSION_MODE_ID,
26167+ .pops = NULL,
26168+ .label = "none",
26169+ .desc = "Compress nothing",
26170+ .linkage = {NULL, NULL}
26171+ },
26172+ .should_deflate = should_deflate_none,
26173+ .accept_hook = NULL,
26174+ .discard_hook = NULL
26175+ },
26176+ /* Check-on-dynamic-lattice adaptive compression mode */
26177+ [LATTD_COMPRESSION_MODE_ID] = {
26178+ .h = {
26179+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26180+ .id = LATTD_COMPRESSION_MODE_ID,
26181+ .pops = NULL,
26182+ .label = "lattd",
26183+ .desc = "Check on dynamic lattice",
26184+ .linkage = {NULL, NULL}
26185+ },
26186+ .should_deflate = should_deflate_lattd,
26187+ .accept_hook = accept_hook_lattd,
26188+ .discard_hook = discard_hook_lattd
26189+ },
26190+ /* Check-ultimately compression mode:
26191+ Turn off compression forever as soon as we meet
26192+ incompressible data */
26193+ [ULTIM_COMPRESSION_MODE_ID] = {
26194+ .h = {
26195+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26196+ .id = ULTIM_COMPRESSION_MODE_ID,
26197+ .pops = NULL,
26198+ .label = "ultim",
26199+ .desc = "Check ultimately",
26200+ .linkage = {NULL, NULL}
26201+ },
26202+ .should_deflate = should_deflate_common,
26203+ .accept_hook = NULL,
26204+ .discard_hook = discard_hook_ultim
26205+ },
26206+ /* Force-to-compress-everything compression mode */
26207+ [FORCE_COMPRESSION_MODE_ID] = {
26208+ .h = {
26209+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26210+ .id = FORCE_COMPRESSION_MODE_ID,
26211+ .pops = NULL,
26212+ .label = "force",
26213+ .desc = "Force to compress everything",
26214+ .linkage = {NULL, NULL}
26215+ },
26216+ .should_deflate = NULL,
26217+ .accept_hook = NULL,
26218+ .discard_hook = NULL
26219+ },
26220+ /* Convert-to-extent compression mode.
26221+ In this mode items will be converted to extents and management
26222+ will be passed to (classic) unix file plugin as soon as ->write()
26223+ detects that the first complete logical cluster (of index #0) is
26224+ incompressible. */
26225+ [CONVX_COMPRESSION_MODE_ID] = {
26226+ .h = {
26227+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26228+ .id = CONVX_COMPRESSION_MODE_ID,
26229+ .pops = NULL,
26230+ .label = "conv",
26231+ .desc = "Convert to extent",
26232+ .linkage = {NULL, NULL}
26233+ },
26234+ .should_deflate = should_deflate_common,
26235+ .accept_hook = NULL,
26236+ .discard_hook = NULL
26237+ }
26238+};
26239+
26240+/*
26241+ Local variables:
26242+ c-indentation-style: "K&R"
26243+ mode-name: "LC"
26244+ c-basic-offset: 8
26245+ tab-width: 8
26246+ fill-column: 120
26247+ scroll-step: 1
26248+ End:
26249+*/
26250diff -urN linux-2.6.24.orig/fs/reiser4/plugin/compress/Makefile linux-2.6.24/fs/reiser4/plugin/compress/Makefile
26251--- linux-2.6.24.orig/fs/reiser4/plugin/compress/Makefile 1970-01-01 03:00:00.000000000 +0300
26252+++ linux-2.6.24/fs/reiser4/plugin/compress/Makefile 2008-01-25 11:39:06.956212841 +0300
26253@@ -0,0 +1,5 @@
26254+obj-$(CONFIG_REISER4_FS) += compress_plugins.o
26255+
26256+compress_plugins-objs := \
26257+ compress.o \
26258+ compress_mode.o
26259diff -urN linux-2.6.24.orig/fs/reiser4/plugin/crypto/cipher.c linux-2.6.24/fs/reiser4/plugin/crypto/cipher.c
26260--- linux-2.6.24.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 03:00:00.000000000 +0300
26261+++ linux-2.6.24/fs/reiser4/plugin/crypto/cipher.c 2008-01-25 11:39:06.956212841 +0300
26262@@ -0,0 +1,37 @@
26263+/* Copyright 2001, 2002, 2003 by Hans Reiser,
26264+ licensing governed by reiser4/README */
26265+/* Reiser4 cipher transform plugins */
26266+
26267+#include "../../debug.h"
26268+#include "../plugin.h"
26269+
26270+cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
26271+ [NONE_CIPHER_ID] = {
26272+ .h = {
26273+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
26274+ .id = NONE_CIPHER_ID,
26275+ .pops = NULL,
26276+ .label = "none",
26277+ .desc = "no cipher transform",
26278+ .linkage = {NULL, NULL}
26279+ },
26280+ .alloc = NULL,
26281+ .free = NULL,
26282+ .scale = NULL,
26283+ .align_stream = NULL,
26284+ .setkey = NULL,
26285+ .encrypt = NULL,
26286+ .decrypt = NULL
26287+ }
26288+};
26289+
26290+/* Make Linus happy.
26291+ Local variables:
26292+ c-indentation-style: "K&R"
26293+ mode-name: "LC"
26294+ c-basic-offset: 8
26295+ tab-width: 8
26296+ fill-column: 120
26297+ scroll-step: 1
26298+ End:
26299+*/
26300diff -urN linux-2.6.24.orig/fs/reiser4/plugin/crypto/cipher.h linux-2.6.24/fs/reiser4/plugin/crypto/cipher.h
26301--- linux-2.6.24.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 03:00:00.000000000 +0300
26302+++ linux-2.6.24/fs/reiser4/plugin/crypto/cipher.h 2008-01-25 11:39:06.956212841 +0300
26303@@ -0,0 +1,55 @@
26304+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+/* This file contains definitions for the objects operated on
+ by the reiser4 key manager, which is something like a keyring
+ wrapped by an appropriate reiser4 plugin */
26308+
26309+#if !defined( __FS_REISER4_CRYPT_H__ )
26310+#define __FS_REISER4_CRYPT_H__
26311+
26312+#include <linux/crypto.h>
26313+
26314+/* key info imported from user space */
26315+struct reiser4_crypto_data {
26316+ int keysize; /* uninstantiated key size */
26317+ __u8 * key; /* uninstantiated key */
26318+ int keyid_size; /* size of passphrase */
26319+ __u8 * keyid; /* passphrase */
26320+};
26321+
+/* This object contains all the infrastructure needed to implement a
+ cipher transform. It is operated on (allocated, inherited, validated,
+ bound to a host inode, etc.) by the reiser4 key manager.
+
+ This info can be allocated in two cases:
+ 1. when importing a key from user space;
+ 2. when reading an inode from disk */
26329+struct reiser4_crypto_info {
26330+ struct inode * host;
26331+ struct crypto_hash * digest;
26332+ struct crypto_blkcipher * cipher;
26333+#if 0
26334+ cipher_key_plugin * kplug; /* key manager */
26335+#endif
+ __u8 * keyid; /* key fingerprint, created by the digest plugin
+ from the uninstantiated key and passphrase;
+ supposed to be stored in the disk stat-data */
+ int inst; /* indicates whether the cipher key is
+ instantiated (case 1 above) */
+ int keysize; /* uninstantiated key size (bytes), supposed
+ to be stored in the disk stat-data */
+ int keyload_count; /* number of objects which have this
+ crypto-stat attached */
26345+};
26346+
26347+#endif /* __FS_REISER4_CRYPT_H__ */
26348+
26349+/*
26350+ Local variables:
26351+ c-indentation-style: "K&R"
26352+ mode-name: "LC"
26353+ c-basic-offset: 8
26354+ tab-width: 8
26355+ fill-column: 120
26356+ scroll-step: 1
26357+ End:
26358+*/
26359diff -urN linux-2.6.24.orig/fs/reiser4/plugin/crypto/digest.c linux-2.6.24/fs/reiser4/plugin/crypto/digest.c
26360--- linux-2.6.24.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 03:00:00.000000000 +0300
26361+++ linux-2.6.24/fs/reiser4/plugin/crypto/digest.c 2008-01-25 11:39:06.956212841 +0300
26362@@ -0,0 +1,58 @@
26363+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26364+
+/* reiser4 digest transform plugin (used by the cryptcompress object plugin) */
26366+/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
26367+#include "../../debug.h"
26368+#include "../plugin_header.h"
26369+#include "../plugin.h"
26370+#include "../file/cryptcompress.h"
26371+
26372+#include <linux/types.h>
26373+
26374+extern digest_plugin digest_plugins[LAST_DIGEST_ID];
26375+
26376+static struct crypto_hash * alloc_sha256 (void)
26377+{
26378+#if REISER4_SHA256
26379+ return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC);
26380+#else
26381+ warning("edward-1418", "sha256 unsupported");
26382+ return ERR_PTR(-EINVAL);
26383+#endif
26384+}
26385+
26386+static void free_sha256 (struct crypto_hash * tfm)
26387+{
26388+#if REISER4_SHA256
26389+ crypto_free_hash(tfm);
26390+#endif
26391+ return;
26392+}
26393+
26394+/* digest plugins */
26395+digest_plugin digest_plugins[LAST_DIGEST_ID] = {
26396+ [SHA256_32_DIGEST_ID] = {
26397+ .h = {
26398+ .type_id = REISER4_DIGEST_PLUGIN_TYPE,
26399+ .id = SHA256_32_DIGEST_ID,
26400+ .pops = NULL,
26401+ .label = "sha256_32",
26402+ .desc = "sha256_32 digest transform",
26403+ .linkage = {NULL, NULL}
26404+ },
26405+ .fipsize = sizeof(__u32),
26406+ .alloc = alloc_sha256,
26407+ .free = free_sha256
26408+ }
26409+};
26410+
26411+/*
26412+ Local variables:
26413+ c-indentation-style: "K&R"
26414+ mode-name: "LC"
26415+ c-basic-offset: 8
26416+ tab-width: 8
26417+ fill-column: 120
26418+ scroll-step: 1
26419+ End:
26420+*/
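
The key fingerprint described in crypt.h can be produced by pushing data
through this transform and keeping only the first fipsize bytes. A sketch
of such a caller (the function name is hypothetical):

	/* Hypothetical caller: digest @len bytes at @buf and keep the
	 * first fipsize bytes (4 for sha256_32) as the fingerprint @fip. */
	static int build_fingerprint(digest_plugin *dplug, const __u8 *buf,
				     unsigned int len, __u32 *fip)
	{
		__u8 out[32];	/* sha256 emits 32 bytes */
		struct scatterlist sg;
		struct hash_desc desc;
		int ret;

		desc.tfm = dplug->alloc();
		if (IS_ERR(desc.tfm))
			return PTR_ERR(desc.tfm);
		desc.flags = 0;
		sg_init_one(&sg, buf, len);
		ret = crypto_hash_digest(&desc, &sg, len, out);
		if (ret == 0)
			memcpy(fip, out, dplug->fipsize);
		dplug->free(desc.tfm);
		return ret;
	}
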
26421diff -urN linux-2.6.24.orig/fs/reiser4/plugin/dir/dir.h linux-2.6.24/fs/reiser4/plugin/dir/dir.h
26422--- linux-2.6.24.orig/fs/reiser4/plugin/dir/dir.h 1970-01-01 03:00:00.000000000 +0300
26423+++ linux-2.6.24/fs/reiser4/plugin/dir/dir.h 2008-01-25 11:39:06.960213871 +0300
26424@@ -0,0 +1,36 @@
26425+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
26426+ * reiser4/README */
26427+
26428+/* this file contains declarations of methods implementing directory plugins */
26429+
26430+#if !defined( __REISER4_DIR_H__ )
26431+#define __REISER4_DIR_H__
26432+
26433+/*#include "../../key.h"
26434+
26435+#include <linux/fs.h>*/
26436+
26437+/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
26438+
26439+/* "hashed" directory methods of dir plugin */
26440+void build_entry_key_hashed(const struct inode *, const struct qstr *,
26441+ reiser4_key *);
26442+
26443+/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
26444+
26445+/* "seekable" directory methods of dir plugin */
26446+void build_entry_key_seekable(const struct inode *, const struct qstr *,
26447+ reiser4_key *);
26448+
26449+/* __REISER4_DIR_H__ */
26450+#endif
26451+
26452+/*
26453+ Local variables:
26454+ c-indentation-style: "K&R"
26455+ mode-name: "LC"
26456+ c-basic-offset: 8
26457+ tab-width: 8
26458+ fill-column: 120
26459+ End:
26460+*/
26461diff -urN linux-2.6.24.orig/fs/reiser4/plugin/dir/hashed_dir.c linux-2.6.24/fs/reiser4/plugin/dir/hashed_dir.c
26462--- linux-2.6.24.orig/fs/reiser4/plugin/dir/hashed_dir.c 1970-01-01 03:00:00.000000000 +0300
26463+++ linux-2.6.24/fs/reiser4/plugin/dir/hashed_dir.c 2008-01-25 11:39:06.960213871 +0300
26464@@ -0,0 +1,81 @@
26465+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
26466+ * reiser4/README */
26467+
26468+/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
26469+ names to the files. */
26470+
26471+/*
26472+ * Hashed directory logically consists of persistent directory
26473+ * entries. Directory entry is a pair of a file name and a key of stat-data of
26474+ * a file that has this name in the given directory.
26475+ *
26476+ * Directory entries are stored in the tree in the form of directory
26477+ * items. Directory item should implement dir_entry_ops portion of item plugin
26478+ * interface (see plugin/item/item.h). Hashed directory interacts with
26479+ * directory item plugin exclusively through dir_entry_ops operations.
26480+ *
26481+ * Currently there are two implementations of directory items: "simple
26482+ * directory item" (plugin/item/sde.[ch]), and "compound directory item"
26483+ * (plugin/item/cde.[ch]) with the latter being the default.
26484+ *
26485+ * There is, however, one delicate way in which directory code interacts
26486+ * with the item plugin: key assignment policy. A key for a directory item is
26487+ * chosen by directory code, and as described in kassign.c, this key contains
26488+ * a portion of file name. Directory item uses this knowledge to avoid storing
26489+ * this portion of file name twice: in the key and in the directory item body.
26490+ *
26491+ */
26492+
26493+#include "../../inode.h"
26494+
26495+void complete_entry_key(const struct inode *, const char *name,
26496+ int len, reiser4_key * result);
26497+
26498+/* this is implementation of build_entry_key method of dir
26499+ plugin for HASHED_DIR_PLUGIN_ID
26500+ */
26501+void build_entry_key_hashed(const struct inode *dir, /* directory where entry is
26502+ * (or will be) in.*/
26503+ const struct qstr *qname, /* name of file referenced
26504+ * by this entry */
26505+ reiser4_key * result /* resulting key of directory
26506+ * entry */ )
26507+{
26508+ const char *name;
26509+ int len;
26510+
26511+ assert("nikita-1139", dir != NULL);
26512+ assert("nikita-1140", qname != NULL);
26513+ assert("nikita-1141", qname->name != NULL);
26514+ assert("nikita-1142", result != NULL);
26515+
26516+ name = qname->name;
26517+ len = qname->len;
26518+
26519+ assert("nikita-2867", strlen(name) == len);
26520+
26521+ reiser4_key_init(result);
26522+ /* locality of directory entry's key is objectid of parent
26523+ directory */
26524+ set_key_locality(result, get_inode_oid(dir));
26525+ /* minor packing locality is constant */
26526+ set_key_type(result, KEY_FILE_NAME_MINOR);
26527+	/* dot is a special case---we always want it to be the first entry
26528+	   in a directory. Actually, we just want it to have the smallest
26529+	   directory entry key.
26530+ */
26531+ if (len == 1 && name[0] == '.')
26532+ return;
26533+
26534+ /* initialize part of entry key which depends on file name */
26535+ complete_entry_key(dir, name, len, result);
26536+}
26537+
26538+/* Local variables:
26539+ c-indentation-style: "K&R"
26540+ mode-name: "LC"
26541+ c-basic-offset: 8
26542+ tab-width: 8
26543+ fill-column: 120
26544+ End:
26545+*/
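
Callers never build these keys by hand; they go through the plugin method
table, just as reiser4_find_entry() does in dir_plugin_common.c below. A
hypothetical usage sketch:

	/* Hypothetical usage: compute the key under which the entry "foo"
	 * of directory @dir lives (or would live). */
	static void entry_key_for_foo(struct inode *dir, reiser4_key *key)
	{
		struct qstr qname;

		qname.name = "foo";
		qname.len = 3;
		inode_dir_plugin(dir)->build_entry_key(dir, &qname, key);
		/* key now carries: locality == oid of @dir, minor type ==
		 * KEY_FILE_NAME_MINOR, and a name-derived remainder */
	}
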
26546diff -urN linux-2.6.24.orig/fs/reiser4/plugin/dir/Makefile linux-2.6.24/fs/reiser4/plugin/dir/Makefile
26547--- linux-2.6.24.orig/fs/reiser4/plugin/dir/Makefile 1970-01-01 03:00:00.000000000 +0300
26548+++ linux-2.6.24/fs/reiser4/plugin/dir/Makefile 2008-01-25 11:39:06.960213871 +0300
26549@@ -0,0 +1,5 @@
26550+obj-$(CONFIG_REISER4_FS) += dir_plugins.o
26551+
26552+dir_plugins-objs := \
26553+ hashed_dir.o \
26554+ seekable_dir.o
26555diff -urN linux-2.6.24.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-2.6.24/fs/reiser4/plugin/dir/seekable_dir.c
26556--- linux-2.6.24.orig/fs/reiser4/plugin/dir/seekable_dir.c 1970-01-01 03:00:00.000000000 +0300
26557+++ linux-2.6.24/fs/reiser4/plugin/dir/seekable_dir.c 2008-01-25 11:39:06.960213871 +0300
26558@@ -0,0 +1,46 @@
26559+/* Copyright 2005 by Hans Reiser, licensing governed by
26560+ * reiser4/README */
26561+
26562+#include "../../inode.h"
26563+
26564+/* this is implementation of build_entry_key method of dir
26565+ plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID
26566+ This is for directories where we want repeatable and restartable readdir()
26567+   even in the case of a 32-bit user-level struct dirent (readdir(3)).
26568+*/
26569+void
26570+build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
26571+ reiser4_key * result)
26572+{
26573+ oid_t objectid;
26574+
26575+ assert("nikita-2283", dir != NULL);
26576+ assert("nikita-2284", name != NULL);
26577+ assert("nikita-2285", name->name != NULL);
26578+ assert("nikita-2286", result != NULL);
26579+
26580+ reiser4_key_init(result);
26581+ /* locality of directory entry's key is objectid of parent
26582+ directory */
26583+ set_key_locality(result, get_inode_oid(dir));
26584+ /* minor packing locality is constant */
26585+ set_key_type(result, KEY_FILE_NAME_MINOR);
26586+	/* dot is a special case---we always want it to be the first entry
26587+	   in a directory. Actually, we just want it to have the smallest
26588+	   directory entry key.
26589+ */
26590+ if ((name->len == 1) && (name->name[0] == '.'))
26591+ return;
26592+
26593+ /* objectid of key is 31 lowest bits of hash. */
26594+ objectid =
26595+ inode_hash_plugin(dir)->hash(name->name,
26596+ (int)name->len) & 0x7fffffff;
26597+
26598+ assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
26599+ set_key_objectid(result, objectid);
26600+
26601+ /* offset is always 0. */
26602+ set_key_offset(result, (__u64) 0);
26603+ return;
26604+}
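
Truncating the hash to its low 31 bits keeps the derived objectid
non-negative even when it is echoed back as a 32-bit directory offset,
which is what makes readdir() restartable for 32-bit dirent users. A
worked illustration (the input value is made up):

	static oid_t objectid_from_hash(__u64 hash)
	{
		/* e.g. hash == 0xdeadbeefcafebabe -> 0x4afebabe */
		return hash & 0x7fffffff;
	}
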
26605diff -urN linux-2.6.24.orig/fs/reiser4/plugin/dir_plugin_common.c linux-2.6.24/fs/reiser4/plugin/dir_plugin_common.c
26606--- linux-2.6.24.orig/fs/reiser4/plugin/dir_plugin_common.c 1970-01-01 03:00:00.000000000 +0300
26607+++ linux-2.6.24/fs/reiser4/plugin/dir_plugin_common.c 2008-01-25 11:39:06.964214902 +0300
26608@@ -0,0 +1,872 @@
26609+/* Copyright 2005 by Hans Reiser, licensing governed by
26610+ reiser4/README */
26611+
26612+/* this file contains typical implementations for most methods of
26613+   the directory plugin
26614+*/
26615+
26616+#include "../inode.h"
26617+
26618+int reiser4_find_entry(struct inode *dir, struct dentry *name,
26619+ lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *);
26620+int reiser4_lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key);
26621+void check_light_weight(struct inode *inode, struct inode *parent);
26622+
26623+/* this is the common implementation of the get_parent method of the dir
26624+   plugin. It is used by the NFS kernel server to "climb" up the directory
26625+   tree to check permissions
26626+ */
26627+struct dentry *get_parent_common(struct inode *child)
26628+{
26629+ struct super_block *s;
26630+ struct inode *parent;
26631+ struct dentry dotdot;
26632+ struct dentry *dentry;
26633+ reiser4_key key;
26634+ int result;
26635+
26636+ /*
26637+ * lookup dotdot entry.
26638+ */
26639+
26640+ s = child->i_sb;
26641+ memset(&dotdot, 0, sizeof(dotdot));
26642+ dotdot.d_name.name = "..";
26643+ dotdot.d_name.len = 2;
26644+ dotdot.d_op = &get_super_private(s)->ops.dentry;
26645+
26646+ result = reiser4_lookup_name(child, &dotdot, &key);
26647+ if (result != 0)
26648+ return ERR_PTR(result);
26649+
26650+ parent = reiser4_iget(s, &key, 1);
26651+ if (!IS_ERR(parent)) {
26652+ /*
26653+ * FIXME-NIKITA dubious: attributes are inherited from @child
26654+ * to @parent. But:
26655+ *
26656+		 * (*) this is the only thing we can do
26657+ *
26658+ * (*) attributes of light-weight object are inherited
26659+ * from a parent through which object was looked up first,
26660+ * so it is ambiguous anyway.
26661+ *
26662+ */
26663+ check_light_weight(parent, child);
26664+ reiser4_iget_complete(parent);
26665+ dentry = d_alloc_anon(parent);
26666+ if (dentry == NULL) {
26667+ iput(parent);
26668+ dentry = ERR_PTR(RETERR(-ENOMEM));
26669+ } else
26670+ dentry->d_op = &get_super_private(s)->ops.dentry;
26671+ } else if (PTR_ERR(parent) == -ENOENT)
26672+ dentry = ERR_PTR(RETERR(-ESTALE));
26673+ else
26674+ dentry = (void *)parent;
26675+ return dentry;
26676+}
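
For context, a sketch of how an export_operations hook with the 2.6.24
signature could reach this method. The wrapper name is hypothetical, and
the dispatch assumes the dir plugin's get_parent slot points at
get_parent_common():

	/* Hypothetical glue between the VFS export interface and the
	 * dir plugin method implemented above. */
	static struct dentry *reiser4_export_get_parent(struct dentry *child)
	{
		struct inode *inode = child->d_inode;

		return inode_dir_plugin(inode)->get_parent(inode);
	}
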
26677+
26678+/* this is common implementation of is_name_acceptable method of dir
26679+ plugin
26680+ */
26681+int is_name_acceptable_common(const struct inode *inode, /* directory to check */
26682+ const char *name UNUSED_ARG, /* name to check */
26683+ int len /* @name's length */ )
26684+{
26685+ assert("nikita-733", inode != NULL);
26686+ assert("nikita-734", name != NULL);
26687+ assert("nikita-735", len > 0);
26688+
26689+ return len <= reiser4_max_filename_len(inode);
26690+}
26691+
26692+/* there is no common implementation of build_entry_key method of dir
26693+ plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
26694+   plugin/dir/seekable_dir.c:build_entry_key_seekable() for examples
26695+*/
26696+
26697+/* this is common implementation of build_readdir_key method of dir
26698+ plugin
26699+ see reiser4_readdir_common for more details
26700+*/
26701+int build_readdir_key_common(struct file *dir /* directory being read */ ,
26702+ reiser4_key * result /* where to store key */ )
26703+{
26704+ reiser4_file_fsdata *fdata;
26705+ struct inode *inode;
26706+
26707+ assert("nikita-1361", dir != NULL);
26708+ assert("nikita-1362", result != NULL);
26709+ assert("nikita-1363", dir->f_dentry != NULL);
26710+ inode = dir->f_dentry->d_inode;
26711+ assert("nikita-1373", inode != NULL);
26712+
26713+ fdata = reiser4_get_file_fsdata(dir);
26714+ if (IS_ERR(fdata))
26715+ return PTR_ERR(fdata);
26716+ assert("nikita-1364", fdata != NULL);
26717+ return extract_key_from_de_id(get_inode_oid(inode),
26718+ &fdata->dir.readdir.position.
26719+ dir_entry_key, result);
26720+
26721+}
26722+
26723+void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
26724+ int adj);
26725+
26726+/* this is common implementation of add_entry method of dir plugin
26727+*/
26728+int reiser4_add_entry_common(struct inode *object, /* directory to add new name
26729+ * in */
26730+ struct dentry *where, /* new name */
26731+ reiser4_object_create_data * data, /* parameters of
26732+ * new object */
26733+ reiser4_dir_entry_desc * entry /* parameters of
26734+ * new directory
26735+ * entry */)
26736+{
26737+ int result;
26738+ coord_t *coord;
26739+ lock_handle lh;
26740+ struct reiser4_dentry_fsdata *fsdata;
26741+ reiser4_block_nr reserve;
26742+
26743+ assert("nikita-1114", object != NULL);
26744+ assert("nikita-1250", where != NULL);
26745+
26746+ fsdata = reiser4_get_dentry_fsdata(where);
26747+ if (unlikely(IS_ERR(fsdata)))
26748+ return PTR_ERR(fsdata);
26749+
26750+ reserve = inode_dir_plugin(object)->estimate.add_entry(object);
26751+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
26752+ return RETERR(-ENOSPC);
26753+
26754+ init_lh(&lh);
26755+ coord = &fsdata->dec.entry_coord;
26756+ coord_clear_iplug(coord);
26757+
26758+ /* check for this entry in a directory. This is plugin method. */
26759+ result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
26760+ entry);
26761+ if (likely(result == -ENOENT)) {
26762+ /* add new entry. Just pass control to the directory
26763+ item plugin. */
26764+ assert("nikita-1709", inode_dir_item_plugin(object));
26765+ assert("nikita-2230", coord->node == lh.node);
26766+ reiser4_seal_done(&fsdata->dec.entry_seal);
26767+ result =
26768+ inode_dir_item_plugin(object)->s.dir.add_entry(object,
26769+ coord, &lh,
26770+ where,
26771+ entry);
26772+ if (result == 0) {
26773+ reiser4_adjust_dir_file(object, where,
26774+ fsdata->dec.pos + 1, +1);
26775+ INODE_INC_FIELD(object, i_size);
26776+ }
26777+ } else if (result == 0) {
26778+ assert("nikita-2232", coord->node == lh.node);
26779+ result = RETERR(-EEXIST);
26780+ }
26781+ done_lh(&lh);
26782+
26783+ return result;
26784+}
26785+
26786+/**
26787+ * rem_entry - remove entry from directory item
26788+ * @dir: directory the entry is being removed from
26789+ * @dentry: name that is being removed
26790+ * @entry: description of the entry being removed
26791+ * @coord: coordinate of the entry in the tree
26792+ * @lh: lock handle for @coord's node
26793+ *
26794+ * Checks that coordinate @coord is set properly and calls item plugin
26795+ * method to cut entry.
26796+ */
26797+static int
26798+rem_entry(struct inode *dir, struct dentry *dentry,
26799+ reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh)
26800+{
26801+ item_plugin *iplug;
26802+ struct inode *child;
26803+
26804+ iplug = inode_dir_item_plugin(dir);
26805+ child = dentry->d_inode;
26806+ assert("nikita-3399", child != NULL);
26807+
26808+ /* check that we are really destroying an entry for @child */
26809+ if (REISER4_DEBUG) {
26810+ int result;
26811+ reiser4_key key;
26812+
26813+ result = iplug->s.dir.extract_key(coord, &key);
26814+ if (result != 0)
26815+ return result;
26816+ if (get_key_objectid(&key) != get_inode_oid(child)) {
26817+ warning("nikita-3397",
26818+ "rem_entry: %#llx != %#llx\n",
26819+ get_key_objectid(&key),
26820+ (unsigned long long)get_inode_oid(child));
26821+ return RETERR(-EIO);
26822+ }
26823+ }
26824+ return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
26825+}
26826+
26827+/**
26828+ * reiser4_rem_entry_common - remove entry from a directory
26829+ * @dir: directory to remove entry from
26830+ * @dentry: name that is being removed
26831+ * @entry: description of entry being removed
26832+ *
26833+ * This is common implementation of rem_entry method of dir plugin.
26834+ */
26835+int reiser4_rem_entry_common(struct inode *dir,
26836+ struct dentry *dentry,
26837+ reiser4_dir_entry_desc *entry)
26838+{
26839+ int result;
26840+ coord_t *coord;
26841+ lock_handle lh;
26842+ struct reiser4_dentry_fsdata *fsdata;
26843+ __u64 tograb;
26844+
26845+ assert("nikita-1124", dir != NULL);
26846+ assert("nikita-1125", dentry != NULL);
26847+
26848+ tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
26849+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
26850+ if (result != 0)
26851+ return RETERR(-ENOSPC);
26852+
26853+ init_lh(&lh);
26854+
26855+ /* check for this entry in a directory. This is plugin method. */
26856+ result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
26857+ fsdata = reiser4_get_dentry_fsdata(dentry);
26858+ if (IS_ERR(fsdata)) {
26859+ done_lh(&lh);
26860+ return PTR_ERR(fsdata);
26861+ }
26862+
26863+ coord = &fsdata->dec.entry_coord;
26864+
26865+ assert("nikita-3404",
26866+ get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
26867+ dir->i_size <= 1);
26868+
26869+ coord_clear_iplug(coord);
26870+ if (result == 0) {
26871+ /* remove entry. Just pass control to the directory item
26872+ plugin. */
26873+ assert("vs-542", inode_dir_item_plugin(dir));
26874+ reiser4_seal_done(&fsdata->dec.entry_seal);
26875+ reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
26876+ result =
26877+ WITH_COORD(coord,
26878+ rem_entry(dir, dentry, entry, coord, &lh));
26879+ if (result == 0) {
26880+ if (dir->i_size >= 1)
26881+ INODE_DEC_FIELD(dir, i_size);
26882+ else {
26883+ warning("nikita-2509", "Dir %llu is runt",
26884+ (unsigned long long)
26885+ get_inode_oid(dir));
26886+ result = RETERR(-EIO);
26887+ }
26888+
26889+ assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
26890+ dentry->d_inode->i_size != 2 ||
26891+ inode_dir_plugin(dentry->d_inode) == NULL);
26892+ }
26893+ }
26894+ done_lh(&lh);
26895+
26896+ return result;
26897+}
26898+
26899+static reiser4_block_nr estimate_init(struct inode *parent,
26900+ struct inode *object);
26901+static int create_dot_dotdot(struct inode *object, struct inode *parent);
26902+
26903+/* this is common implementation of init method of dir plugin
26904+ create "." and ".." entries
26905+*/
26906+int reiser4_dir_init_common(struct inode *object, /* new directory */
26907+ struct inode *parent, /* parent directory */
26908+ reiser4_object_create_data * data /* info passed
26909+ * to us, this
26910+ * is filled by
26911+ * reiser4()
26912+ * syscall in
26913+ * particular */)
26914+{
26915+ reiser4_block_nr reserve;
26916+
26917+ assert("nikita-680", object != NULL);
26918+ assert("nikita-681", S_ISDIR(object->i_mode));
26919+ assert("nikita-682", parent != NULL);
26920+ assert("nikita-684", data != NULL);
26921+ assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
26922+ assert("nikita-687", object->i_mode & S_IFDIR);
26923+
26924+ reserve = estimate_init(parent, object);
26925+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
26926+ return RETERR(-ENOSPC);
26927+
26928+ return create_dot_dotdot(object, parent);
26929+}
26930+
26931+/* this is common implementation of done method of dir plugin
26932+ remove "." entry
26933+*/
26934+int reiser4_dir_done_common(struct inode *object /* object being deleted */ )
26935+{
26936+ int result;
26937+ reiser4_block_nr reserve;
26938+ struct dentry goodby_dots;
26939+ reiser4_dir_entry_desc entry;
26940+
26941+ assert("nikita-1449", object != NULL);
26942+
26943+ if (reiser4_inode_get_flag(object, REISER4_NO_SD))
26944+ return 0;
26945+
26946+ /* of course, this can be rewritten to sweep everything in one
26947+ reiser4_cut_tree(). */
26948+ memset(&entry, 0, sizeof entry);
26949+
26950+ /* FIXME: this done method is called from reiser4_delete_dir_common which
26951+ * reserved space already */
26952+ reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
26953+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
26954+ return RETERR(-ENOSPC);
26955+
26956+ memset(&goodby_dots, 0, sizeof goodby_dots);
26957+ entry.obj = goodby_dots.d_inode = object;
26958+ goodby_dots.d_name.name = ".";
26959+ goodby_dots.d_name.len = 1;
26960+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
26961+ reiser4_free_dentry_fsdata(&goodby_dots);
26962+ if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
26963+ /* only worth a warning
26964+
26965+ "values of \ eB\ f will give rise to dom!\n"
26966+ -- v6src/s2/mv.c:89
26967+ */
26968+ warning("nikita-2252", "Cannot remove dot of %lli: %i",
26969+ (unsigned long long)get_inode_oid(object), result);
26970+ return 0;
26971+}
26972+
26973+/* this is common implementation of attach method of dir plugin
26974+*/
26975+int reiser4_attach_common(struct inode *child UNUSED_ARG,
26976+ struct inode *parent UNUSED_ARG)
26977+{
26978+ assert("nikita-2647", child != NULL);
26979+ assert("nikita-2648", parent != NULL);
26980+
26981+ return 0;
26982+}
26983+
26984+/* this is common implementation of detach method of dir plugin
26985+ remove "..", decrease nlink on parent
26986+*/
26987+int reiser4_detach_common(struct inode *object, struct inode *parent)
26988+{
26989+ int result;
26990+ struct dentry goodby_dots;
26991+ reiser4_dir_entry_desc entry;
26992+
26993+ assert("nikita-2885", object != NULL);
26994+ assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD));
26995+
26996+ memset(&entry, 0, sizeof entry);
26997+
26998+ /* NOTE-NIKITA this only works if @parent is -the- parent of
26999+	   @object, i.e., the object whose key is stored in the dotdot
27000+	   entry. This wouldn't work with hard links on directories. */
27001+ memset(&goodby_dots, 0, sizeof goodby_dots);
27002+ entry.obj = goodby_dots.d_inode = parent;
27003+ goodby_dots.d_name.name = "..";
27004+ goodby_dots.d_name.len = 2;
27005+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
27006+ reiser4_free_dentry_fsdata(&goodby_dots);
27007+ if (result == 0) {
27008+ /* the dot should be the only entry remaining at this time... */
27009+ assert("nikita-3400",
27010+ object->i_size == 1 && object->i_nlink <= 2);
27011+#if 0
27012+		/* and, together with the only name a directory can have, they
27013+		 * provide the last 2 remaining references. If we get
27014+ * here as part of error handling during mkdir, @object
27015+ * possibly has no name yet, so its nlink == 1. If we get here
27016+ * from rename (targeting empty directory), it has no name
27017+ * already, so its nlink == 1. */
27018+ assert("nikita-3401",
27019+ object->i_nlink == 2 || object->i_nlink == 1);
27020+#endif
27021+
27022+ /* decrement nlink of directory removed ".." pointed
27023+ to */
27024+ reiser4_del_nlink(parent, NULL, 0);
27025+ }
27026+ return result;
27027+}
27028+
27029+/* this is common implementation of estimate.add_entry method of
27030+ dir plugin
27031+   an estimation for adding an entry which assumes the entry is inserted
27032+   as a unit into an existing item
27033+*/
27034+reiser4_block_nr estimate_add_entry_common(const struct inode * inode)
27035+{
27036+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
27037+}
27038+
27039+/* this is common implementation of estimate.rem_entry method of dir
27040+ plugin
27041+*/
27042+reiser4_block_nr estimate_rem_entry_common(const struct inode * inode)
27043+{
27044+ return estimate_one_item_removal(reiser4_tree_by_inode(inode));
27045+}
27046+
27047+/* this is common implementation of estimate.unlink method of dir
27048+ plugin
27049+*/
27050+reiser4_block_nr
27051+dir_estimate_unlink_common(const struct inode * parent,
27052+ const struct inode * object)
27053+{
27054+ reiser4_block_nr res;
27055+
27056+ /* hashed_rem_entry(object) */
27057+ res = inode_dir_plugin(object)->estimate.rem_entry(object);
27058+ /* del_nlink(parent) */
27059+ res += 2 * inode_file_plugin(parent)->estimate.update(parent);
27060+
27061+ return res;
27062+}
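
Callers turn this estimate into a reservation before touching the tree, in
the same way reiser4_rem_entry_common() does above. Schematically (a
hypothetical fragment):

	__u64 tograb = dir_estimate_unlink_common(parent, object);

	if (reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED))
		return RETERR(-ENOSPC);
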
27063+
27064+/*
27065+ * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
27066+ * methods: if @inode is a light-weight file, setup its credentials
27067+ * that are not stored in the stat-data in this case
27068+ */
27069+void check_light_weight(struct inode *inode, struct inode *parent)
27070+{
27071+ if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
27072+ inode->i_uid = parent->i_uid;
27073+ inode->i_gid = parent->i_gid;
27074+		/* clear light-weight flag. If the inode is later read via
27075+		   any other name, [ug]id won't change. */
27076+ reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
27077+ }
27078+}
27079+
27080+/* looks for the name specified in @dentry in directory @parent; if the name
27081+   is found, the key of the object the entry points to is stored in @key */
27082+int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup for
27083+ * name in */
27084+ struct dentry *dentry, /* name to look for */
27085+ reiser4_key * key /* place to store key */ )
27086+{
27087+ int result;
27088+ coord_t *coord;
27089+ lock_handle lh;
27090+ const char *name;
27091+ int len;
27092+ reiser4_dir_entry_desc entry;
27093+ struct reiser4_dentry_fsdata *fsdata;
27094+
27095+ assert("nikita-1247", parent != NULL);
27096+ assert("nikita-1248", dentry != NULL);
27097+ assert("nikita-1123", dentry->d_name.name != NULL);
27098+ assert("vs-1486",
27099+ dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
27100+
27101+ name = dentry->d_name.name;
27102+ len = dentry->d_name.len;
27103+
27104+ if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
27105+ /* some arbitrary error code to return */
27106+ return RETERR(-ENAMETOOLONG);
27107+
27108+ fsdata = reiser4_get_dentry_fsdata(dentry);
27109+ if (IS_ERR(fsdata))
27110+ return PTR_ERR(fsdata);
27111+
27112+ coord = &fsdata->dec.entry_coord;
27113+ coord_clear_iplug(coord);
27114+ init_lh(&lh);
27115+
27116+ /* find entry in a directory. This is plugin method. */
27117+ result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK,
27118+ &entry);
27119+ if (result == 0) {
27120+ /* entry was found, extract object key from it. */
27121+ result =
27122+ WITH_COORD(coord,
27123+ item_plugin_by_coord(coord)->s.dir.
27124+ extract_key(coord, key));
27125+ }
27126+ done_lh(&lh);
27127+ return result;
27128+
27129+}
27130+
27131+/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
27132+static reiser4_block_nr
27133+estimate_init(struct inode *parent, struct inode *object)
27134+{
27135+ reiser4_block_nr res = 0;
27136+
27137+ assert("vpf-321", parent != NULL);
27138+ assert("vpf-322", object != NULL);
27139+
27140+ /* hashed_add_entry(object) */
27141+ res += inode_dir_plugin(object)->estimate.add_entry(object);
27142+ /* reiser4_add_nlink(object) */
27143+ res += inode_file_plugin(object)->estimate.update(object);
27144+ /* hashed_add_entry(object) */
27145+ res += inode_dir_plugin(object)->estimate.add_entry(object);
27146+ /* reiser4_add_nlink(parent) */
27147+ res += inode_file_plugin(parent)->estimate.update(parent);
27148+
27149+	return res;
27150+}
27151+
27152+/* helper function for reiser4_dir_init_common(). Create "." and ".." */
27153+static int create_dot_dotdot(struct inode *object /* object to create dot and
27154+ * dotdot for */ ,
27155+ struct inode *parent /* parent of @object */)
27156+{
27157+ int result;
27158+ struct dentry dots_entry;
27159+ reiser4_dir_entry_desc entry;
27160+
27161+ assert("nikita-688", object != NULL);
27162+ assert("nikita-689", S_ISDIR(object->i_mode));
27163+ assert("nikita-691", parent != NULL);
27164+
27165+ /* We store dot and dotdot as normal directory entries. This is
27166+ not necessary, because almost all information stored in them
27167+	   is already in the stat-data of the directory; the only thing
27168+	   missing is the objectid of the grand-parent directory, which could
27169+	   easily be added there as an extension.
27170+
27171+	   But it is done the way it is done, because not storing dot
27172+	   and dotdot would lead to the following complications:
27173+
27174+ . special case handling in ->lookup().
27175+ . addition of another extension to the sd.
27176+ . dependency on key allocation policy for stat data.
27177+
27178+ */
27179+
27180+ memset(&entry, 0, sizeof entry);
27181+ memset(&dots_entry, 0, sizeof dots_entry);
27182+ entry.obj = dots_entry.d_inode = object;
27183+ dots_entry.d_name.name = ".";
27184+ dots_entry.d_name.len = 1;
27185+ result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
27186+ reiser4_free_dentry_fsdata(&dots_entry);
27187+
27188+ if (result == 0) {
27189+ result = reiser4_add_nlink(object, object, 0);
27190+ if (result == 0) {
27191+ entry.obj = dots_entry.d_inode = parent;
27192+ dots_entry.d_name.name = "..";
27193+ dots_entry.d_name.len = 2;
27194+ result = reiser4_add_entry_common(object,
27195+ &dots_entry, NULL, &entry);
27196+ reiser4_free_dentry_fsdata(&dots_entry);
27197+ /* if creation of ".." failed, iput() will delete
27198+ object with ".". */
27199+ if (result == 0) {
27200+ result = reiser4_add_nlink(parent, object, 0);
27201+ if (result != 0)
27202+ /*
27203+ * if we failed to bump i_nlink, try
27204+ * to remove ".."
27205+ */
27206+ reiser4_detach_common(object, parent);
27207+ }
27208+ }
27209+ }
27210+
27211+ if (result != 0) {
27212+ /*
27213+		 * in the case of error, at least update the stat-data so that
27214+		 * ->i_nlink updates are not left lingering.
27215+ */
27216+ reiser4_update_sd(object);
27217+ reiser4_update_sd(parent);
27218+ }
27219+
27220+ return result;
27221+}
27222+
27223+/*
27224+ * return 0 iff @coord contains a directory entry for the file with the name
27225+ * @name.
27226+ */
27227+static int
27228+check_item(const struct inode *dir, const coord_t * coord, const char *name)
27229+{
27230+ item_plugin *iplug;
27231+ char buf[DE_NAME_BUF_LEN];
27232+
27233+ iplug = item_plugin_by_coord(coord);
27234+ if (iplug == NULL) {
27235+ warning("nikita-1135", "Cannot get item plugin");
27236+ print_coord("coord", coord, 1);
27237+ return RETERR(-EIO);
27238+ } else if (item_id_by_coord(coord) !=
27239+ item_id_by_plugin(inode_dir_item_plugin(dir))) {
27240+		/* item id of the current item does not match the id of items a
27241+ directory is built of */
27242+ warning("nikita-1136", "Wrong item plugin");
27243+ print_coord("coord", coord, 1);
27244+ return RETERR(-EIO);
27245+ }
27246+ assert("nikita-1137", iplug->s.dir.extract_name);
27247+
27248+ /* Compare name stored in this entry with name we are looking for.
27249+
27250+ NOTE-NIKITA Here should go code for support of something like
27251+ unicode, code tables, etc.
27252+ */
27253+ return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
27254+}
27255+
27256+static int
27257+check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name)
27258+{
27259+ return WITH_COORD(coord, check_item(dir, coord, name->name));
27260+}
27261+
27262+/*
27263+ * argument package used by entry_actor to scan entries with identical keys.
27264+ */
27265+struct entry_actor_args {
27266+ /* name we are looking for */
27267+ const char *name;
27268+ /* key of directory entry. entry_actor() scans through sequence of
27269+ * items/units having the same key */
27270+ reiser4_key *key;
27271+	/* how many entries with a duplicate key have been scanned so far. */
27272+ int non_uniq;
27273+#if REISER4_USE_COLLISION_LIMIT
27274+ /* scan limit */
27275+ int max_non_uniq;
27276+#endif
27277+ /* return parameter: set to true, if ->name wasn't found */
27278+ int not_found;
27279+ /* what type of lock to take when moving to the next node during
27280+ * scan */
27281+ znode_lock_mode mode;
27282+
27283+ /* last coord that was visited during scan */
27284+ coord_t last_coord;
27285+ /* last node locked during scan */
27286+ lock_handle last_lh;
27287+ /* inode of directory */
27288+ const struct inode *inode;
27289+};
27290+
27291+/* Function called by reiser4_find_entry() to look for given name
27292+ in the directory. */
27293+static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
27294+ coord_t * coord /* current coord */ ,
27295+ lock_handle * lh /* current lock handle */ ,
27296+ void *entry_actor_arg /* argument to scan */ )
27297+{
27298+ reiser4_key unit_key;
27299+ struct entry_actor_args *args;
27300+
27301+ assert("nikita-1131", tree != NULL);
27302+ assert("nikita-1132", coord != NULL);
27303+ assert("nikita-1133", entry_actor_arg != NULL);
27304+
27305+ args = entry_actor_arg;
27306+ ++args->non_uniq;
27307+#if REISER4_USE_COLLISION_LIMIT
27308+ if (args->non_uniq > args->max_non_uniq) {
27309+ args->not_found = 1;
27310+ /* hash collision overflow. */
27311+ return RETERR(-EBUSY);
27312+ }
27313+#endif
27314+
27315+ /*
27316+ * did we just reach the end of the sequence of items/units with
27317+ * identical keys?
27318+ */
27319+ if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
27320+ assert("nikita-1791",
27321+ keylt(args->key, unit_key_by_coord(coord, &unit_key)));
27322+ args->not_found = 1;
27323+ args->last_coord.between = AFTER_UNIT;
27324+ return 0;
27325+ }
27326+
27327+ coord_dup(&args->last_coord, coord);
27328+ /*
27329+	 * did the scan just move to the next node?
27330+ */
27331+ if (args->last_lh.node != lh->node) {
27332+ int lock_result;
27333+
27334+ /*
27335+ * if so, lock new node with the mode requested by the caller
27336+ */
27337+ done_lh(&args->last_lh);
27338+ assert("nikita-1896", znode_is_any_locked(lh->node));
27339+ lock_result = longterm_lock_znode(&args->last_lh, lh->node,
27340+ args->mode, ZNODE_LOCK_HIPRI);
27341+ if (lock_result != 0)
27342+ return lock_result;
27343+ }
27344+ return check_item(args->inode, coord, args->name);
27345+}
27346+
27347+/* Look for given @name within directory @dir.
27348+
27349+ This is called during lookup, creation and removal of directory
27350+ entries and on reiser4_rename_common
27351+
27352+ First calculate key that directory entry for @name would have. Search
27353+ for this key in the tree. If such key is found, scan all items with
27354+ the same key, checking name in each directory entry along the way.
27355+*/
27356+int reiser4_find_entry(struct inode *dir, /* directory to scan */
27357+ struct dentry *de, /* name to search for */
27358+ lock_handle * lh, /* resulting lock handle */
27359+ znode_lock_mode mode, /* required lock mode */
27360+ reiser4_dir_entry_desc * entry /* parameters of found
27361+ directory entry */)
27362+{
27363+ const struct qstr *name;
27364+ seal_t *seal;
27365+ coord_t *coord;
27366+ int result;
27367+ __u32 flags;
27368+ struct de_location *dec;
27369+ struct reiser4_dentry_fsdata *fsdata;
27370+
27371+ assert("nikita-1130", lh != NULL);
27372+ assert("nikita-1128", dir != NULL);
27373+
27374+ name = &de->d_name;
27375+ assert("nikita-1129", name != NULL);
27376+
27377+	/* dentry private data doesn't require a lock, because dentry
27378+	   manipulations are protected by i_mutex on the parent.
27379+
27380+	   This is not so for inodes, because there is no single parent in
27381+	   the inode case.
27382+ */
27383+ fsdata = reiser4_get_dentry_fsdata(de);
27384+ if (IS_ERR(fsdata))
27385+ return PTR_ERR(fsdata);
27386+ dec = &fsdata->dec;
27387+
27388+ coord = &dec->entry_coord;
27389+ coord_clear_iplug(coord);
27390+ seal = &dec->entry_seal;
27391+ /* compose key of directory entry for @name */
27392+ inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
27393+
27394+ if (reiser4_seal_is_set(seal)) {
27395+ /* check seal */
27396+ result = reiser4_seal_validate(seal, coord, &entry->key,
27397+ lh, mode, ZNODE_LOCK_LOPRI);
27398+ if (result == 0) {
27399+ /* key was found. Check that it is really item we are
27400+ looking for. */
27401+ result = check_entry(dir, coord, name);
27402+ if (result == 0)
27403+ return 0;
27404+ }
27405+ }
27406+ flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
27407+ /*
27408+ * find place in the tree where directory item should be located.
27409+ */
27410+ result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode,
27411+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
27412+ flags, NULL /*ra_info */ );
27413+ if (result == CBK_COORD_FOUND) {
27414+ struct entry_actor_args arg;
27415+
27416+ /* fast path: no hash collisions */
27417+ result = check_entry(dir, coord, name);
27418+ if (result == 0) {
27419+ reiser4_seal_init(seal, coord, &entry->key);
27420+ dec->pos = 0;
27421+ } else if (result > 0) {
27422+ /* Iterate through all units with the same keys. */
27423+ arg.name = name->name;
27424+ arg.key = &entry->key;
27425+ arg.not_found = 0;
27426+ arg.non_uniq = 0;
27427+#if REISER4_USE_COLLISION_LIMIT
27428+ arg.max_non_uniq = max_hash_collisions(dir);
27429+ assert("nikita-2851", arg.max_non_uniq > 1);
27430+#endif
27431+ arg.mode = mode;
27432+ arg.inode = dir;
27433+ coord_init_zero(&arg.last_coord);
27434+ init_lh(&arg.last_lh);
27435+
27436+ result = reiser4_iterate_tree
27437+ (reiser4_tree_by_inode(dir),
27438+ coord, lh,
27439+ entry_actor, &arg, mode, 1);
27440+			/* the end of the tree or of the extent was reached
27441+			   during scanning */
27442+ if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
27443+ /* step back */
27444+ done_lh(lh);
27445+
27446+ result = zload(arg.last_coord.node);
27447+ if (result == 0) {
27448+ coord_clear_iplug(&arg.last_coord);
27449+ coord_dup(coord, &arg.last_coord);
27450+ move_lh(lh, &arg.last_lh);
27451+ result = RETERR(-ENOENT);
27452+ zrelse(arg.last_coord.node);
27453+ --arg.non_uniq;
27454+ }
27455+ }
27456+
27457+ done_lh(&arg.last_lh);
27458+ if (result == 0)
27459+ reiser4_seal_init(seal, coord, &entry->key);
27460+
27461+ if (result == 0 || result == -ENOENT) {
27462+ assert("nikita-2580", arg.non_uniq > 0);
27463+ dec->pos = arg.non_uniq - 1;
27464+ }
27465+ }
27466+ } else
27467+ dec->pos = -1;
27468+ return result;
27469+}
27470+
27471+/*
27472+ Local variables:
27473+ c-indentation-style: "K&R"
27474+ mode-name: "LC"
27475+ c-basic-offset: 8
27476+ tab-width: 8
27477+ fill-column: 120
27478+ scroll-step: 1
27479+ End:
27480+*/
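
To summarize the calling convention of reiser4_find_entry(): 0 means the
entry was found and @coord/@lh point at it, -ENOENT means the scan ended
without a match, and other negative values are errors. A minimal
hypothetical caller:

	/* Hypothetical caller: does @dentry name an existing entry of @dir? */
	static int entry_exists(struct inode *dir, struct dentry *dentry)
	{
		lock_handle lh;
		reiser4_dir_entry_desc entry;
		int result;

		memset(&entry, 0, sizeof entry);
		init_lh(&lh);
		result = reiser4_find_entry(dir, dentry, &lh,
					    ZNODE_READ_LOCK, &entry);
		done_lh(&lh);
		return result;
	}
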
27481diff -urN linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format40.c
27482--- linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format40.c 1970-01-01 03:00:00.000000000 +0300
27483+++ linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format40.c 2008-01-25 11:39:06.964214902 +0300
27484@@ -0,0 +1,655 @@
27485+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
27486+
27487+#include "../../debug.h"
27488+#include "../../dformat.h"
27489+#include "../../key.h"
27490+#include "../node/node.h"
27491+#include "../space/space_allocator.h"
27492+#include "disk_format40.h"
27493+#include "../plugin.h"
27494+#include "../../txnmgr.h"
27495+#include "../../jnode.h"
27496+#include "../../tree.h"
27497+#include "../../super.h"
27498+#include "../../wander.h"
27499+#include "../../inode.h"
27500+#include "../../ktxnmgrd.h"
27501+#include "../../status_flags.h"
27502+
27503+#include <linux/types.h> /* for __u?? */
27504+#include <linux/fs.h> /* for struct super_block */
27505+#include <linux/buffer_head.h>
27506+
27507+/* reiser 4.0 default disk layout */
27508+
27509+/* Amount of free blocks needed to perform release_format40 when fs gets
27510+ mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
27511+ & tx record. */
27512+#define RELEASE_RESERVED 4
27513+
27514+/* The greatest supported format40 version number */
27515+#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION
27516+
27517+/* This flag indicates that backup should be updated
27518+ (the update is performed by fsck) */
27519+#define FORMAT40_UPDATE_BACKUP (1 << 31)
27520+
27521+/* functions to access fields of format40_disk_super_block */
27522+static __u64 get_format40_block_count(const format40_disk_super_block * sb)
27523+{
27524+ return le64_to_cpu(get_unaligned(&sb->block_count));
27525+}
27526+
27527+static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
27528+{
27529+ return le64_to_cpu(get_unaligned(&sb->free_blocks));
27530+}
27531+
27532+static __u64 get_format40_root_block(const format40_disk_super_block * sb)
27533+{
27534+ return le64_to_cpu(get_unaligned(&sb->root_block));
27535+}
27536+
27537+static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
27538+{
27539+ return le16_to_cpu(get_unaligned(&sb->tree_height));
27540+}
27541+
27542+static __u64 get_format40_file_count(const format40_disk_super_block * sb)
27543+{
27544+ return le64_to_cpu(get_unaligned(&sb->file_count));
27545+}
27546+
27547+static __u64 get_format40_oid(const format40_disk_super_block * sb)
27548+{
27549+ return le64_to_cpu(get_unaligned(&sb->oid));
27550+}
27551+
27552+static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
27553+{
27554+ return le32_to_cpu(get_unaligned(&sb->mkfs_id));
27555+}
27556+
27557+static __u64 get_format40_flags(const format40_disk_super_block * sb)
27558+{
27559+ return le64_to_cpu(get_unaligned(&sb->flags));
27560+}
27561+
27562+static __u32 get_format40_version(const format40_disk_super_block * sb)
27563+{
27564+ return le32_to_cpu(get_unaligned(&sb->version)) &
27565+ ~FORMAT40_UPDATE_BACKUP;
27566+}
27567+
27568+static int update_backup_version(const format40_disk_super_block * sb)
27569+{
27570+ return (le32_to_cpu(get_unaligned(&sb->version)) &
27571+ FORMAT40_UPDATE_BACKUP);
27572+}
27573+
27574+static int update_disk_version(const format40_disk_super_block * sb)
27575+{
27576+ return (get_format40_version(sb) < FORMAT40_VERSION);
27577+}
27578+
27579+static int incomplete_compatibility(const format40_disk_super_block * sb)
27580+{
27581+ return (get_format40_version(sb) > FORMAT40_VERSION);
27582+}
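
The version word thus multiplexes the release number with the backup flag
in bit 31. Spelled out (an illustrative fragment):

	__u32 raw = 5 | FORMAT40_UPDATE_BACKUP;		/* say, version 5, flag set */
	__u32 version = raw & ~FORMAT40_UPDATE_BACKUP;	/* -> 5 */
	int backup_stale = (raw & FORMAT40_UPDATE_BACKUP) != 0;	/* -> 1 */
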
27583+
27584+static format40_super_info *get_sb_info(struct super_block *super)
27585+{
27586+ return &get_super_private(super)->u.format40;
27587+}
27588+
27589+static int consult_diskmap(struct super_block *s)
27590+{
27591+ format40_super_info *info;
27592+ journal_location *jloc;
27593+
27594+ info = get_sb_info(s);
27595+ jloc = &get_super_private(s)->jloc;
27596+ /* Default format-specific locations, if there is nothing in
27597+ * diskmap */
27598+ jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
27599+ jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
27600+ info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
27601+#ifdef CONFIG_REISER4_BADBLOCKS
27602+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
27603+ &jloc->footer);
27604+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
27605+ &jloc->header);
27606+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
27607+ &info->loc.super);
27608+#endif
27609+ return 0;
27610+}
27611+
27612+/* find any valid super block of disk_format40 (even if the first
27613+   super block is destroyed); this will change the block numbers of the
27614+   actual journal header/footer (jh/jf) if needed */
27615+static struct buffer_head *find_a_disk_format40_super_block(struct super_block
27616+ *s)
27617+{
27618+ struct buffer_head *super_bh;
27619+ format40_disk_super_block *disk_sb;
27620+ format40_super_info *info;
27621+
27622+ assert("umka-487", s != NULL);
27623+
27624+ info = get_sb_info(s);
27625+
27626+ super_bh = sb_bread(s, info->loc.super);
27627+ if (super_bh == NULL)
27628+ return ERR_PTR(RETERR(-EIO));
27629+
27630+ disk_sb = (format40_disk_super_block *) super_bh->b_data;
27631+ if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
27632+ brelse(super_bh);
27633+ return ERR_PTR(RETERR(-EINVAL));
27634+ }
27635+
27636+ reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)));
27637+ reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) -
27638+ le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
27639+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
27640+
27641+ return super_bh;
27642+}
27643+
27644+/* find the most recent version of super block. This is called after journal is
27645+ replayed */
27646+static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
27647+{
27648+ /* Here the most recent superblock copy has to be read. However, as
27649+	   journal replay isn't complete, we use the
27650+	   find_a_disk_format40_super_block() function. */
27651+ return find_a_disk_format40_super_block(s);
27652+}
27653+
27654+static int get_super_jnode(struct super_block *s)
27655+{
27656+ reiser4_super_info_data *sbinfo = get_super_private(s);
27657+ jnode *sb_jnode;
27658+ int ret;
27659+
27660+ sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super);
27661+
27662+ ret = jload(sb_jnode);
27663+
27664+ if (ret) {
27665+ reiser4_drop_io_head(sb_jnode);
27666+ return ret;
27667+ }
27668+
27669+ pin_jnode_data(sb_jnode);
27670+ jrelse(sb_jnode);
27671+
27672+ sbinfo->u.format40.sb_jnode = sb_jnode;
27673+
27674+ return 0;
27675+}
27676+
27677+static void done_super_jnode(struct super_block *s)
27678+{
27679+ jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
27680+
27681+ if (sb_jnode) {
27682+ unpin_jnode_data(sb_jnode);
27683+ reiser4_drop_io_head(sb_jnode);
27684+ }
27685+}
27686+
27687+typedef enum format40_init_stage {
27688+ NONE_DONE = 0,
27689+ CONSULT_DISKMAP,
27690+ FIND_A_SUPER,
27691+ INIT_JOURNAL_INFO,
27692+ INIT_STATUS,
27693+ JOURNAL_REPLAY,
27694+ READ_SUPER,
27695+ KEY_CHECK,
27696+ INIT_OID,
27697+ INIT_TREE,
27698+ JOURNAL_RECOVER,
27699+ INIT_SA,
27700+ INIT_JNODE,
27701+ ALL_DONE
27702+} format40_init_stage;
27703+
27704+static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
27705+{
27706+ format40_disk_super_block *sb_copy;
27707+
27708+ sb_copy = kmalloc(sizeof(format40_disk_super_block),
27709+ reiser4_ctx_gfp_mask_get());
27710+ if (sb_copy == NULL)
27711+ return ERR_PTR(RETERR(-ENOMEM));
27712+ memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data),
27713+ sizeof(format40_disk_super_block));
27714+ return sb_copy;
27715+}
27716+
27717+static int check_key_format(const format40_disk_super_block *sb_copy)
27718+{
27719+ if (!equi(REISER4_LARGE_KEY,
27720+ get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
27721+ warning("nikita-3228", "Key format mismatch. "
27722+ "Only %s keys are supported.",
27723+ REISER4_LARGE_KEY ? "large" : "small");
27724+ return RETERR(-EINVAL);
27725+ }
27726+ return 0;
27727+}
27728+
27729+/**
27730+ * try_init_format40
27731+ * @super: super block to initialize the format40 layout for
27732+ * @stage: how far initialization got (used for error unwinding)
27733+ *
27734+ */
27735+static int try_init_format40(struct super_block *super,
27736+ format40_init_stage *stage)
27737+{
27738+ int result;
27739+ struct buffer_head *super_bh;
27740+ reiser4_super_info_data *sbinfo;
27741+ format40_disk_super_block *sb_copy;
27742+ tree_level height;
27743+ reiser4_block_nr root_block;
27744+ node_plugin *nplug;
27745+
27746+ assert("vs-475", super != NULL);
27747+ assert("vs-474", get_super_private(super));
27748+
27749+ *stage = NONE_DONE;
27750+
27751+ result = consult_diskmap(super);
27752+ if (result)
27753+ return result;
27754+ *stage = CONSULT_DISKMAP;
27755+
27756+ super_bh = find_a_disk_format40_super_block(super);
27757+ if (IS_ERR(super_bh))
27758+ return PTR_ERR(super_bh);
27759+ brelse(super_bh);
27760+ *stage = FIND_A_SUPER;
27761+
27762+ /* ok, we are sure that filesystem format is a format40 format */
27763+
27764+ /* map jnodes for journal control blocks (header, footer) to disk */
27765+ result = reiser4_init_journal_info(super);
27766+ if (result)
27767+ return result;
27768+ *stage = INIT_JOURNAL_INFO;
27769+
27770+	/* now that we are sure that the filesystem format is format40, */
27771+	/* check its state */
27772+ result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
27773+ if (result != 0 && result != -EINVAL)
27774+ /* -EINVAL means there is no magic, so probably just old
27775+ * fs. */
27776+ return result;
27777+ *stage = INIT_STATUS;
27778+
27779+ result = reiser4_status_query(NULL, NULL);
27780+ if (result == REISER4_STATUS_MOUNT_WARN)
27781+ notice("vpf-1363", "Warning: mounting %s with errors.",
27782+ super->s_id);
27783+ if (result == REISER4_STATUS_MOUNT_RO)
27784+ notice("vpf-1364", "Warning: mounting %s with fatal errors,"
27785+ " forcing read-only mount.", super->s_id);
27786+ result = reiser4_journal_replay(super);
27787+ if (result)
27788+ return result;
27789+ *stage = JOURNAL_REPLAY;
27790+
27791+ super_bh = read_super_block(super);
27792+ if (IS_ERR(super_bh))
27793+ return PTR_ERR(super_bh);
27794+ *stage = READ_SUPER;
27795+
27796+ /* allocate and make a copy of format40_disk_super_block */
27797+ sb_copy = copy_sb(super_bh);
27798+ brelse(super_bh);
27799+
27800+ if (IS_ERR(sb_copy))
27801+ return PTR_ERR(sb_copy);
27802+ printk("reiser4: %s: found disk format 4.0.%u.\n",
27803+ super->s_id,
27804+ get_format40_version(sb_copy));
27805+ if (incomplete_compatibility(sb_copy))
27806+ printk("reiser4: Warning: The last completely supported "
27807+ "version of disk format40 is %u. Some objects of "
27808+		       "the semantic tree may be inaccessible.\n",
27809+ FORMAT40_VERSION);
27810+ /* make sure that key format of kernel and filesystem match */
27811+ result = check_key_format(sb_copy);
27812+ if (result) {
27813+ kfree(sb_copy);
27814+ return result;
27815+ }
27816+ *stage = KEY_CHECK;
27817+
27818+ result = oid_init_allocator(super, get_format40_file_count(sb_copy),
27819+ get_format40_oid(sb_copy));
27820+ if (result) {
27821+ kfree(sb_copy);
27822+ return result;
27823+ }
27824+ *stage = INIT_OID;
27825+
27826+ /* get things necessary to init reiser4_tree */
27827+ root_block = get_format40_root_block(sb_copy);
27828+ height = get_format40_tree_height(sb_copy);
27829+ nplug = node_plugin_by_id(NODE40_ID);
27830+
27831+ /* initialize reiser4_super_info_data */
27832+ sbinfo = get_super_private(super);
27833+ assert("", sbinfo->tree.super == super);
27834+ /* init reiser4_tree for the filesystem */
27835+ result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug);
27836+ if (result) {
27837+ kfree(sb_copy);
27838+ return result;
27839+ }
27840+ *stage = INIT_TREE;
27841+
27842+ /*
27843+ * initialize reiser4_super_info_data with data from format40 super
27844+ * block
27845+ */
27846+ sbinfo->default_uid = 0;
27847+ sbinfo->default_gid = 0;
27848+ sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
27849+ /* number of blocks in filesystem and reserved space */
27850+ reiser4_set_block_count(super, get_format40_block_count(sb_copy));
27851+ sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
27852+	sbinfo->version = get_format40_version(sb_copy);
27853+
27854+	if (update_backup_version(sb_copy))
27855+		printk("reiser4: Warning: metadata backup is not updated. "
27856+		       "Please run 'fsck.reiser4 --fix' on %s.\n",
27857+		       super->s_id);
27858+	kfree(sb_copy);
27859+
27860+ sbinfo->fsuid = 0;
27861+ sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
27862+ * are not supported */
27863+ sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in
27864+ * layout 40 are
27865+ * of one
27866+ * plugin */
27867+ /* sbinfo->tmgr is initialized already */
27868+
27869+ /* recover sb data which were logged separately from sb block */
27870+
27871+ /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
27872+ * oid_init_allocator() and reiser4_set_free_blocks() with new
27873+ * data. What's the reason to call them above? */
27874+ result = reiser4_journal_recover_sb_data(super);
27875+ if (result != 0)
27876+ return result;
27877+ *stage = JOURNAL_RECOVER;
27878+
27879+ /*
27880+	 * Set the number of used blocks. The number of used blocks is stored
27881+	 * neither in the on-disk super block nor in the journal footer blocks. At
27882+ * this moment actual values of total blocks and free block counters
27883+ * are set in the reiser4 super block (in-memory structure) and we can
27884+ * calculate number of used blocks from them.
27885+ */
27886+ reiser4_set_data_blocks(super,
27887+ reiser4_block_count(super) -
27888+ reiser4_free_blocks(super));
27889+
27890+#if REISER4_DEBUG
27891+ sbinfo->min_blocks_used = 16 /* reserved area */ +
27892+ 2 /* super blocks */ +
27893+ 2 /* journal footer and header */ ;
27894+#endif
27895+
27896+ /* init disk space allocator */
27897+ result = sa_init_allocator(reiser4_get_space_allocator(super),
27898+ super, NULL);
27899+ if (result)
27900+ return result;
27901+ *stage = INIT_SA;
27902+
27903+ result = get_super_jnode(super);
27904+ if (result == 0)
27905+ *stage = ALL_DONE;
27906+ return result;
27907+}
27908+
27909+/* plugin->u.format.get_ready */
27910+int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
27911+{
27912+ int result;
27913+ format40_init_stage stage;
27914+
27915+ result = try_init_format40(s, &stage);
27916+ switch (stage) {
27917+ case ALL_DONE:
27918+ assert("nikita-3458", result == 0);
27919+ break;
27920+ case INIT_JNODE:
27921+ done_super_jnode(s);
27922+ case INIT_SA:
27923+ sa_destroy_allocator(reiser4_get_space_allocator(s), s);
27924+ case JOURNAL_RECOVER:
27925+ case INIT_TREE:
27926+ reiser4_done_tree(&get_super_private(s)->tree);
27927+ case INIT_OID:
27928+ case KEY_CHECK:
27929+ case READ_SUPER:
27930+ case JOURNAL_REPLAY:
27931+ case INIT_STATUS:
27932+ reiser4_status_finish();
27933+ case INIT_JOURNAL_INFO:
27934+ reiser4_done_journal_info(s);
27935+ case FIND_A_SUPER:
27936+ case CONSULT_DISKMAP:
27937+ case NONE_DONE:
27938+ break;
27939+ default:
27940+ impossible("nikita-3457", "init stage: %i", stage);
27941+ }
27942+
27943+ if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED)
27944+ return RETERR(-ENOSPC);
27945+
27946+ return result;
27947+}
27948+
27949+static void pack_format40_super(const struct super_block *s, char *data)
27950+{
27951+ format40_disk_super_block *super_data =
27952+ (format40_disk_super_block *) data;
27953+
27954+ reiser4_super_info_data *sbinfo = get_super_private(s);
27955+
27956+ assert("zam-591", data != NULL);
27957+
27958+ put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
27959+ &super_data->free_blocks);
27960+
27961+ put_unaligned(cpu_to_le64(sbinfo->tree.root_block),
27962+ &super_data->root_block);
27963+
27964+ put_unaligned(cpu_to_le64(oid_next(s)),
27965+ &super_data->oid);
27966+
27967+ put_unaligned(cpu_to_le64(oids_used(s)),
27968+ &super_data->file_count);
27969+
27970+ put_unaligned(cpu_to_le16(sbinfo->tree.height),
27971+ &super_data->tree_height);
27972+
27973+ if (update_disk_version(super_data)) {
27974+ __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP;
27975+
27976+ put_unaligned(cpu_to_le32(version), &super_data->version);
27977+ }
27978+}
27979+
27980+/* plugin->u.format.log_super
27981+ return a jnode which should be added to transaction when the super block
27982+ gets logged */
27983+jnode *log_super_format40(struct super_block *s)
27984+{
27985+ jnode *sb_jnode;
27986+
27987+ sb_jnode = get_super_private(s)->u.format40.sb_jnode;
27988+
27989+ jload(sb_jnode);
27990+
27991+ pack_format40_super(s, jdata(sb_jnode));
27992+
27993+ jrelse(sb_jnode);
27994+
27995+ return sb_jnode;
27996+}
27997+
27998+/* plugin->u.format.release */
27999+int release_format40(struct super_block *s)
28000+{
28001+ int ret;
28002+ reiser4_super_info_data *sbinfo;
28003+
28004+ sbinfo = get_super_private(s);
28005+ assert("zam-579", sbinfo != NULL);
28006+
28007+ if (!rofs_super(s)) {
28008+ ret = reiser4_capture_super_block(s);
28009+ if (ret != 0)
28010+ warning("vs-898",
28011+ "reiser4_capture_super_block failed: %d",
28012+ ret);
28013+
28014+ ret = txnmgr_force_commit_all(s, 1);
28015+ if (ret != 0)
28016+ warning("jmacd-74438", "txn_force failed: %d", ret);
28017+
28018+ all_grabbed2free();
28019+ }
28020+
28021+ sa_destroy_allocator(&sbinfo->space_allocator, s);
28022+ reiser4_done_journal_info(s);
28023+ done_super_jnode(s);
28024+
28025+ rcu_barrier();
28026+ reiser4_done_tree(&sbinfo->tree);
28027+	/* call rcu_barrier() again, because some znodes were "released" in
28028+	 * reiser4_done_tree(). */
28029+ rcu_barrier();
28030+
28031+ return 0;
28032+}
28033+
28034+#define FORMAT40_ROOT_LOCALITY 41
28035+#define FORMAT40_ROOT_OBJECTID 42
28036+
28037+/* plugin->u.format.root_dir_key */
28038+const reiser4_key *root_dir_key_format40(const struct super_block *super
28039+ UNUSED_ARG)
28040+{
28041+ static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
28042+ .el = {
28043+ __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
28044+#if REISER4_LARGE_KEY
28045+ ON_LARGE_KEY(0ull,)
28046+#endif
28047+ __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
28048+ 0ull
28049+ }
28050+ };
28051+
28052+ return &FORMAT40_ROOT_DIR_KEY;
28053+}
28054+
28055+/* plugin->u.format.check_open.
28056+   Check the opened object for validity. For now it checks only for a valid
28057+   oid & locality; this can be improved later, and its behavior may depend
28058+   on the mount options. */
28059+int check_open_format40(const struct inode *object)
28060+{
28061+ oid_t max, oid;
28062+
28063+ max = oid_next(object->i_sb) - 1;
28064+
28065+ /* Check the oid. */
28066+ oid = get_inode_oid(object);
28067+ if (oid > max) {
28068+		warning("vpf-1360", "An object with oid %llu "
28069+			"greater than the max used oid %llu was found.",
28070+ (unsigned long long)oid, (unsigned long long)max);
28071+
28072+ return RETERR(-EIO);
28073+ }
28074+
28075+ /* Check the locality. */
28076+ oid = reiser4_inode_data(object)->locality_id;
28077+ if (oid > max) {
28078+		warning("vpf-1361", "An object with locality %llu "
28079+			"greater than the max used oid %llu was found.",
28080+ (unsigned long long)oid, (unsigned long long)max);
28081+
28082+ return RETERR(-EIO);
28083+ }
28084+
28085+ return 0;
28086+}
28087+
28088+/* plugin->u.format.version_update.
28089+ Perform all version update operations to bring the on-disk
28090+ format40_disk_super_block.version up to FORMAT40_VERSION.
28091+ */
28092+int version_update_format40(struct super_block *super) {
28093+ txn_handle * trans;
28094+ lock_handle lh;
28095+ txn_atom *atom;
28096+ int ret;
28097+
28098+ /* Nothing to do on an RO mount or if the on-disk version is not older. */
28099+ if (super->s_flags & MS_RDONLY)
28100+ return 0;
28101+
28102+ if (get_super_private(super)->version >= FORMAT40_VERSION)
28103+ return 0;
28104+
28105+ printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata "
28106+ "backup is left unchanged. Please run 'fsck.reiser4 --fix' "
28107+ "on %s to update it too.\n", FORMAT40_VERSION, super->s_id);
28108+
28109+ /* Mark the uber znode dirty to call log_super on write_logs. */
28110+ init_lh(&lh);
28111+ ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK,
28112+ ZNODE_LOCK_HIPRI, &lh);
28113+ if (ret != 0)
28114+ return ret;
28115+
28116+ znode_make_dirty(lh.node);
28117+ done_lh(&lh);
28118+
28119+ /* Update the backup blocks. */
28120+
28121+ /* Force write_logs immediately. */
28122+ trans = get_current_context()->trans;
28123+ atom = get_current_atom_locked();
28124+ assert("vpf-1906", atom != NULL);
28125+
28126+ spin_lock_txnh(trans);
28127+ return force_commit_atom(trans);
28128+}
28129+
28130+/* Make Linus happy.
28131+ Local variables:
28132+ c-indentation-style: "K&R"
28133+ mode-name: "LC"
28134+ c-basic-offset: 8
28135+ tab-width: 8
28136+ fill-column: 120
28137+ scroll-step: 1
28138+ End:
28139+*/
28140diff -urN linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format40.h
28141--- linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format40.h 1970-01-01 03:00:00.000000000 +0300
28142+++ linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format40.h 2008-01-25 11:39:06.968215932 +0300
28143@@ -0,0 +1,109 @@
28144+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28145+
28146+/* this file contains:
28147+ - definition of the ondisk super block of the standard disk layout for
28148+ reiser 4.0 (layout 40)
28149+ - definition of layout 40 specific portion of in-core super block
28150+ - declarations of functions implementing methods of layout plugin
28151+ for layout 40
28152+ - declarations of functions used to get/set fields in layout 40 super block
28153+*/
28154+
28155+#ifndef __DISK_FORMAT40_H__
28156+#define __DISK_FORMAT40_H__
28157+
28158+/* magic for default reiser4 layout */
28159+#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
28160+#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
28161+
28162+#include "../../dformat.h"
28163+
28164+#include <linux/fs.h> /* for struct super_block */
28165+
28166+typedef enum {
28167+ FORMAT40_LARGE_KEYS
28168+} format40_flags;
28169+
28170+/* ondisk super block for format 40. It is 512 bytes long */
28171+typedef struct format40_disk_super_block {
28172+ /* 0 */ d64 block_count;
28173+ /* number of blocks in the filesystem */
28174+ /* 8 */ d64 free_blocks;
28175+ /* number of free blocks */
28176+ /* 16 */ d64 root_block;
28177+ /* filesystem tree root block */
28178+ /* 24 */ d64 oid;
28179+ /* smallest free objectid */
28180+ /* 32 */ d64 file_count;
28181+ /* number of files in a filesystem */
28182+ /* 40 */ d64 flushes;
28183+ /* number of times super block was
28184+ flushed. Needed if format 40
28185+ ever has several super blocks */
28186+ /* 48 */ d32 mkfs_id;
28187+ /* unique identifier of fs */
28188+ /* 52 */ char magic[16];
28189+ /* magic string ReIsEr40FoRmAt */
28190+ /* 68 */ d16 tree_height;
28191+ /* height of filesystem tree */
28192+ /* 70 */ d16 formatting_policy;
28193+ /* not used anymore */
28194+ /* 72 */ d64 flags;
28195+ /* 80 */ d32 version;
28196+ /* on-disk format version number
28197+ initially assigned by mkfs as the greatest format40
28198+ version number supported by reiser4progs and updated
28199+ at mount time in accordance with the greatest format40
28200+ version number supported by the kernel.
28201+ It is used by fsck to catch possible corruption and
28202+ for various compatibility issues */
28203+ /* 84 */ char not_used[428];
28204+} format40_disk_super_block;
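
The d16/d32/d64 fields above are stored little-endian on disk and are accessed through the unaligned helpers, mirroring the put_unaligned(cpu_to_le64(...)) calls in pack_format40_super() earlier in this patch. A minimal sketch of the corresponding reads (the example_* helper names are hypothetical, for illustration only):

	/* Sketch: endian-safe reads of on-disk super block fields. */
	static inline __u64 example_block_count(const format40_disk_super_block *sb)
	{
		return le64_to_cpu(get_unaligned(&sb->block_count));
	}

	static inline __u16 example_tree_height(const format40_disk_super_block *sb)
	{
		return le16_to_cpu(get_unaligned(&sb->tree_height));
	}
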
28205+
28206+/* format 40 specific part of reiser4_super_info_data */
28207+typedef struct format40_super_info {
28208+/* format40_disk_super_block actual_sb; */
28209+ jnode *sb_jnode;
28210+ struct {
28211+ reiser4_block_nr super;
28212+ } loc;
28213+} format40_super_info;
28214+
28215+/* Defines for journal header and footer respectively. */
28216+#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
28217+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
28218+
28219+#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
28220+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
28221+
28222+#define FORMAT40_STATUS_BLOCKNR \
28223+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
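
For concreteness: with the conventional reiser4 master super block offset of 65536 bytes and 4096-byte pages (both assumptions here, not values fixed by this hunk), REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE is 16, so the journal header, journal footer and status blocks land at block numbers 19, 20 and 21, while FORMAT40_OFFSET above puts the format40 super block one page after the master, at block 17.
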
28224+
28225+/* Diskmap declarations */
28226+#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
28227+#define FORMAT40_SUPER 1
28228+#define FORMAT40_JH 2
28229+#define FORMAT40_JF 3
28230+
28231+/* declarations of functions implementing methods of layout plugin for
28232+ format 40. The functions themselves are in disk_format40.c */
28233+extern int init_format_format40(struct super_block *, void *data);
28234+extern const reiser4_key *root_dir_key_format40(const struct super_block *);
28235+extern int release_format40(struct super_block *s);
28236+extern jnode *log_super_format40(struct super_block *s);
28237+extern int check_open_format40(const struct inode *object);
28238+extern int version_update_format40(struct super_block *super);
28239+
28240+/* __DISK_FORMAT40_H__ */
28241+#endif
28242+
28243+/* Make Linus happy.
28244+ Local variables:
28245+ c-indentation-style: "K&R"
28246+ mode-name: "LC"
28247+ c-basic-offset: 8
28248+ tab-width: 8
28249+ fill-column: 120
28250+ scroll-step: 1
28251+ End:
28252+*/
28253diff -urN linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format.c
28254--- linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format.c 1970-01-01 03:00:00.000000000 +0300
28255+++ linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format.c 2008-01-25 11:39:06.968215932 +0300
28256@@ -0,0 +1,38 @@
28257+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28258+
28259+#include "../../debug.h"
28260+#include "../plugin_header.h"
28261+#include "disk_format40.h"
28262+#include "disk_format.h"
28263+#include "../plugin.h"
28264+
28265+/* initialization of disk layout plugins */
28266+disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
28267+ [FORMAT40_ID] = {
28268+ .h = {
28269+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
28270+ .id = FORMAT40_ID,
28271+ .pops = NULL,
28272+ .label = "reiser40",
28273+ .desc = "standard disk layout for reiser40",
28274+ .linkage = {NULL, NULL}
28275+ },
28276+ .init_format = init_format_format40,
28277+ .root_dir_key = root_dir_key_format40,
28278+ .release = release_format40,
28279+ .log_super = log_super_format40,
28280+ .check_open = check_open_format40,
28281+ .version_update = version_update_format40
28282+ }
28283+};
28284+
28285+/* Make Linus happy.
28286+ Local variables:
28287+ c-indentation-style: "K&R"
28288+ mode-name: "LC"
28289+ c-basic-offset: 8
28290+ tab-width: 8
28291+ fill-column: 120
28292+ scroll-step: 1
28293+ End:
28294+*/
28295diff -urN linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format.h
28296--- linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 03:00:00.000000000 +0300
28297+++ linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format.h 2008-01-25 11:39:06.968215932 +0300
28298@@ -0,0 +1,27 @@
28299+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28300+
28301+/* identifiers for disk layouts, they are also used as indexes in array of disk
28302+ plugins */
28303+
28304+#if !defined( __REISER4_DISK_FORMAT_H__ )
28305+#define __REISER4_DISK_FORMAT_H__
28306+
28307+typedef enum {
28308+ /* standard reiser4 disk layout plugin id */
28309+ FORMAT40_ID,
28310+ LAST_FORMAT_ID
28311+} disk_format_id;
28312+
28313+/* __REISER4_DISK_FORMAT_H__ */
28314+#endif
28315+
28316+/* Make Linus happy.
28317+ Local variables:
28318+ c-indentation-style: "K&R"
28319+ mode-name: "LC"
28320+ c-basic-offset: 8
28321+ tab-width: 8
28322+ fill-column: 120
28323+ scroll-step: 1
28324+ End:
28325+*/
28326diff -urN linux-2.6.24.orig/fs/reiser4/plugin/disk_format/Makefile linux-2.6.24/fs/reiser4/plugin/disk_format/Makefile
28327--- linux-2.6.24.orig/fs/reiser4/plugin/disk_format/Makefile 1970-01-01 03:00:00.000000000 +0300
28328+++ linux-2.6.24/fs/reiser4/plugin/disk_format/Makefile 2008-01-25 11:39:06.968215932 +0300
28329@@ -0,0 +1,5 @@
28330+obj-$(CONFIG_REISER4_FS) += df_plugins.o
28331+
28332+df_plugins-objs := \
28333+ disk_format40.o \
28334+ disk_format.o
28335diff -urN linux-2.6.24.orig/fs/reiser4/plugin/fibration.c linux-2.6.24/fs/reiser4/plugin/fibration.c
28336--- linux-2.6.24.orig/fs/reiser4/plugin/fibration.c 1970-01-01 03:00:00.000000000 +0300
28337+++ linux-2.6.24/fs/reiser4/plugin/fibration.c 2008-01-25 11:39:06.968215932 +0300
28338@@ -0,0 +1,175 @@
28339+/* Copyright 2004 by Hans Reiser, licensing governed by
28340+ * reiser4/README */
28341+
28342+/* Directory fibrations */
28343+
28344+/*
28345+ * Suppose we have a directory tree with sources of some project. During
28346+ * compilation .o files are created within this tree. This makes access
28347+ * to the original source files less efficient, because source files are
28348+ * now "diluted" by object files: default directory plugin uses prefix
28349+ * of a file name as a part of the key for directory entry (and this
28350+ * part is also inherited by the key of file body). This means that
28351+ * foo.o will be located close to foo.c and foo.h in the tree.
28352+ *
28353+ * To avoid this effect directory plugin fill highest 7 (unused
28354+ * originally) bits of the second component of the directory entry key
28355+ * by bit-pattern depending on the file name (see
28356+ * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
28357+ * "fibre". Fibre of the file name key is inherited by key of stat data
28358+ * and keys of file body (in the case of REISER4_LARGE_KEY).
28359+ *
28360+ * Fibre for a given file is chosen by per-directory fibration
28361+ * plugin. Names within given fibre are ordered lexicographically.
28362+ */
28363+
28364+#include "../debug.h"
28365+#include "plugin_header.h"
28366+#include "plugin.h"
28367+#include "../super.h"
28368+#include "../inode.h"
28369+
28370+#include <linux/types.h>
28371+
28372+static const int fibre_shift = 57;
28373+
28374+#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
28375+
28376+/*
28377+ * Trivial fibration: all files of directory are just ordered
28378+ * lexicographically.
28379+ */
28380+static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
28381+{
28382+ return FIBRE_NO(0);
28383+}
28384+
28385+/*
28386+ * dot-o fibration: place .o files after all others.
28387+ */
28388+static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
28389+{
28390+ /* special treatment for .*\.o */
28391+ if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
28392+ return FIBRE_NO(1);
28393+ else
28394+ return FIBRE_NO(0);
28395+}
28396+
28397+/*
28398+ * ext.1 fibration: subdivide the directory into 128 fibres, one for each
28399+ * 7-bit extension character (file "foo.h" goes into fibre "h"), plus a
28400+ * default fibre for the rest.
28401+ */
28402+static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
28403+{
28404+ if (len > 2 && name[len - 2] == '.')
28405+ return FIBRE_NO(name[len - 1]);
28406+ else
28407+ return FIBRE_NO(0);
28408+}
28409+
28410+/*
28411+ * ext.3 fibration: try to separate files with different 3-character
28412+ * extensions from each other.
28413+ */
28414+static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
28415+{
28416+ if (len > 4 && name[len - 4] == '.')
28417+ return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
28418+ else
28419+ return FIBRE_NO(0);
28420+}
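
To make the four policies concrete, here is what the functions above return for a few sample names (derived directly from the code; FIBRE_NO(n) shifts n into the top 7 bits of the key component):

	fibre_trivial(dir, "foo.c", 5) == FIBRE_NO(0)
	fibre_dot_o(dir, "foo.o", 5)   == FIBRE_NO(1)   /* .o files sort after everything else */
	fibre_ext_1(dir, "foo.h", 5)   == FIBRE_NO('h') /* one fibre per 1-character extension */
	fibre_ext_3(dir, "foo.txt", 7) == FIBRE_NO('t' + 'x' + 't')
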
28421+
28422+static int change_fibration(struct inode *inode,
28423+ reiser4_plugin * plugin,
28424+ pset_member memb)
28425+{
28426+ int result;
28427+
28428+ assert("nikita-3503", inode != NULL);
28429+ assert("nikita-3504", plugin != NULL);
28430+
28431+ assert("nikita-3505", is_reiser4_inode(inode));
28432+ assert("nikita-3506", inode_dir_plugin(inode) != NULL);
28433+ assert("nikita-3507",
28434+ plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
28435+
28436+ result = 0;
28437+ if (inode_fibration_plugin(inode) == NULL ||
28438+ inode_fibration_plugin(inode)->h.id != plugin->h.id) {
28439+ if (is_dir_empty(inode) == 0)
28440+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
28441+ PSET_FIBRATION, plugin);
28442+ else
28443+ result = RETERR(-ENOTEMPTY);
28444+
28445+ }
28446+ return result;
28447+}
28448+
28449+static reiser4_plugin_ops fibration_plugin_ops = {
28450+ .init = NULL,
28451+ .load = NULL,
28452+ .save_len = NULL,
28453+ .save = NULL,
28454+ .change = change_fibration
28455+};
28456+
28457+/* fibration plugins */
28458+fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
28459+ [FIBRATION_LEXICOGRAPHIC] = {
28460+ .h = {
28461+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28462+ .id = FIBRATION_LEXICOGRAPHIC,
28463+ .pops = &fibration_plugin_ops,
28464+ .label = "lexicographic",
28465+ .desc = "no fibration",
28466+ .linkage = {NULL, NULL}
28467+ },
28468+ .fibre = fibre_trivial
28469+ },
28470+ [FIBRATION_DOT_O] = {
28471+ .h = {
28472+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28473+ .id = FIBRATION_DOT_O,
28474+ .pops = &fibration_plugin_ops,
28475+ .label = "dot-o",
28476+ .desc = "fibrate .o files separately",
28477+ .linkage = {NULL, NULL}
28478+ },
28479+ .fibre = fibre_dot_o
28480+ },
28481+ [FIBRATION_EXT_1] = {
28482+ .h = {
28483+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28484+ .id = FIBRATION_EXT_1,
28485+ .pops = &fibration_plugin_ops,
28486+ .label = "ext-1",
28487+ .desc = "fibrate file by single character extension",
28488+ .linkage = {NULL, NULL}
28489+ },
28490+ .fibre = fibre_ext_1
28491+ },
28492+ [FIBRATION_EXT_3] = {
28493+ .h = {
28494+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28495+ .id = FIBRATION_EXT_3,
28496+ .pops = &fibration_plugin_ops,
28497+ .label = "ext-3",
28498+ .desc = "fibrate file by three character extension",
28499+ .linkage = {NULL, NULL}
28500+ },
28501+ .fibre = fibre_ext_3
28502+ }
28503+};
28504+
28505+/*
28506+ * Local variables:
28507+ * c-indentation-style: "K&R"
28508+ * mode-name: "LC"
28509+ * c-basic-offset: 8
28510+ * tab-width: 8
28511+ * fill-column: 79
28512+ * End:
28513+ */
28514diff -urN linux-2.6.24.orig/fs/reiser4/plugin/fibration.h linux-2.6.24/fs/reiser4/plugin/fibration.h
28515--- linux-2.6.24.orig/fs/reiser4/plugin/fibration.h 1970-01-01 03:00:00.000000000 +0300
28516+++ linux-2.6.24/fs/reiser4/plugin/fibration.h 2008-01-25 11:39:06.968215932 +0300
28517@@ -0,0 +1,37 @@
28518+/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
28519+
28520+/* Fibration plugin used by hashed directory plugin to segment content
28521+ * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
28522+
28523+#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ )
28524+#define __FS_REISER4_PLUGIN_FIBRATION_H__
28525+
28526+#include "plugin_header.h"
28527+
28528+typedef struct fibration_plugin {
28529+ /* generic fields */
28530+ plugin_header h;
28531+
28532+ __u64(*fibre) (const struct inode * dir, const char *name, int len);
28533+} fibration_plugin;
28534+
28535+typedef enum {
28536+ FIBRATION_LEXICOGRAPHIC,
28537+ FIBRATION_DOT_O,
28538+ FIBRATION_EXT_1,
28539+ FIBRATION_EXT_3,
28540+ LAST_FIBRATION_ID
28541+} reiser4_fibration_id;
28542+
28543+/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
28544+#endif
28545+
28546+/* Make Linus happy.
28547+ Local variables:
28548+ c-indentation-style: "K&R"
28549+ mode-name: "LC"
28550+ c-basic-offset: 8
28551+ tab-width: 8
28552+ fill-column: 120
28553+ End:
28554+*/
28555diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/cryptcompress.c linux-2.6.24/fs/reiser4/plugin/file/cryptcompress.c
28556--- linux-2.6.24.orig/fs/reiser4/plugin/file/cryptcompress.c 1970-01-01 03:00:00.000000000 +0300
28557+++ linux-2.6.24/fs/reiser4/plugin/file/cryptcompress.c 2008-01-25 11:40:16.690167725 +0300
28558@@ -0,0 +1,3776 @@
28559+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
28560+ reiser4/README */
28561+/*
28562+ * Written by Edward Shishkin.
28563+ *
28564+ * Implementations of inode/file/address_space operations
28565+ * specific for cryptcompress file plugin which manages
28566+ * regular files built of compressed and(or) encrypted bodies.
28567+ * See http://dev.namesys.com/CryptcompressPlugin for details.
28568+ */
28569+
28570+#include "../../inode.h"
28571+#include "../cluster.h"
28572+#include "../object.h"
28573+#include "../../tree_walk.h"
28574+#include "cryptcompress.h"
28575+
28576+#include <linux/pagevec.h>
28577+#include <asm/uaccess.h>
28578+#include <linux/swap.h>
28579+#include <linux/writeback.h>
28580+#include <linux/random.h>
28581+#include <linux/scatterlist.h>
28582+
28583+/*
28584+ Managing primary and secondary caches by Reiser4
28585+ cryptcompress file plugin. Synchronization scheme.
28586+
28587+
28588+ +------------------+
28589+ +------------------->| tfm stream |
28590+ | | (compressed data)|
28591+ flush | +------------------+
28592+ +-----------------+ |
28593+ |(->)longterm lock| V
28594+--+ writepages() | | +-***-+ reiser4 +---+
28595+ | | +--+ | *** | storage tree | |
28596+ | | | +-***-+ (primary cache)| |
28597+u | write() (secondary| cache) V / | \ | |
28598+s | ----> +----+ +----+ +----+ +----+ +-***** ******* **----+ ----> | d |
28599+e | | | |page cluster | | | **disk cluster** | | i |
28600+r | <---- +----+ +----+ +----+ +----+ +-***** **********----+ <---- | s |
28601+ | read() ^ ^ | | k |
28602+ | | (->)longterm lock| | page_io()| |
28603+ | | +------+ | |
28604+--+ readpages() | | +---+
28605+ | V
28606+ | +------------------+
28607+ +--------------------| tfm stream |
28608+ | (plain text) |
28609+ +------------------+
28610+*/
28611+
28612+/* get cryptcompress specific portion of inode */
28613+struct cryptcompress_info *cryptcompress_inode_data(const struct inode *inode)
28614+{
28615+ return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
28616+}
28617+
28618+/* plugin->u.file.init_inode_data */
28619+void init_inode_data_cryptcompress(struct inode *inode,
28620+ reiser4_object_create_data * crd,
28621+ int create)
28622+{
28623+ struct cryptcompress_info *data;
28624+
28625+ data = cryptcompress_inode_data(inode);
28626+ assert("edward-685", data != NULL);
28627+
28628+ memset(data, 0, sizeof(*data));
28629+
28630+ mutex_init(&data->checkin_mutex);
28631+ data->trunc_index = ULONG_MAX;
28632+ turn_on_compression(data);
28633+ set_lattice_factor(data, MIN_LATTICE_FACTOR);
28634+ init_inode_ordering(inode, crd, create);
28635+}
28636+
28637+/* The following is a part of the reiser4 cipher key manager
28638+ which is called when opening/creating a cryptcompress file */
28639+
28640+/* get/set cipher key info */
28641+struct reiser4_crypto_info * inode_crypto_info (struct inode * inode)
28642+{
28643+ assert("edward-90", inode != NULL);
28644+ assert("edward-91", reiser4_inode_data(inode) != NULL);
28645+ return cryptcompress_inode_data(inode)->crypt;
28646+}
28647+
28648+static void set_inode_crypto_info (struct inode * inode,
28649+ struct reiser4_crypto_info * info)
28650+{
28651+ cryptcompress_inode_data(inode)->crypt = info;
28652+}
28653+
28654+/* allocate a cipher key info */
28655+struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode)
28656+{
28657+ struct reiser4_crypto_info *info;
28658+ int fipsize;
28659+
28660+ info = kzalloc(sizeof(*info), reiser4_ctx_gfp_mask_get());
28661+ if (!info)
28662+ return ERR_PTR(-ENOMEM);
28663+
28664+ fipsize = inode_digest_plugin(inode)->fipsize;
28665+ info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get());
28666+ if (!info->keyid) {
28667+ kfree(info);
28668+ return ERR_PTR(-ENOMEM);
28669+ }
28670+ info->host = inode;
28671+ return info;
28672+}
28673+
28674+#if 0
28675+/* allocate/free low-level info for cipher and digest
28676+ transforms */
28677+static int alloc_crypto_tfms(struct reiser4_crypto_info * info)
28678+{
28679+ struct crypto_blkcipher * ctfm = NULL;
28680+ struct crypto_hash * dtfm = NULL;
28681+ cipher_plugin * cplug = inode_cipher_plugin(info->host);
28682+ digest_plugin * dplug = inode_digest_plugin(info->host);
28683+
28684+ if (cplug->alloc) {
28685+ ctfm = cplug->alloc();
28686+ if (IS_ERR(ctfm)) {
28687+ warning("edward-1364",
28688+ "Can not allocate info for %s\n",
28689+ cplug->h.desc);
28690+ return RETERR(PTR_ERR(ctfm));
28691+ }
28692+ }
28693+ info_set_cipher(info, ctfm);
28694+ if (dplug->alloc) {
28695+ dtfm = dplug->alloc();
28696+ if (IS_ERR(dtfm)) {
28697+ warning("edward-1365",
28698+ "Can not allocate info for %s\n",
28699+ dplug->h.desc);
28700+ goto unhappy_with_digest;
28701+ }
28702+ }
28703+ info_set_digest(info, dtfm);
28704+ return 0;
28705+ unhappy_with_digest:
28706+ if (cplug->free) {
28707+ cplug->free(ctfm);
28708+ info_set_cipher(info, NULL);
28709+ }
28710+ return RETERR(PTR_ERR(dtfm));
28711+}
28712+#endif
28713+
28714+static void
28715+free_crypto_tfms(struct reiser4_crypto_info * info)
28716+{
28717+ assert("edward-1366", info != NULL);
28718+ if (!info_get_cipher(info)) {
28719+ assert("edward-1601", !info_get_digest(info));
28720+ return;
28721+ }
28722+ inode_cipher_plugin(info->host)->free(info_get_cipher(info));
28723+ info_set_cipher(info, NULL);
28724+ inode_digest_plugin(info->host)->free(info_get_digest(info));
28725+ info_set_digest(info, NULL);
28726+ return;
28727+}
28728+
28729+#if 0
28730+/* create a key fingerprint for disk stat-data */
28731+static int create_keyid (struct reiser4_crypto_info * info,
28732+ struct reiser4_crypto_data * data)
28733+{
28734+ int ret = -ENOMEM;
28735+ size_t blk, pad;
28736+ __u8 * dmem;
28737+ __u8 * cmem;
28738+ struct hash_desc ddesc;
28739+ struct blkcipher_desc cdesc;
28740+ struct scatterlist sg;
28741+
28742+ assert("edward-1367", info != NULL);
28743+ assert("edward-1368", info->keyid != NULL);
28744+
28745+ ddesc.tfm = info_get_digest(info);
28746+ ddesc.flags = 0;
28747+ cdesc.tfm = info_get_cipher(info);
28748+ cdesc.flags = 0;
28749+
28750+ dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm),
28751+ reiser4_ctx_gfp_mask_get());
28752+ if (!dmem)
28753+ goto exit1;
28754+
28755+ blk = crypto_blkcipher_blocksize(cdesc.tfm);
28756+
28757+ pad = data->keyid_size % blk;
28758+ pad = (pad ? blk - pad : 0);
28759+
28760+ cmem = kmalloc((size_t)data->keyid_size + pad,
28761+ reiser4_ctx_gfp_mask_get());
28762+ if (!cmem)
28763+ goto exit2;
28764+ memcpy(cmem, data->keyid, data->keyid_size);
28765+ memset(cmem + data->keyid_size, 0, pad);
28766+
28767+ sg_init_one(&sg, cmem, data->keyid_size + pad);
28768+
28769+ ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg,
28770+ data->keyid_size + pad);
28771+ if (ret) {
28772+ warning("edward-1369",
28773+ "encryption failed flags=%x\n", cdesc.flags);
28774+ goto exit3;
28775+ }
28776+ ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem);
28777+ if (ret) {
28778+ warning("edward-1602",
28779+ "digest failed flags=%x\n", ddesc.flags);
28780+ goto exit3;
28781+ }
28782+ memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize);
28783+ exit3:
28784+ kfree(cmem);
28785+ exit2:
28786+ kfree(dmem);
28787+ exit1:
28788+ return ret;
28789+}
28790+#endif
28791+
28792+static void destroy_keyid(struct reiser4_crypto_info * info)
28793+{
28794+ assert("edward-1370", info != NULL);
28795+ assert("edward-1371", info->keyid != NULL);
28796+ kfree(info->keyid);
28797+ return;
28798+}
28799+
28800+static void __free_crypto_info (struct inode * inode)
28801+{
28802+ struct reiser4_crypto_info * info = inode_crypto_info(inode);
28803+ assert("edward-1372", info != NULL);
28804+
28805+ free_crypto_tfms(info);
28806+ destroy_keyid(info);
28807+ kfree(info);
28808+}
28809+
28810+#if 0
28811+static void instantiate_crypto_info(struct reiser4_crypto_info * info)
28812+{
28813+ assert("edward-1373", info != NULL);
28814+ assert("edward-1374", info->inst == 0);
28815+ info->inst = 1;
28816+}
28817+#endif
28818+
28819+static void uninstantiate_crypto_info(struct reiser4_crypto_info * info)
28820+{
28821+ assert("edward-1375", info != NULL);
28822+ info->inst = 0;
28823+}
28824+
28825+#if 0
28826+static int is_crypto_info_instantiated(struct reiser4_crypto_info * info)
28827+{
28828+ return info->inst;
28829+}
28830+
28831+static int inode_has_cipher_key(struct inode * inode)
28832+{
28833+ assert("edward-1376", inode != NULL);
28834+ return inode_crypto_info(inode) &&
28835+ is_crypto_info_instantiated(inode_crypto_info(inode));
28836+}
28837+#endif
28838+
28839+static void free_crypto_info (struct inode * inode)
28840+{
28841+ uninstantiate_crypto_info(inode_crypto_info(inode));
28842+ __free_crypto_info(inode);
28843+}
28844+
28845+static int need_cipher(struct inode * inode)
28846+{
28847+ return inode_cipher_plugin(inode) !=
28848+ cipher_plugin_by_id(NONE_CIPHER_ID);
28849+}
28850+
28851+/* Parse @data, which contains an (uninstantiated) cipher key imported
28852+ from user space, create a low-level cipher info and attach it to
28853+ the @object. On success, info contains an instantiated key */
28854+#if 0
28855+struct reiser4_crypto_info * create_crypto_info(struct inode * object,
28856+ struct reiser4_crypto_data * data)
28857+{
28858+ int ret;
28859+ struct reiser4_crypto_info * info;
28860+
28861+ assert("edward-1377", data != NULL);
28862+ assert("edward-1378", need_cipher(object));
28863+
28864+ if (inode_file_plugin(object) !=
28865+ file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
28866+ return ERR_PTR(-EINVAL);
28867+
28868+ info = reiser4_alloc_crypto_info(object);
28869+ if (IS_ERR(info))
28870+ return info;
28871+ ret = alloc_crypto_tfms(info);
28872+ if (ret)
28873+ goto err;
28874+ /* instantiating a key */
28875+ ret = crypto_blkcipher_setkey(info_get_cipher(info),
28876+ data->key,
28877+ data->keysize);
28878+ if (ret) {
28879+ warning("edward-1379",
28880+ "setkey failed flags=%x",
28881+ crypto_blkcipher_get_flags(info_get_cipher(info)));
28882+ goto err;
28883+ }
28884+ info->keysize = data->keysize;
28885+ ret = create_keyid(info, data);
28886+ if (ret)
28887+ goto err;
28888+ instantiate_crypto_info(info);
28889+ return info;
28890+ err:
28891+ __free_crypto_info(object);
28892+ return ERR_PTR(ret);
28893+}
28894+#endif
28895+
28896+/* increment/decrement a load counter when
28897+ attaching/detaching the crypto-stat to/from an object */
28898+static void load_crypto_info(struct reiser4_crypto_info * info)
28899+{
28900+ assert("edward-1380", info != NULL);
28901+ inc_keyload_count(info);
28902+}
28903+
28904+static void unload_crypto_info(struct inode * inode)
28905+{
28906+ struct reiser4_crypto_info * info = inode_crypto_info(inode);
28907+ assert("edward-1381", info->keyload_count > 0);
28908+
28909+ dec_keyload_count(inode_crypto_info(inode));
28910+ if (info->keyload_count == 0)
28911+ /* final release */
28912+ free_crypto_info(inode);
28913+}
28914+
28915+/* attach/detach an existing crypto-stat */
28916+void reiser4_attach_crypto_info(struct inode * inode,
28917+ struct reiser4_crypto_info * info)
28918+{
28919+ assert("edward-1382", inode != NULL);
28920+ assert("edward-1383", info != NULL);
28921+ assert("edward-1384", inode_crypto_info(inode) == NULL);
28922+
28923+ set_inode_crypto_info(inode, info);
28924+ load_crypto_info(info);
28925+}
28926+
28927+/* returns true if a crypto-stat can be attached to the @host */
28928+#if REISER4_DEBUG
28929+static int host_allows_crypto_info(struct inode * host)
28930+{
28931+ int ret;
28932+ file_plugin * fplug = inode_file_plugin(host);
28933+
28934+ switch (fplug->h.id) {
28935+ case CRYPTCOMPRESS_FILE_PLUGIN_ID:
28936+ ret = 1;
28937+ break;
28938+ default:
28939+ ret = 0;
28940+ }
28941+ return ret;
28942+}
28943+#endif /* REISER4_DEBUG */
28944+
28945+static void reiser4_detach_crypto_info(struct inode * inode)
28946+{
28947+ assert("edward-1385", inode != NULL);
28948+ assert("edward-1386", host_allows_crypto_info(inode));
28949+
28950+ if (inode_crypto_info(inode))
28951+ unload_crypto_info(inode);
28952+ set_inode_crypto_info(inode, NULL);
28953+}
28954+
28955+#if 0
28956+
28957+/* compare fingerprints of @child and @parent */
28958+static int keyid_eq(struct reiser4_crypto_info * child,
28959+ struct reiser4_crypto_info * parent)
28960+{
28961+ return !memcmp(child->keyid,
28962+ parent->keyid,
28963+ info_digest_plugin(parent)->fipsize);
28964+}
28965+
28966+/* check if a crypto-stat (which is bound to @parent) can be inherited */
28967+int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent)
28968+{
28969+ if (!need_cipher(child))
28970+ return 0;
28971+ /* the child is created */
28972+ if (!inode_crypto_info(child))
28973+ return 1;
28974+ /* the child is looked up */
28975+ if (!inode_crypto_info(parent))
28976+ return 0;
28977+ return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
28978+ inode_digest_plugin(child) == inode_digest_plugin(parent) &&
28979+ inode_crypto_info(child)->keysize ==
28980+ inode_crypto_info(parent)->keysize &&
28981+ keyid_eq(inode_crypto_info(child), inode_crypto_info(parent)));
28982+}
28983+#endif
28984+
28985+/* helper functions for ->create() method of the cryptcompress plugin */
28986+static int inode_set_crypto(struct inode * object)
28987+{
28988+ reiser4_inode * info;
28989+ if (!inode_crypto_info(object)) {
28990+ if (need_cipher(object))
28991+ return RETERR(-EINVAL);
28992+ /* the file is not to be encrypted */
28993+ return 0;
28994+ }
28995+ info = reiser4_inode_data(object);
28996+ info->extmask |= (1 << CRYPTO_STAT);
28997+ return 0;
28998+}
28999+
29000+static int inode_init_compression(struct inode * object)
29001+{
29002+ int result = 0;
29003+ assert("edward-1461", object != NULL);
29004+ if (inode_compression_plugin(object)->init)
29005+ result = inode_compression_plugin(object)->init();
29006+ return result;
29007+}
29008+
29009+static int inode_check_cluster(struct inode * object)
29010+{
29011+ assert("edward-696", object != NULL);
29012+
29013+ if (unlikely(inode_cluster_size(object) < PAGE_CACHE_SIZE)) {
29014+ warning("edward-1320", "Can not support '%s' "
29015+ "logical clusters (less then page size)",
29016+ inode_cluster_plugin(object)->h.label);
29017+ return RETERR(-EINVAL);
29018+ }
29019+ if (unlikely(inode_cluster_shift(object) >= BITS_PER_BYTE * sizeof(int))) {
29020+ warning("edward-1463", "Can not support '%s' "
29021+ "logical clusters (too big for transform)",
29022+ inode_cluster_plugin(object)->h.label);
29023+ return RETERR(-EINVAL);
29024+ }
29025+ return 0;
29026+}
29027+
29028+/* plugin->destroy_inode() */
29029+void destroy_inode_cryptcompress(struct inode * inode)
29030+{
29031+ assert("edward-1464", INODE_PGCOUNT(inode) == 0);
29032+ reiser4_detach_crypto_info(inode);
29033+ return;
29034+}
29035+
29036+/* plugin->create_object():
29037+. install plugins
29038+. attach crypto info if specified
29039+. attach compression info if specified
29040+. attach cluster info
29041+*/
29042+int create_object_cryptcompress(struct inode *object, struct inode *parent,
29043+ reiser4_object_create_data * data)
29044+{
29045+ int result;
29046+ reiser4_inode *info;
29047+
29048+ assert("edward-23", object != NULL);
29049+ assert("edward-24", parent != NULL);
29050+ assert("edward-30", data != NULL);
29051+ assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD));
29052+ assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID);
29053+
29054+ info = reiser4_inode_data(object);
29055+
29056+ assert("edward-29", info != NULL);
29057+
29058+ /* set file bit */
29059+ info->plugin_mask |= (1 << PSET_FILE);
29060+
29061+ /* set crypto */
29062+ result = inode_set_crypto(object);
29063+ if (result)
29064+ goto error;
29065+ /* set compression */
29066+ result = inode_init_compression(object);
29067+ if (result)
29068+ goto error;
29069+ /* set cluster */
29070+ result = inode_check_cluster(object);
29071+ if (result)
29072+ goto error;
29073+
29074+ /* save everything in disk stat-data */
29075+ result = write_sd_by_inode_common(object);
29076+ if (!result)
29077+ return 0;
29078+ error:
29079+ reiser4_detach_crypto_info(object);
29080+ return result;
29081+}
29082+
29083+/* plugin->open() */
29084+int open_cryptcompress(struct inode * inode, struct file * file)
29085+{
29086+ return 0;
29087+}
29088+
29089+/* returns the block size, an attribute of the cipher algorithm */
29090+static unsigned int
29091+cipher_blocksize(struct inode * inode)
29092+{
29093+ assert("edward-758", need_cipher(inode));
29094+ assert("edward-1400", inode_crypto_info(inode) != NULL);
29095+ return crypto_blkcipher_blocksize
29096+ (info_get_cipher(inode_crypto_info(inode)));
29097+}
29098+
29099+/* returns the offset translated by the scale factor of the crypto algorithm */
29100+static loff_t inode_scaled_offset (struct inode * inode,
29101+ const loff_t src_off /* input offset */)
29102+{
29103+ assert("edward-97", inode != NULL);
29104+
29105+ if (!need_cipher(inode) ||
29106+ src_off == get_key_offset(reiser4_min_key()) ||
29107+ src_off == get_key_offset(reiser4_max_key()))
29108+ return src_off;
29109+
29110+ return inode_cipher_plugin(inode)->scale(inode,
29111+ cipher_blocksize(inode),
29112+ src_off);
29113+}
29114+
29115+/* returns disk cluster size */
29116+size_t inode_scaled_cluster_size(struct inode * inode)
29117+{
29118+ assert("edward-110", inode != NULL);
29119+
29120+ return inode_scaled_offset(inode, inode_cluster_size(inode));
29121+}
29122+
29123+/* set number of cluster pages */
29124+static void set_cluster_nrpages(struct cluster_handle * clust,
29125+ struct inode *inode)
29126+{
29127+ struct reiser4_slide * win;
29128+
29129+ assert("edward-180", clust != NULL);
29130+ assert("edward-1040", inode != NULL);
29131+
29132+ clust->old_nrpages = size_in_pages(lbytes(clust->index, inode));
29133+ win = clust->win;
29134+ if (!win) {
29135+ clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
29136+ return;
29137+ }
29138+ assert("edward-1176", clust->op != LC_INVAL);
29139+ assert("edward-1064", win->off + win->count + win->delta != 0);
29140+
29141+ if (win->stat == HOLE_WINDOW &&
29142+ win->off == 0 && win->count == inode_cluster_size(inode)) {
29143+ /* special case: writing a "fake" logical cluster */
29144+ clust->nr_pages = 0;
29145+ return;
29146+ }
29147+ clust->nr_pages = size_in_pages(max(win->off + win->count + win->delta,
29148+ lbytes(clust->index, inode)));
29149+ return;
29150+}
29151+
29152+/* plugin->key_by_inode()
29153+ build key of a disk cluster */
29154+int key_by_inode_cryptcompress(struct inode *inode, loff_t off,
29155+ reiser4_key * key)
29156+{
29157+ assert("edward-64", inode != 0);
29158+
29159+ if (likely(off != get_key_offset(reiser4_max_key())))
29160+ off = off_to_clust_to_off(off, inode);
29161+ if (inode_crypto_info(inode))
29162+ off = inode_scaled_offset(inode, off);
29163+
29164+ key_by_inode_and_offset_common(inode, 0, key);
29165+ set_key_offset(key, (__u64)off);
29166+ return 0;
29167+}
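
Assuming off_to_clust_to_off() rounds an offset down to the start of its logical cluster (as its use here implies), the key offset for a write at byte 70000 in a file with 64KiB logical clusters works out as follows (illustrative numbers):

	off = 70000;                            /* byte offset within the file */
	off = off_to_clust_to_off(off, inode);  /* -> 65536, start of cluster 1 */
	/* with a cipher attached, additionally: */
	off = inode_scaled_offset(inode, off);  /* scaled by the cipher plugin */
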
29168+
29169+/* plugin->flow_by_inode() */
29170+/* flow is used to read/write disk clusters */
29171+int flow_by_inode_cryptcompress(struct inode *inode, const char __user * buf,
29172+ int user, /* 1: @buf is of user space,
29173+ 0: kernel space */
29174+ loff_t size, /* @buf size */
29175+ loff_t off, /* offset to start io from */
29176+ rw_op op, /* READ or WRITE */
29177+ flow_t * f /* resulting flow */)
29178+{
29179+ assert("edward-436", f != NULL);
29180+ assert("edward-149", inode != NULL);
29181+ assert("edward-150", inode_file_plugin(inode) != NULL);
29182+ assert("edward-1465", user == 0); /* we use flow to read/write
29183+ disk clusters located in
29184+ kernel space */
29185+ f->length = size;
29186+ memcpy(&f->data, &buf, sizeof(buf));
29187+ f->user = user;
29188+ f->op = op;
29189+
29190+ return key_by_inode_cryptcompress(inode, off, &f->key);
29191+}
29192+
29193+static int
29194+cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key,
29195+ znode_lock_mode lock_mode)
29196+{
29197+ coord_t *coord;
29198+
29199+ assert("edward-704", hint != NULL);
29200+ assert("edward-1089", !hint_is_valid(hint));
29201+ assert("edward-706", hint->lh.owner == NULL);
29202+
29203+ coord = &hint->ext_coord.coord;
29204+
29205+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
29206+ /* hint either not set or set by different operation */
29207+ return RETERR(-E_REPEAT);
29208+
29209+ if (get_key_offset(key) != hint->offset)
29210+ /* hint is set for different key */
29211+ return RETERR(-E_REPEAT);
29212+
29213+ assert("edward-707", reiser4_schedulable());
29214+
29215+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord,
29216+ key, &hint->lh, lock_mode,
29217+ ZNODE_LOCK_LOPRI);
29218+}
29219+
29220+/* reserve disk space when writing a logical cluster */
29221+static int reserve4cluster(struct inode *inode, struct cluster_handle *clust)
29222+{
29223+ int result = 0;
29224+
29225+ assert("edward-965", reiser4_schedulable());
29226+ assert("edward-439", inode != NULL);
29227+ assert("edward-440", clust != NULL);
29228+ assert("edward-441", clust->pages != NULL);
29229+
29230+ if (clust->nr_pages == 0) {
29231+ assert("edward-1152", clust->win != NULL);
29232+ assert("edward-1153", clust->win->stat == HOLE_WINDOW);
29233+ /* don't reserve disk space for fake logical cluster */
29234+ return 0;
29235+ }
29236+ assert("edward-442", jprivate(clust->pages[0]) != NULL);
29237+
29238+ result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
29239+ estimate_update_cluster(inode),
29240+ BA_CAN_COMMIT);
29241+ if (result)
29242+ return result;
29243+ clust->reserved = 1;
29244+ grabbed2cluster_reserved(estimate_insert_cluster(inode) +
29245+ estimate_update_cluster(inode));
29246+#if REISER4_DEBUG
29247+ clust->reserved_prepped = estimate_update_cluster(inode);
29248+ clust->reserved_unprepped = estimate_insert_cluster(inode);
29249+#endif
29250+ /* there can be space grabbed by txnmgr_force_commit_all */
29251+ return 0;
29252+}
29253+
29254+/* free reserved disk space if writing a logical cluster fails */
29255+static void free_reserved4cluster(struct inode *inode,
29256+ struct cluster_handle *ch, int count)
29257+{
29258+ assert("edward-967", ch->reserved == 1);
29259+
29260+ cluster_reserved2free(count);
29261+ ch->reserved = 0;
29262+}
29263+
29264+/* The core search procedure of the cryptcompress plugin.
29265+ If the returned value is not cbk_errored, then the current znode is locked */
29266+static int find_cluster_item(hint_t * hint,
29267+ const reiser4_key * key, /* key of the item we are
29268+ looking for */
29269+ znode_lock_mode lock_mode /* which lock */ ,
29270+ ra_info_t * ra_info, lookup_bias bias, __u32 flags)
29271+{
29272+ int result;
29273+ reiser4_key ikey;
29274+ int went_right = 0;
29275+ coord_t *coord = &hint->ext_coord.coord;
29276+ coord_t orig = *coord;
29277+
29278+ assert("edward-152", hint != NULL);
29279+
29280+ if (!hint_is_valid(hint)) {
29281+ result = cryptcompress_hint_validate(hint, key, lock_mode);
29282+ if (result == -E_REPEAT)
29283+ goto traverse_tree;
29284+ else if (result) {
29285+ assert("edward-1216", 0);
29286+ return result;
29287+ }
29288+ hint_set_valid(hint);
29289+ }
29290+ assert("edward-709", znode_is_any_locked(coord->node));
29291+
29292+ /* An in-place lookup is going on here; it means we just need to
29293+ check whether the next item at @coord matches the @key */
29294+
29295+ if (equal_to_rdk(coord->node, key)) {
29296+ result = goto_right_neighbor(coord, &hint->lh);
29297+ if (result == -E_NO_NEIGHBOR) {
29298+ assert("edward-1217", 0);
29299+ return RETERR(-EIO);
29300+ }
29301+ if (result)
29302+ return result;
29303+ assert("edward-1218", equal_to_ldk(coord->node, key));
29304+ went_right = 1;
29305+ } else {
29306+ coord->item_pos++;
29307+ coord->unit_pos = 0;
29308+ coord->between = AT_UNIT;
29309+ }
29310+ result = zload(coord->node);
29311+ if (result)
29312+ return result;
29313+ assert("edward-1219", !node_is_empty(coord->node));
29314+
29315+ if (!coord_is_existing_item(coord)) {
29316+ zrelse(coord->node);
29317+ goto not_found;
29318+ }
29319+ item_key_by_coord(coord, &ikey);
29320+ zrelse(coord->node);
29321+ if (!keyeq(key, &ikey))
29322+ goto not_found;
29323+ /* Ok, item is found, update node counts */
29324+ if (went_right)
29325+ dclust_inc_extension_ncount(hint);
29326+ return CBK_COORD_FOUND;
29327+
29328+ not_found:
29329+ assert("edward-1220", coord->item_pos > 0);
29330+ //coord->item_pos--;
29331+ /* roll back */
29332+ *coord = orig;
29333+ ON_DEBUG(coord_update_v(coord));
29334+ return CBK_COORD_NOTFOUND;
29335+
29336+ traverse_tree:
29337+ assert("edward-713", hint->lh.owner == NULL);
29338+ assert("edward-714", reiser4_schedulable());
29339+
29340+ reiser4_unset_hint(hint);
29341+ dclust_init_extension(hint);
29342+ coord_init_zero(coord);
29343+ result = coord_by_key(current_tree, key, coord, &hint->lh,
29344+ lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
29345+ CBK_UNIQUE | flags, ra_info);
29346+ if (cbk_errored(result))
29347+ return result;
29348+ if(result == CBK_COORD_FOUND)
29349+ dclust_inc_extension_ncount(hint);
29350+ hint_set_valid(hint);
29351+ return result;
29352+}
29353+
29354+/* This function is called by the deflate/inflate manager when
29355+ creating a transformed/plain stream to check if we should
29356+ create/cut some overhead. If this returns true, then @oh
29357+ contains the size of this overhead.
29358+ */
29359+static int need_cut_or_align(struct inode * inode,
29360+ struct cluster_handle * ch, rw_op rw, int * oh)
29361+{
29362+ struct tfm_cluster * tc = &ch->tc;
29363+ switch (rw) {
29364+ case WRITE_OP: /* estimate align */
29365+ *oh = tc->len % cipher_blocksize(inode);
29366+ if (*oh != 0)
29367+ return 1;
29368+ break;
29369+ case READ_OP: /* estimate cut */
29370+ *oh = *(tfm_output_data(ch) + tc->len - 1);
29371+ break;
29372+ default:
29373+ impossible("edward-1401", "bad option");
29374+ }
29375+ return (tc->len != tc->lsize);
29376+}
29377+
29378+/* create/cut an overhead of transformed/plain stream */
29379+static void align_or_cut_overhead(struct inode * inode,
29380+ struct cluster_handle * ch, rw_op rw)
29381+{
29382+ int oh;
29383+ cipher_plugin * cplug = inode_cipher_plugin(inode);
29384+
29385+ assert("edward-1402", need_cipher(inode));
29386+
29387+ if (!need_cut_or_align(inode, ch, rw, &oh))
29388+ return;
29389+ switch (rw) {
29390+ case WRITE_OP: /* do align */
29391+ ch->tc.len +=
29392+ cplug->align_stream(tfm_input_data(ch) +
29393+ ch->tc.len, ch->tc.len,
29394+ cipher_blocksize(inode));
29395+ *(tfm_input_data(ch) + ch->tc.len - 1) =
29396+ cipher_blocksize(inode) - oh;
29397+ break;
29398+ case READ_OP: /* do cut */
29399+ assert("edward-1403", oh <= cipher_blocksize(inode));
29400+ ch->tc.len -= oh;
29401+ break;
29402+ default:
29403+ impossible("edward-1404", "bad option");
29404+ }
29405+ return;
29406+}
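
A worked instance of the arithmetic above, assuming a 16-byte cipher block and that ->align_stream() returns the number of pad bytes it appended (which is how it is used here): on the write path with tc.len == 29995, need_cut_or_align() computes oh = 29995 % 16 = 11, align_stream() pads the stream by 5 bytes to 30000, and the final byte (the control byte) is set to 16 - 11 = 5; on the read path that control byte is read back as oh and the same 5 bytes are cut off.
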
29407+
29408+static unsigned max_cipher_overhead(struct inode * inode)
29409+{
29410+ if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
29411+ return 0;
29412+ return cipher_blocksize(inode);
29413+}
29414+
29415+static int deflate_overhead(struct inode *inode)
29416+{
29417+ return (inode_compression_plugin(inode)->
29418+ checksum ? DC_CHECKSUM_SIZE : 0);
29419+}
29420+
29421+static unsigned deflate_overrun(struct inode * inode, int ilen)
29422+{
29423+ return coa_overrun(inode_compression_plugin(inode), ilen);
29424+}
29425+
29426+/* Estimating compressibility of a logical cluster by various
29427+ policies represented by the compression mode plugin.
29428+ If this returns false, then the compressor won't be called for
29429+ the cluster of index @index.
29430+*/
29431+static int should_compress(struct tfm_cluster * tc, cloff_t index,
29432+ struct inode *inode)
29433+{
29434+ compression_plugin *cplug = inode_compression_plugin(inode);
29435+ compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
29436+
29437+ assert("edward-1321", tc->len != 0);
29438+ assert("edward-1322", cplug != NULL);
29439+ assert("edward-1323", mplug != NULL);
29440+
29441+ return /* estimate by size */
29442+ (cplug->min_size_deflate ?
29443+ tc->len >= cplug->min_size_deflate() :
29444+ 1) &&
29445+ /* estimate by compression mode plugin */
29446+ (mplug->should_deflate ?
29447+ mplug->should_deflate(inode, index) :
29448+ 1);
29449+}
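
For example, with a compression plugin whose min_size_deflate() returns 64 and a mode plugin that provides no should_deflate hook, a 100-byte logical cluster would be handed to the compressor while a 30-byte one would be passed through verbatim (illustrative thresholds; the real values depend on the plugins in use).
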
29450+
29451+/* Evaluating results of compression transform.
29452+ Returns true if we should accept these results */
29453+static int save_compressed(int size_before, int size_after, struct inode *inode)
29454+{
29455+ return (size_after + deflate_overhead(inode) +
29456+ max_cipher_overhead(inode) < size_before);
29457+}
29458+
29459+/* Guess result of the evaluation above */
29460+static int need_inflate(struct cluster_handle * ch, struct inode * inode,
29461+ int encrypted /* is cluster encrypted */ )
29462+{
29463+ struct tfm_cluster * tc = &ch->tc;
29464+
29465+ assert("edward-142", tc != 0);
29466+ assert("edward-143", inode != NULL);
29467+
29468+ return tc->len <
29469+ (encrypted ?
29470+ inode_scaled_offset(inode, tc->lsize) :
29471+ tc->lsize);
29472+}
29473+
29474+/* If results of compression were accepted, then we add
29475+ a checksum to catch possible disk cluster corruption.
29476+ The following is a format of the data stored in disk clusters:
29477+
29478+ data This is (transformed) logical cluster.
29479+ cipher_overhead This is created by ->align() method
29480+ of cipher plugin. May be absent.
29481+ checksum (4) This is created by ->checksum method
29482+ of compression plugin to check
29483+ integrity. May be absent.
29484+
29485+ Crypto overhead format:
29486+
29487+ data
29488+ control_byte (1) contains aligned overhead size:
29489+ 1 <= overhead <= cipher_blksize
29490+*/
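
Putting the pieces together with illustrative numbers: a logical cluster that compressed to 29995 bytes, stored with a 16-byte block cipher and a checksumming compression plugin, would occupy 29995 bytes of transformed data, 5 bytes of cipher overhead whose last byte (the control byte) holds the value 5, and a 4-byte checksum appended by dc_set_checksum() below, for 30004 bytes in total.
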
29491+/* Append a checksum at the end of a transformed stream */
29492+static void dc_set_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
29493+{
29494+ __u32 checksum;
29495+
29496+ assert("edward-1309", tc != NULL);
29497+ assert("edward-1310", tc->len > 0);
29498+ assert("edward-1311", cplug->checksum != NULL);
29499+
29500+ checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
29501+ put_unaligned(cpu_to_le32(checksum),
29502+ (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
29503+ tc->len += (int)DC_CHECKSUM_SIZE;
29504+}
29505+
29506+/* Check a disk cluster checksum.
29507+ Returns 0 if checksum is correct, otherwise returns 1 */
29508+static int dc_check_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
29509+{
29510+ assert("edward-1312", tc != NULL);
29511+ assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
29512+ assert("edward-1314", cplug->checksum != NULL);
29513+
29514+ if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
29515+ tc->len - (int)DC_CHECKSUM_SIZE) !=
29516+ le32_to_cpu(get_unaligned((d32 *)
29517+ (tfm_stream_data(tc, INPUT_STREAM)
29518+ + tc->len - (int)DC_CHECKSUM_SIZE)))) {
29519+ warning("edward-156",
29520+ "Bad disk cluster checksum %d, (should be %d) Fsck?\n",
29521+ (int)le32_to_cpu
29522+ (get_unaligned((d32 *)
29523+ (tfm_stream_data(tc, INPUT_STREAM) +
29524+ tc->len - (int)DC_CHECKSUM_SIZE))),
29525+ (int)cplug->checksum
29526+ (tfm_stream_data(tc, INPUT_STREAM),
29527+ tc->len - (int)DC_CHECKSUM_SIZE));
29528+ return 1;
29529+ }
29530+ tc->len -= (int)DC_CHECKSUM_SIZE;
29531+ return 0;
29532+}
29533+
29534+/* get input/output stream for some transform action */
29535+int grab_tfm_stream(struct inode * inode, struct tfm_cluster * tc,
29536+ tfm_stream_id id)
29537+{
29538+ size_t size = inode_scaled_cluster_size(inode);
29539+
29540+ assert("edward-901", tc != NULL);
29541+ assert("edward-1027", inode_compression_plugin(inode) != NULL);
29542+
29543+ if (cluster_get_tfm_act(tc) == TFMA_WRITE)
29544+ size += deflate_overrun(inode, inode_cluster_size(inode));
29545+
29546+ if (!get_tfm_stream(tc, id) && id == INPUT_STREAM)
29547+ alternate_streams(tc);
29548+ if (!get_tfm_stream(tc, id))
29549+ return alloc_tfm_stream(tc, size, id);
29550+
29551+ assert("edward-902", tfm_stream_is_set(tc, id));
29552+
29553+ if (tfm_stream_size(tc, id) < size)
29554+ return realloc_tfm_stream(tc, size, id);
29555+ return 0;
29556+}
29557+
29558+/* Common deflate manager */
29559+int reiser4_deflate_cluster(struct cluster_handle * clust, struct inode * inode)
29560+{
29561+ int result = 0;
29562+ int compressed = 0;
29563+ int encrypted = 0;
29564+ struct tfm_cluster * tc = &clust->tc;
29565+ compression_plugin * coplug;
29566+
29567+ assert("edward-401", inode != NULL);
29568+ assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
29569+ assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE);
29570+ assert("edward-498", !tfm_cluster_is_uptodate(tc));
29571+
29572+ coplug = inode_compression_plugin(inode);
29573+ if (should_compress(tc, clust->index, inode)) {
29574+ /* try to compress, discard bad results */
29575+ __u32 dst_len;
29576+ compression_mode_plugin * mplug =
29577+ inode_compression_mode_plugin(inode);
29578+ assert("edward-602", coplug != NULL);
29579+ assert("edward-1423", coplug->compress != NULL);
29580+
29581+ result = grab_coa(tc, coplug);
29582+ if (result) {
29583+ warning("edward-1424",
29584+ "alloc_coa failed with ret=%d, skipped compression",
29585+ result);
29586+ goto cipher;
29587+ }
29588+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29589+ if (result) {
29590+ warning("edward-1425",
29591+ "alloc stream failed with ret=%d, skipped compression",
29592+ result);
29593+ goto cipher;
29594+ }
29595+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
29596+ coplug->compress(get_coa(tc, coplug->h.id, tc->act),
29597+ tfm_input_data(clust), tc->len,
29598+ tfm_output_data(clust), &dst_len);
29599+ /* make sure we didn't overwrite extra bytes */
29600+ assert("edward-603",
29601+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
29602+
29603+ /* evaluate results of compression transform */
29604+ if (save_compressed(tc->len, dst_len, inode)) {
29605+ /* good result, accept */
29606+ tc->len = dst_len;
29607+ if (mplug->accept_hook != NULL) {
29608+ result = mplug->accept_hook(inode, clust->index);
29609+ if (result)
29610+ warning("edward-1426",
29611+ "accept_hook failed with ret=%d",
29612+ result);
29613+ }
29614+ compressed = 1;
29615+ }
29616+ else {
29617+ /* bad result, discard */
29618+#if 0
29619+ if (cluster_is_complete(clust, inode))
29620+ warning("edward-1496",
29621+ "incompressible cluster %lu (inode %llu)",
29622+ clust->index,
29623+ (unsigned long long)get_inode_oid(inode));
29624+#endif
29625+ if (mplug->discard_hook != NULL &&
29626+ cluster_is_complete(clust, inode)) {
29627+ result = mplug->discard_hook(inode,
29628+ clust->index);
29629+ if (result)
29630+ warning("edward-1427",
29631+ "discard_hook failed with ret=%d",
29632+ result);
29633+ }
29634+ }
29635+ }
29636+ cipher:
29637+ if (need_cipher(inode)) {
29638+ cipher_plugin * ciplug;
29639+ struct blkcipher_desc desc;
29640+ struct scatterlist src;
29641+ struct scatterlist dst;
29642+
29643+ ciplug = inode_cipher_plugin(inode);
29644+ desc.tfm = info_get_cipher(inode_crypto_info(inode));
29645+ desc.flags = 0;
29646+ if (compressed)
29647+ alternate_streams(tc);
29648+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29649+ if (result)
29650+ return result;
29651+
29652+ align_or_cut_overhead(inode, clust, WRITE_OP);
29653+ sg_init_one(&src, tfm_input_data(clust), tc->len);
29654+ sg_init_one(&dst, tfm_output_data(clust), tc->len);
29655+
29656+ result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len);
29657+ if (result) {
29658+ warning("edward-1405",
29659+ "encryption failed flags=%x\n", desc.flags);
29660+ return result;
29661+ }
29662+ encrypted = 1;
29663+ }
29664+ if (compressed && coplug->checksum != NULL)
29665+ dc_set_checksum(coplug, tc);
29666+ if (!compressed && !encrypted)
29667+ alternate_streams(tc);
29668+ return result;
29669+}
29670+
29671+/* Common inflate manager. */
29672+int reiser4_inflate_cluster(struct cluster_handle * clust, struct inode * inode)
29673+{
29674+ int result = 0;
29675+ int transformed = 0;
29676+ struct tfm_cluster * tc = &clust->tc;
29677+ compression_plugin * coplug;
29678+
29679+ assert("edward-905", inode != NULL);
29680+ assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
29681+ assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
29682+ assert("edward-1349", tc->act == TFMA_READ);
29683+ assert("edward-907", !tfm_cluster_is_uptodate(tc));
29684+
29685+ /* Handle a checksum (if any) */
29686+ coplug = inode_compression_plugin(inode);
29687+ if (need_inflate(clust, inode, need_cipher(inode)) &&
29688+ coplug->checksum != NULL) {
29689+ result = dc_check_checksum(coplug, tc);
29690+ if (unlikely(result)) {
29691+ warning("edward-1460",
29692+ "Inode %llu: disk cluster %lu looks corrupted",
29693+ (unsigned long long)get_inode_oid(inode),
29694+ clust->index);
29695+ return RETERR(-EIO);
29696+ }
29697+ }
29698+ if (need_cipher(inode)) {
29699+ cipher_plugin * ciplug;
29700+ struct blkcipher_desc desc;
29701+ struct scatterlist src;
29702+ struct scatterlist dst;
29703+
29704+ ciplug = inode_cipher_plugin(inode);
29705+ desc.tfm = info_get_cipher(inode_crypto_info(inode));
29706+ desc.flags = 0;
29707+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29708+ if (result)
29709+ return result;
29710+ assert("edward-909", tfm_cluster_is_set(tc));
29711+
29712+ sg_init_one(&src, tfm_input_data(clust), tc->len);
29713+ sg_init_one(&dst, tfm_output_data(clust), tc->len);
29714+
29715+ result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len);
29716+ if (result) {
29717+ warning("edward-1600", "decrypt failed flags=%x\n",
29718+ desc.flags);
29719+ return result;
29720+ }
29721+ align_or_cut_overhead(inode, clust, READ_OP);
29722+ transformed = 1;
29723+ }
29724+ if (need_inflate(clust, inode, 0)) {
29725+ unsigned dst_len = inode_cluster_size(inode);
29726+ if(transformed)
29727+ alternate_streams(tc);
29728+
29729+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29730+ if (result)
29731+ return result;
29732+ assert("edward-1305", coplug->decompress != NULL);
29733+ assert("edward-910", tfm_cluster_is_set(tc));
29734+
29735+ coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
29736+ tfm_input_data(clust), tc->len,
29737+ tfm_output_data(clust), &dst_len);
29738+ /* check length */
29739+ tc->len = dst_len;
29740+ assert("edward-157", dst_len == tc->lsize);
29741+ transformed = 1;
29742+ }
29743+ if (!transformed)
29744+ alternate_streams(tc);
29745+ return result;
29746+}
29747+
29748+/* This is the implementation of the readpage method of struct
29749+ address_space_operations for the cryptcompress plugin. */
29750+int readpage_cryptcompress(struct file *file, struct page *page)
29751+{
29752+ reiser4_context *ctx;
29753+ struct cluster_handle clust;
29754+ item_plugin *iplug;
29755+ int result;
29756+
29757+ assert("edward-88", PageLocked(page));
29758+ assert("vs-976", !PageUptodate(page));
29759+ assert("edward-89", page->mapping && page->mapping->host);
29760+
29761+ ctx = reiser4_init_context(page->mapping->host->i_sb);
29762+ if (IS_ERR(ctx)) {
29763+ unlock_page(page);
29764+ return PTR_ERR(ctx);
29765+ }
29766+ assert("edward-113",
29767+ ergo(file != NULL,
29768+ page->mapping == file->f_dentry->d_inode->i_mapping));
29769+
29770+ if (PageUptodate(page)) {
29771+ warning("edward-1338", "page is already uptodate\n");
29772+ unlock_page(page);
29773+ reiser4_exit_context(ctx);
29774+ return 0;
29775+ }
29776+ cluster_init_read(&clust, NULL);
29777+ clust.file = file;
29778+ iplug = item_plugin_by_id(CTAIL_ID);
29779+ if (!iplug->s.file.readpage) {
29780+ unlock_page(page);
29781+ put_cluster_handle(&clust);
29782+ reiser4_exit_context(ctx);
29783+ return -EINVAL;
29784+ }
29785+ result = iplug->s.file.readpage(&clust, page);
29786+
29787+ put_cluster_handle(&clust);
29788+ reiser4_txn_restart(ctx);
29789+ reiser4_exit_context(ctx);
29790+ return result;
29791+}
29792+
29793+/* number of pages to check in */
29794+static int get_new_nrpages(struct cluster_handle * clust)
29795+{
29796+ switch (clust->op) {
29797+ case LC_APPOV:
29798+ return clust->nr_pages;
29799+ case LC_TRUNC:
29800+ assert("edward-1179", clust->win != NULL);
29801+ return size_in_pages(clust->win->off + clust->win->count);
29802+ default:
29803+ impossible("edward-1180", "bad page cluster option");
29804+ return 0;
29805+ }
29806+}
29807+
29808+static void set_cluster_pages_dirty(struct cluster_handle * clust,
29809+ struct inode * inode)
29810+{
29811+ int i;
29812+ struct page *pg;
29813+ int nrpages = get_new_nrpages(clust);
29814+
29815+ for (i = 0; i < nrpages; i++) {
29816+
29817+ pg = clust->pages[i];
29818+ assert("edward-968", pg != NULL);
29819+ lock_page(pg);
29820+ assert("edward-1065", PageUptodate(pg));
29821+ reiser4_set_page_dirty_internal(pg);
29822+ unlock_page(pg);
29823+ mark_page_accessed(pg);
29824+ }
29825+}
29826+
29827+/* Grab a page cluster for read/write operations.
29828+ Attach a jnode for write operations (when preparing for modifications, which
29829+ are supposed to be committed).
29830+
29831+   We allocate only one jnode per page cluster; this jnode is bound to the
29832+   first page of the cluster, so we hold an extra reference that is put as
29833+   soon as the jnode is evicted from memory. The other references are
29834+   cleaned up at flush time (assuming checkin of the page cluster succeeded).
29835+*/
29836+int grab_page_cluster(struct inode * inode,
29837+ struct cluster_handle * clust, rw_op rw)
29838+{
29839+ int i;
29840+ int result = 0;
29841+ jnode *node = NULL;
29842+
29843+ assert("edward-182", clust != NULL);
29844+ assert("edward-183", clust->pages != NULL);
29845+ assert("edward-1466", clust->node == NULL);
29846+ assert("edward-1428", inode != NULL);
29847+ assert("edward-1429", inode->i_mapping != NULL);
29848+ assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
29849+
29850+ if (clust->nr_pages == 0)
29851+ return 0;
29852+
29853+ for (i = 0; i < clust->nr_pages; i++) {
29854+
29855+ assert("edward-1044", clust->pages[i] == NULL);
29856+
29857+ clust->pages[i] =
29858+ find_or_create_page(inode->i_mapping,
29859+ clust_to_pg(clust->index, inode) + i,
29860+ reiser4_ctx_gfp_mask_get());
29861+ if (!clust->pages[i]) {
29862+ result = RETERR(-ENOMEM);
29863+ break;
29864+ }
29865+ if (i == 0 && rw == WRITE_OP) {
29866+ node = jnode_of_page(clust->pages[i]);
29867+ if (IS_ERR(node)) {
29868+ result = PTR_ERR(node);
29869+ unlock_page(clust->pages[i]);
29870+ break;
29871+ }
29872+ JF_SET(node, JNODE_CLUSTER_PAGE);
29873+ assert("edward-920", jprivate(clust->pages[0]));
29874+ }
29875+ INODE_PGCOUNT_INC(inode);
29876+ unlock_page(clust->pages[i]);
29877+ }
29878+ if (unlikely(result)) {
29879+ while (i) {
29880+ put_cluster_page(clust->pages[--i]);
29881+ INODE_PGCOUNT_DEC(inode);
29882+ }
29883+ if (node && !IS_ERR(node))
29884+ jput(node);
29885+ return result;
29886+ }
29887+ clust->node = node;
29888+ return 0;
29889+}
29890+
29891+static void truncate_page_cluster_range(struct inode * inode,
29892+ struct page ** pages,
29893+ cloff_t index,
29894+ int from, int count,
29895+ int even_cows)
29896+{
29897+ assert("edward-1467", count > 0);
29898+ reiser4_invalidate_pages(inode->i_mapping,
29899+ clust_to_pg(index, inode) + from,
29900+ count, even_cows);
29901+}
29902+
29903+/* Put @count pages starting at offset @from */
29904+void __put_page_cluster(int from, int count,
29905+ struct page ** pages, struct inode * inode)
29906+{
29907+ int i;
29908+ assert("edward-1468", pages != NULL);
29909+ assert("edward-1469", inode != NULL);
29910+ assert("edward-1470", from >= 0 && count >= 0);
29911+
29912+ for (i = 0; i < count; i++) {
29913+ assert("edward-1471", pages[from + i] != NULL);
29914+ assert("edward-1472",
29915+ pages[from + i]->index == pages[from]->index + i);
29916+
29917+ put_cluster_page(pages[from + i]);
29918+ INODE_PGCOUNT_DEC(inode);
29919+ }
29920+}
29921+
29922+/*
29923+ * This is the dual of grab_page_cluster(); however, if @rw == WRITE_OP,
29924+ * then we call this function only if something failed before the page
29925+ * cluster was checked in.
29926+ */
29927+void put_page_cluster(struct cluster_handle * clust,
29928+ struct inode * inode, rw_op rw)
29929+{
29930+ assert("edward-445", clust != NULL);
29931+ assert("edward-922", clust->pages != NULL);
29932+ assert("edward-446",
29933+ ergo(clust->nr_pages != 0, clust->pages[0] != NULL));
29934+
29935+ __put_page_cluster(0, clust->nr_pages, clust->pages, inode);
29936+ if (rw == WRITE_OP) {
29937+ if (unlikely(clust->node)) {
29938+ assert("edward-447",
29939+ clust->node == jprivate(clust->pages[0]));
29940+ jput(clust->node);
29941+ clust->node = NULL;
29942+ }
29943+ }
29944+}
29945+
29946+#if REISER4_DEBUG
29947+int cryptcompress_inode_ok(struct inode *inode)
29948+{
29949+ if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE)))
29950+ return 0;
29951+ if (!cluster_shift_ok(inode_cluster_shift(inode)))
29952+ return 0;
29953+ return 1;
29954+}
29955+
29956+static int window_ok(struct reiser4_slide * win, struct inode *inode)
29957+{
29958+ assert("edward-1115", win != NULL);
29959+ assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
29960+
29961+ return (win->off != inode_cluster_size(inode)) &&
29962+ (win->off + win->count + win->delta <= inode_cluster_size(inode));
29963+}
29964+
29965+static int cluster_ok(struct cluster_handle * clust, struct inode *inode)
29966+{
29967+ assert("edward-279", clust != NULL);
29968+
29969+ if (!clust->pages)
29970+ return 0;
29971+ return (clust->win ? window_ok(clust->win, inode) : 1);
29972+}
29973+#if 0
29974+static int pages_truncate_ok(struct inode *inode, pgoff_t start)
29975+{
29976+ int found;
29977+ struct page * page;
29978+
29979+ found = find_get_pages(inode->i_mapping, start, 1, &page);
29980+ if (found)
29981+ put_cluster_page(page);
29982+ return !found;
29983+}
29984+#else
29985+#define pages_truncate_ok(inode, start) 1
29986+#endif
29987+
29988+static int jnode_truncate_ok(struct inode *inode, cloff_t index)
29989+{
29990+ jnode *node;
29991+ node = jlookup(current_tree, get_inode_oid(inode),
29992+ clust_to_pg(index, inode));
29993+ if (likely(!node))
29994+ return 1;
29995+ jput(node);
29996+ return 0;
29997+}
29998+
29999+static int find_fake_appended(struct inode *inode, cloff_t * index);
30000+
30001+static int body_truncate_ok(struct inode *inode, cloff_t aidx)
30002+{
30003+ int result;
30004+ cloff_t raidx;
30005+
30006+ result = find_fake_appended(inode, &raidx);
30007+ return !result && (aidx == raidx);
30008+}
30009+#endif
30010+
30011+/* guess next window stat */
30012+static inline window_stat next_window_stat(struct reiser4_slide * win)
30013+{
30014+ assert("edward-1130", win != NULL);
30015+ return ((win->stat == HOLE_WINDOW && win->delta == 0) ?
30016+ HOLE_WINDOW : DATA_WINDOW);
30017+}
30018+
30019+/* guess and set next cluster index and window params */
30020+static void move_update_window(struct inode * inode,
30021+ struct cluster_handle * clust,
30022+ loff_t file_off, loff_t to_file)
30023+{
30024+ struct reiser4_slide * win;
30025+
30026+ assert("edward-185", clust != NULL);
30027+ assert("edward-438", clust->pages != NULL);
30028+ assert("edward-281", cluster_ok(clust, inode));
30029+
30030+ win = clust->win;
30031+ if (!win)
30032+ return;
30033+
30034+ switch (win->stat) {
30035+ case DATA_WINDOW:
30036+ /* increment */
30037+ clust->index++;
30038+ win->stat = DATA_WINDOW;
30039+ win->off = 0;
30040+ win->count = min((loff_t)inode_cluster_size(inode), to_file);
30041+ break;
30042+ case HOLE_WINDOW:
30043+ switch (next_window_stat(win)) {
30044+ case HOLE_WINDOW:
30045+ /* skip */
30046+ clust->index = off_to_clust(file_off, inode);
30047+ win->stat = HOLE_WINDOW;
30048+ win->off = 0;
30049+ win->count = off_to_cloff(file_off, inode);
30050+ win->delta = min((loff_t)(inode_cluster_size(inode) -
30051+ win->count), to_file);
30052+ break;
30053+ case DATA_WINDOW:
30054+ /* stay */
30055+ win->stat = DATA_WINDOW;
30056+			/* off + count + delta is an invariant */
30057+ win->off = win->off + win->count;
30058+ win->count = win->delta;
30059+ win->delta = 0;
30060+ break;
30061+ default:
30062+ impossible("edward-282", "wrong next window state");
30063+ }
30064+ break;
30065+ default:
30066+ impossible("edward-283", "wrong current window state");
30067+ }
30068+ assert("edward-1068", cluster_ok(clust, inode));
30069+}
30070+
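+/*
+ * Example (an assumed illustration of the window state machine above):
+ * writing past EOF first zeroes the tail of the current cluster via a
+ * HOLE_WINDOW; if the user's data begins in that same logical cluster,
+ * win->delta > 0 and the next window is a DATA_WINDOW starting at
+ * win->off + win->count. Otherwise the next window stays a HOLE_WINDOW
+ * and clust->index skips forward to the cluster of the write position.
+ */
+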
30071+static int update_sd_cryptcompress(struct inode *inode)
30072+{
30073+ int result = 0;
30074+
30075+ assert("edward-978", reiser4_schedulable());
30076+
30077+ result = reiser4_grab_space_force(/* one for stat data update */
30078+ estimate_update_common(inode),
30079+ BA_CAN_COMMIT);
30080+ if (result)
30081+ return result;
30082+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
30083+ result = reiser4_update_sd(inode);
30084+
30085+ return result;
30086+}
30087+
30088+static void uncapture_cluster_jnode(jnode * node)
30089+{
30090+ txn_atom *atom;
30091+
30092+ assert_spin_locked(&(node->guard));
30093+
30094+ atom = jnode_get_atom(node);
30095+ if (atom == NULL) {
30096+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
30097+ spin_unlock_jnode(node);
30098+ return;
30099+ }
30100+ reiser4_uncapture_block(node);
30101+ spin_unlock_atom(atom);
30102+ jput(node);
30103+}
30104+
30105+static void put_found_pages(struct page **pages, int nr)
30106+{
30107+ int i;
30108+ for (i = 0; i < nr; i++) {
30109+ assert("edward-1045", pages[i] != NULL);
30110+ put_cluster_page(pages[i]);
30111+ }
30112+}
30113+
30114+/* Lifecycle of a logical cluster in the system.
30115+ *
30116+ *
30117+ * Logical cluster of a cryptcompress file is represented in the system by
30118+ * . page cluster (in memory, primary cache, contains plain text);
30119+ * . disk cluster (in memory, secondary cache, contains transformed text).
30120+ * The primary cache reduces the number of transform operations (compression,
30121+ * encryption), i.e. it implements a transform-caching strategy.
30122+ * The secondary cache reduces the number of I/O operations, i.e. it serves
30123+ * the usual write-caching strategy. A page cluster is a set of pages, i.e.
30124+ * the mapping of a logical cluster to the primary cache. A disk cluster is
30125+ * a set of items of the same type, defined by some reiser4 item plugin id.
30126+ *
30127+ * 1. Performing modifications
30128+ *
30129+ * Every modification of a cryptcompress file is considered as a set of
30130+ * operations performed on the file's logical clusters. Each such "atomic"
30131+ * modification truncates, appends and/or overwrites some bytes of a
30132+ * logical cluster in the primary cache, with subsequent synchronization
30133+ * with the secondary cache (at flush time). Disk clusters,
30134+ * which live in the secondary cache, are supposed to be synchronized with
30135+ * disk. The mechanism of synchronization of primary and secondary caches
30136+ * includes so-called checkin/checkout technique described below.
30137+ *
30138+ * 2. Submitting modifications
30139+ *
30140+ * Each page cluster has an associated jnode (a special in-memory header for
30141+ * keeping track of transactions in reiser4), which is attached to its first
30142+ * page when grabbing page cluster for modifications (see grab_page_cluster).
30143+ * Submitting modifications (see checkin_logical_cluster) is going per logical
30144+ * cluster and includes:
30145+ * . checkin_cluster_size;
30146+ * . checkin_page_cluster.
30147+ * checkin_cluster_size() resolves to a file size update, which completely
30148+ * defines the new size of the logical cluster (the number of the file's
30149+ * bytes in that logical cluster).
30150+ * checkin_page_cluster() captures the jnode of a page cluster and sets
30151+ * the jnode's dirty flag (if needed) to indicate that modifications are
30152+ * successfully checked in.
30153+ *
30154+ * 3. Checking out modifications
30155+ *
30156+ * This happens per logical cluster at flush time (see checkout_logical_cluster).
30157+ * This is when the primary and secondary caches are synchronized.
30158+ * checkout_logical_cluster() includes:
30159+ * . checkout_page_cluster (retrieving checked in pages).
30160+ * . uncapture jnode (including clearing the dirty flag and unlocking)
30161+ *
30162+ * 4. Committing modifications
30163+ *
30164+ * This completes the synchronization of the primary and secondary caches.
30165+ * When checking out a page cluster (the phase above), its pages are
30166+ * locked/flushed/unlocked one-by-one in ascending order of their indexes
30167+ * into a contiguous stream, which is then transformed (compressed,
30168+ * encrypted), chopped up into items and committed to disk as a disk cluster.
30169+ *
30170+ * 5. Managing page references
30171+ *
30172+ * Every checked-in page has a special additional "control" reference,
30173+ * which is dropped at checkout. We need this to prevent pages from being
30174+ * unexpectedly evicted from memory before checkout. Control references
30175+ * are managed so that they do not accumulate with every checkin:
30176+ *
30177+ * 0
30178+ * checkin -> 1
30179+ * 0 -> checkout
30180+ * checkin -> 1
30181+ * checkin -> 1
30182+ * checkin -> 1
30183+ * 0 -> checkout
30184+ * ...
30185+ *
30186+ * Every page cluster has its own unique "cluster lock". Update/drop
30187+ * references are serialized via this lock. The number of checked-in cluster
30188+ * pages is calculated from i_size under the cluster lock. The file size is
30189+ * updated at every checkin action, also under the cluster lock (except in
30190+ * the cases of appending/truncating fake logical clusters).
30191+ *
30192+ * Proof of correctness:
30193+ *
30194+ * Since we update the file size under the cluster lock, in the case of a
30195+ * non-fake logical cluster with its lock held we do have the expected number
30196+ * of checked-in pages. On the other hand, append/truncate of fake logical
30197+ * clusters doesn't change the number of checked-in pages of any cluster.
30198+ *
30199+ * NOTE-EDWARD: As the cluster lock we use the guard (spinlock_t) of its jnode.
30200+ * Currently, I don't see any reason to create a special lock for those
30201+ * needs.
30202+ */
30203+
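+/*
+ * A minimal sketch of the checkin/checkout protocol described above
+ * (error handling omitted; the names are the ones used in this file):
+ *
+ *   grab_page_cluster(inode, clust, WRITE_OP);    attach jnode, grab pages
+ *   ...modify pages in the primary cache...
+ *   checkin_logical_cluster(clust, inode);        update size, dirty jnode
+ *   ...later, at flush time...
+ *   checkout_logical_cluster(clust, node, inode); flush pages to the stream
+ */
+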
30204+static inline void lock_cluster(jnode * node)
30205+{
30206+ spin_lock_jnode(node);
30207+}
30208+
30209+static inline void unlock_cluster(jnode * node)
30210+{
30211+ spin_unlock_jnode(node);
30212+}
30213+
30214+static inline void unlock_cluster_uncapture(jnode * node)
30215+{
30216+ uncapture_cluster_jnode(node);
30217+}
30218+
30219+/* Set new file size by window. Cluster lock is required. */
30220+static void checkin_file_size(struct cluster_handle * clust,
30221+ struct inode * inode)
30222+{
30223+ loff_t new_size;
30224+ struct reiser4_slide * win;
30225+
30226+ assert("edward-1181", clust != NULL);
30227+ assert("edward-1182", inode != NULL);
30228+ assert("edward-1473", clust->pages != NULL);
30229+ assert("edward-1474", clust->pages[0] != NULL);
30230+ assert("edward-1475", jprivate(clust->pages[0]) != NULL);
30231+ assert_spin_locked(&(jprivate(clust->pages[0])->guard));
30232+
30234+ win = clust->win;
30235+ assert("edward-1183", win != NULL);
30236+
30237+ new_size = clust_to_off(clust->index, inode) + win->off;
30238+
30239+ switch (clust->op) {
30240+ case LC_APPOV:
30241+ if (new_size + win->count <= i_size_read(inode))
30242+ /* overwrite only */
30243+ return;
30244+ new_size += win->count;
30245+ break;
30246+ case LC_TRUNC:
30247+ break;
30248+ default:
30249+ impossible("edward-1184", "bad page cluster option");
30250+ break;
30251+ }
30252+ inode_check_scale_nolock(inode, i_size_read(inode), new_size);
30253+ i_size_write(inode, new_size);
30254+ return;
30255+}
30256+
30257+static inline void checkin_cluster_size(struct cluster_handle * clust,
30258+ struct inode * inode)
30259+{
30260+ if (clust->win)
30261+ checkin_file_size(clust, inode);
30262+}
30263+
30264+static int checkin_page_cluster(struct cluster_handle * clust,
30265+ struct inode * inode)
30266+{
30267+ int result;
30268+ jnode * node;
30269+ int old_nrpages = clust->old_nrpages;
30270+ int new_nrpages = get_new_nrpages(clust);
30271+
30272+ node = clust->node;
30273+
30274+ assert("edward-221", node != NULL);
30275+ assert("edward-971", clust->reserved == 1);
30276+ assert("edward-1263",
30277+ clust->reserved_prepped == estimate_update_cluster(inode));
30278+ assert("edward-1264", clust->reserved_unprepped == 0);
30279+
30280+ if (JF_ISSET(node, JNODE_DIRTY)) {
30281+ /*
30282+ * page cluster was checked in, but not yet
30283+ * checked out, so release related resources
30284+ */
30285+ free_reserved4cluster(inode, clust,
30286+ estimate_update_cluster(inode));
30287+ __put_page_cluster(0, clust->old_nrpages,
30288+ clust->pages, inode);
30289+ } else {
30290+ result = capture_cluster_jnode(node);
30291+ if (unlikely(result)) {
30292+ unlock_cluster(node);
30293+ return result;
30294+ }
30295+ jnode_make_dirty_locked(node);
30296+ clust->reserved = 0;
30297+ }
30298+ unlock_cluster(node);
30299+
30300+ if (new_nrpages < old_nrpages) {
30301+ /* truncate >= 1 complete pages */
30302+ __put_page_cluster(new_nrpages,
30303+ old_nrpages - new_nrpages,
30304+ clust->pages, inode);
30305+ truncate_page_cluster_range(inode,
30306+ clust->pages, clust->index,
30307+ new_nrpages,
30308+ old_nrpages - new_nrpages,
30309+ 0);
30310+ }
30311+#if REISER4_DEBUG
30312+ clust->reserved_prepped -= estimate_update_cluster(inode);
30313+#endif
30314+ return 0;
30315+}
30316+
30317+/* Submit modifications of a logical cluster */
30318+static int checkin_logical_cluster(struct cluster_handle * clust,
30319+ struct inode *inode)
30320+{
30321+ int result = 0;
30322+ jnode * node;
30323+
30324+ node = clust->node;
30325+
30326+ assert("edward-1035", node != NULL);
30327+ assert("edward-1029", clust != NULL);
30328+ assert("edward-1030", clust->reserved == 1);
30329+ assert("edward-1031", clust->nr_pages != 0);
30330+ assert("edward-1032", clust->pages != NULL);
30331+ assert("edward-1033", clust->pages[0] != NULL);
30332+ assert("edward-1446", jnode_is_cluster_page(node));
30333+ assert("edward-1476", node == jprivate(clust->pages[0]));
30334+
30335+ lock_cluster(node);
30336+ checkin_cluster_size(clust, inode);
30337+ /* this will unlock cluster */
30338+ result = checkin_page_cluster(clust, inode);
30339+ jput(node);
30340+ clust->node = NULL;
30341+ return result;
30342+}
30343+
30344+/*
30345+ * Retrieve size of logical cluster that was checked in at
30346+ * the latest modifying session (cluster lock is required)
30347+ */
30348+static inline void checkout_cluster_size(struct cluster_handle * clust,
30349+ struct inode * inode)
30350+{
30351+ struct tfm_cluster *tc = &clust->tc;
30352+
30353+ tc->len = lbytes(clust->index, inode);
30354+ assert("edward-1478", tc->len != 0);
30355+}
30356+
30357+/*
30358+ * Retrieve a page cluster with the latest submitted modifications
30359+ * and flush its pages to previously allocated contiguous stream.
30360+ */
30361+static void checkout_page_cluster(struct cluster_handle * clust,
30362+ jnode * node, struct inode * inode)
30363+{
30364+ int i;
30365+ int found;
30366+ int to_put;
30367+ struct tfm_cluster *tc = &clust->tc;
30368+
30369+	/* find and put the checked-in pages: the cluster is locked,
30370+	 * so we must find exactly the expected number (to_put) of pages
30371+ */
30372+ to_put = size_in_pages(lbytes(clust->index, inode));
30373+ found = find_get_pages(inode->i_mapping,
30374+ clust_to_pg(clust->index, inode),
30375+ to_put, clust->pages);
30376+ BUG_ON(found != to_put);
30377+
30378+ __put_page_cluster(0, to_put, clust->pages, inode);
30379+ unlock_cluster_uncapture(node);
30380+
30381+ /* Flush found pages.
30382+ *
30383+	 * Note that we don't disable modifications while flushing;
30384+	 * moreover, some of the found pages can be truncated, as we
30385+	 * have released the cluster lock.
30386+ */
30387+ for (i = 0; i < found; i++) {
30388+ int in_page;
30389+ char * data;
30390+ assert("edward-1479",
30391+ clust->pages[i]->index == clust->pages[0]->index + i);
30392+
30393+ lock_page(clust->pages[i]);
30394+ if (!PageUptodate(clust->pages[i])) {
30395+ /* page was truncated */
30396+ assert("edward-1480",
30397+ i_size_read(inode) <= page_offset(clust->pages[i]));
30398+ assert("edward-1481",
30399+ clust->pages[i]->mapping != inode->i_mapping);
30400+ unlock_page(clust->pages[i]);
30401+ break;
30402+ }
30403+ /* Update the number of bytes in the logical cluster,
30404+	 * as it could have been partially truncated. Note that only
30405+	 * a partial truncate is possible here (a complete truncate
30406+	 * cannot reach this point, as it is performed via ->kill_hook()
30407+	 * called by cut_file_items(), and the latter must wait for the
30408+	 * znode locked with the parent coord).
30409+ */
30410+ checkout_cluster_size(clust, inode);
30411+
30412+		/* this can be zero, as the new file size is
30413+		   checked in before the pages are truncated */
30414+ in_page = __mbp(tc->len, i);
30415+
30416+ data = kmap(clust->pages[i]);
30417+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
30418+ data, in_page);
30419+ kunmap(clust->pages[i]);
30420+
30421+ if (PageDirty(clust->pages[i]))
30422+ cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE);
30423+
30424+ unlock_page(clust->pages[i]);
30425+
30426+ if (in_page < PAGE_CACHE_SIZE)
30427+ /* end of the file */
30428+ break;
30429+ }
30430+ put_found_pages(clust->pages, found); /* find_get_pages */
30431+ tc->lsize = tc->len;
30432+ return;
30433+}
30434+
30435+/* Check out modifications of a logical cluster */
30436+int checkout_logical_cluster(struct cluster_handle * clust,
30437+ jnode * node, struct inode *inode)
30438+{
30439+ int result;
30440+ struct tfm_cluster *tc = &clust->tc;
30441+
30442+ assert("edward-980", node != NULL);
30443+ assert("edward-236", inode != NULL);
30444+ assert("edward-237", clust != NULL);
30445+ assert("edward-240", !clust->win);
30446+ assert("edward-241", reiser4_schedulable());
30447+ assert("edward-718", cryptcompress_inode_ok(inode));
30448+
30449+ result = grab_tfm_stream(inode, tc, INPUT_STREAM);
30450+ if (result) {
30451+ warning("edward-1430", "alloc stream failed with ret=%d",
30452+ result);
30453+ return RETERR(-E_REPEAT);
30454+ }
30455+ lock_cluster(node);
30456+
30457+ if (unlikely(!JF_ISSET(node, JNODE_DIRTY))) {
30458+ /* race with another flush */
30459+ warning("edward-982",
30460+ "checking out logical cluster %lu of inode %llu: "
30461+ "jnode is not dirty", clust->index,
30462+ (unsigned long long)get_inode_oid(inode));
30463+ unlock_cluster(node);
30464+ return RETERR(-E_REPEAT);
30465+ }
30466+ cluster_reserved2grabbed(estimate_update_cluster(inode));
30467+
30468+ /* this will unlock cluster */
30469+ checkout_page_cluster(clust, node, inode);
30470+ return 0;
30471+}
30472+
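+/*
+ * Note on the -E_REPEAT returns above: E_REPEAT appears to be reiser4's
+ * "try again" code (an inference from its use here), so a failed stream
+ * allocation or a race with another flush makes the caller retry this
+ * cluster later instead of failing the whole flush.
+ */
+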
30473+/* set hint for the cluster of the index @index */
30474+static void set_hint_cluster(struct inode *inode, hint_t * hint,
30475+ cloff_t index, znode_lock_mode mode)
30476+{
30477+ reiser4_key key;
30478+ assert("edward-722", cryptcompress_inode_ok(inode));
30479+ assert("edward-723",
30480+ inode_file_plugin(inode) ==
30481+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
30482+
30483+ inode_file_plugin(inode)->key_by_inode(inode,
30484+ clust_to_off(index, inode),
30485+ &key);
30486+
30487+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
30488+ hint->offset = get_key_offset(&key);
30489+ hint->mode = mode;
30490+}
30491+
30492+void invalidate_hint_cluster(struct cluster_handle * clust)
30493+{
30494+ assert("edward-1291", clust != NULL);
30495+ assert("edward-1292", clust->hint != NULL);
30496+
30497+ done_lh(&clust->hint->lh);
30498+ hint_clr_valid(clust->hint);
30499+}
30500+
30501+static void put_hint_cluster(struct cluster_handle * clust,
30502+ struct inode *inode, znode_lock_mode mode)
30503+{
30504+ assert("edward-1286", clust != NULL);
30505+ assert("edward-1287", clust->hint != NULL);
30506+
30507+ set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
30508+ invalidate_hint_cluster(clust);
30509+}
30510+
30511+static int balance_dirty_page_cluster(struct cluster_handle * clust,
30512+ struct inode *inode, loff_t off,
30513+ loff_t to_file,
30514+ int nr_dirtied)
30515+{
30516+ int result;
30517+ struct cryptcompress_info * info;
30518+
30519+ assert("edward-724", inode != NULL);
30520+ assert("edward-725", cryptcompress_inode_ok(inode));
30521+ assert("edward-1547",
30522+ nr_dirtied != 0 && nr_dirtied <= cluster_nrpages(inode));
30523+
30524+ /* set next window params */
30525+ move_update_window(inode, clust, off, to_file);
30526+
30527+ result = update_sd_cryptcompress(inode);
30528+ if (result)
30529+ return result;
30530+ assert("edward-726", clust->hint->lh.owner == NULL);
30531+ info = cryptcompress_inode_data(inode);
30532+
30533+ mutex_unlock(&info->checkin_mutex);
30534+ reiser4_txn_restart_current();
30535+ balance_dirty_pages_ratelimited_nr(inode->i_mapping, nr_dirtied);
30536+ mutex_lock(&info->checkin_mutex);
30537+ return 0;
30538+}
30539+
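+/*
+ * Design note (inferred from the code above): checkin_mutex is dropped
+ * and the current transaction is restarted before throttling, so that
+ * neither the mutex nor an open transaction handle is held while
+ * balance_dirty_pages_ratelimited_nr() possibly sleeps.
+ */
+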
30540+/* write zeroes to the page cluster, process it, and maybe try to capture
30541+   its pages */
30542+static int write_hole(struct inode *inode, struct cluster_handle * clust,
30543+ loff_t file_off, loff_t to_file)
30544+{
30545+ int result = 0;
30546+ unsigned cl_off, cl_count = 0;
30547+ unsigned to_pg, pg_off;
30548+ struct reiser4_slide * win;
30549+
30550+ assert("edward-190", clust != NULL);
30551+ assert("edward-1069", clust->win != NULL);
30552+ assert("edward-191", inode != NULL);
30553+ assert("edward-727", cryptcompress_inode_ok(inode));
30554+ assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
30555+ assert("edward-1154",
30556+ ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
30557+
30558+ win = clust->win;
30559+
30560+ assert("edward-1070", win != NULL);
30561+ assert("edward-201", win->stat == HOLE_WINDOW);
30562+ assert("edward-192", cluster_ok(clust, inode));
30563+
30564+ if (win->off == 0 && win->count == inode_cluster_size(inode)) {
30565+		/* This part of the hole will be represented by a "fake"
30566+		 * logical cluster, i.e. one which doesn't have a corresponding
30567+		 * disk cluster until someone modifies this logical cluster
30568+		 * and makes it dirty.
30569+		 * So just move forward here...
30570+ */
30571+ move_update_window(inode, clust, file_off, to_file);
30572+ return 0;
30573+ }
30574+ cl_count = win->count; /* number of zeroes to write */
30575+ cl_off = win->off;
30576+ pg_off = off_to_pgoff(win->off);
30577+
30578+ while (cl_count) {
30579+ struct page *page;
30580+ page = clust->pages[off_to_pg(cl_off)];
30581+
30582+ assert("edward-284", page != NULL);
30583+
30584+ to_pg = min((typeof(pg_off))PAGE_CACHE_SIZE - pg_off, cl_count);
30585+ lock_page(page);
30586+ zero_user_page(page, pg_off, to_pg, KM_USER0);
30587+ SetPageUptodate(page);
30588+ reiser4_set_page_dirty_internal(page);
30589+ mark_page_accessed(page);
30590+ unlock_page(page);
30591+
30592+ cl_off += to_pg;
30593+ cl_count -= to_pg;
30594+ pg_off = 0;
30595+ }
30596+ if (!win->delta) {
30597+ /* only zeroes in this window, try to capture
30598+ */
30599+ result = checkin_logical_cluster(clust, inode);
30600+ if (result)
30601+ return result;
30602+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
30603+ result = balance_dirty_page_cluster(clust,
30604+ inode, file_off, to_file,
30605+ win_count_to_nrpages(win));
30606+ } else
30607+ move_update_window(inode, clust, file_off, to_file);
30608+ return result;
30609+}
30610+
30611+/*
30612+  The main disk search procedure for the cryptcompress plugin, which
30613+  . scans all items of a disk cluster with the lock mode @mode
30614+  . maybe reads each one (if @read is set)
30615+  . maybe makes its znode dirty (if a write lock mode was specified)
30616+
30617+ NOTE-EDWARD: Callers should handle the case when disk cluster
30618+ is incomplete (-EIO)
30619+*/
30620+int find_disk_cluster(struct cluster_handle * clust,
30621+ struct inode *inode, int read, znode_lock_mode mode)
30622+{
30623+ flow_t f;
30624+ hint_t *hint;
30625+ int result = 0;
30626+ int was_grabbed;
30627+ ra_info_t ra_info;
30628+ file_plugin *fplug;
30629+ item_plugin *iplug;
30630+ struct tfm_cluster *tc;
30631+ struct cryptcompress_info * info;
30632+
30633+ assert("edward-138", clust != NULL);
30634+ assert("edward-728", clust->hint != NULL);
30635+ assert("edward-226", reiser4_schedulable());
30636+ assert("edward-137", inode != NULL);
30637+ assert("edward-729", cryptcompress_inode_ok(inode));
30638+
30639+ hint = clust->hint;
30640+ fplug = inode_file_plugin(inode);
30641+ was_grabbed = get_current_context()->grabbed_blocks;
30642+ info = cryptcompress_inode_data(inode);
30643+ tc = &clust->tc;
30644+
30645+ assert("edward-462", !tfm_cluster_is_uptodate(tc));
30646+ assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
30647+
30648+ dclust_init_extension(hint);
30649+
30650+ /* set key of the first disk cluster item */
30651+ fplug->flow_by_inode(inode,
30652+ (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
30653+ 0 /* kernel space */ ,
30654+ inode_scaled_cluster_size(inode),
30655+ clust_to_off(clust->index, inode), READ_OP, &f);
30656+ if (mode == ZNODE_WRITE_LOCK) {
30657+ /* reserve for flush to make dirty all the leaf nodes
30658+ which contain disk cluster */
30659+ result =
30660+ reiser4_grab_space_force(estimate_dirty_cluster(inode),
30661+ BA_CAN_COMMIT);
30662+ if (result)
30663+ goto out;
30664+ }
30665+
30666+ ra_info.key_to_stop = f.key;
30667+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
30668+
30669+ while (f.length) {
30670+ result = find_cluster_item(hint, &f.key, mode,
30671+ NULL, FIND_EXACT,
30672+ (mode == ZNODE_WRITE_LOCK ?
30673+ CBK_FOR_INSERT : 0));
30674+ switch (result) {
30675+ case CBK_COORD_NOTFOUND:
30676+ result = 0;
30677+ if (inode_scaled_offset
30678+ (inode, clust_to_off(clust->index, inode)) ==
30679+ get_key_offset(&f.key)) {
30680+				/* the first item was not found; this is
30681+				   treated as the disk cluster being absent */
30682+ clust->dstat = FAKE_DISK_CLUSTER;
30683+ goto out;
30684+ }
30685+ /* we are outside the cluster, stop search here */
30686+ assert("edward-146",
30687+ f.length != inode_scaled_cluster_size(inode));
30688+ goto ok;
30689+ case CBK_COORD_FOUND:
30690+ assert("edward-148",
30691+ hint->ext_coord.coord.between == AT_UNIT);
30692+ assert("edward-460",
30693+ hint->ext_coord.coord.unit_pos == 0);
30694+
30695+ coord_clear_iplug(&hint->ext_coord.coord);
30696+ result = zload_ra(hint->ext_coord.coord.node, &ra_info);
30697+ if (unlikely(result))
30698+ goto out;
30699+ iplug = item_plugin_by_coord(&hint->ext_coord.coord);
30700+ assert("edward-147",
30701+ item_id_by_coord(&hint->ext_coord.coord) ==
30702+ CTAIL_ID);
30703+
30704+ result = iplug->s.file.read(NULL, &f, hint);
30705+ if (result) {
30706+ zrelse(hint->ext_coord.coord.node);
30707+ goto out;
30708+ }
30709+ if (mode == ZNODE_WRITE_LOCK) {
30710+				/* Don't dirty more nodes than was
30711+				   estimated (see comments before
30712+				   estimate_dirty_cluster). Missed nodes will
30713+				   be read in at flush time if they have been
30714+				   evicted from memory */
30715+ if (dclust_get_extension_ncount(hint) <=
30716+ estimate_dirty_cluster(inode))
30717+ znode_make_dirty(hint->ext_coord.coord.node);
30718+
30719+ znode_set_convertible(hint->ext_coord.coord.
30720+ node);
30721+ }
30722+ zrelse(hint->ext_coord.coord.node);
30723+ break;
30724+ default:
30725+ goto out;
30726+ }
30727+ }
30728+ ok:
30729+ /* at least one item was found */
30730+ /* NOTE-EDWARD: Callers should handle the case
30731+ when disk cluster is incomplete (-EIO) */
30732+ tc->len = inode_scaled_cluster_size(inode) - f.length;
30733+ tc->lsize = lbytes(clust->index, inode);
30734+ assert("edward-1196", tc->len > 0);
30735+ assert("edward-1406", tc->lsize > 0);
30736+
30737+ if (hint_is_unprepped_dclust(clust->hint)) {
30738+ clust->dstat = UNPR_DISK_CLUSTER;
30739+ } else if (clust->index == info->trunc_index) {
30740+ clust->dstat = TRNC_DISK_CLUSTER;
30741+ } else {
30742+ clust->dstat = PREP_DISK_CLUSTER;
30743+ dclust_set_extension_dsize(clust->hint, tc->len);
30744+ }
30745+ out:
30746+ assert("edward-1339",
30747+ get_current_context()->grabbed_blocks >= was_grabbed);
30748+ grabbed2free(get_current_context(),
30749+ get_current_super_private(),
30750+ get_current_context()->grabbed_blocks - was_grabbed);
30751+ return result;
30752+}
30753+
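+/*
+ * Summary of the disk cluster states assigned above (as used in this file):
+ *   FAKE_DISK_CLUSTER - no items found on disk (a hole or a new cluster);
+ *   UNPR_DISK_CLUSTER - an "unprepped" stub that flush has not converted yet;
+ *   TRNC_DISK_CLUSTER - the cluster is currently being truncated;
+ *   PREP_DISK_CLUSTER - a regular, fully prepared disk cluster.
+ */
+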
30754+int get_disk_cluster_locked(struct cluster_handle * clust, struct inode *inode,
30755+ znode_lock_mode lock_mode)
30756+{
30757+ reiser4_key key;
30758+ ra_info_t ra_info;
30759+
30760+ assert("edward-730", reiser4_schedulable());
30761+ assert("edward-731", clust != NULL);
30762+ assert("edward-732", inode != NULL);
30763+
30764+ if (hint_is_valid(clust->hint)) {
30765+ assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
30766+ assert("edward-1294",
30767+ znode_is_write_locked(clust->hint->lh.node));
30768+ /* already have a valid locked position */
30769+ return (clust->dstat ==
30770+ FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
30771+ CBK_COORD_FOUND);
30772+ }
30773+ key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
30774+ &key);
30775+ ra_info.key_to_stop = key;
30776+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
30777+
30778+ return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
30779+ CBK_FOR_INSERT);
30780+}
30781+
30782+/* Read needed cluster pages before modifying.
30783+   On success, @clust->hint contains the locked position in the tree.
30784+ Also:
30785+ . find and set disk cluster state
30786+ . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
30787+*/
30788+static int read_some_cluster_pages(struct inode * inode,
30789+ struct cluster_handle * clust)
30790+{
30791+ int i;
30792+ int result = 0;
30793+ item_plugin *iplug;
30794+ struct reiser4_slide * win = clust->win;
30795+ znode_lock_mode mode = ZNODE_WRITE_LOCK;
30796+
30797+ iplug = item_plugin_by_id(CTAIL_ID);
30798+
30799+ assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
30800+
30801+#if REISER4_DEBUG
30802+ if (clust->nr_pages == 0) {
30803+ /* start write hole from fake disk cluster */
30804+ assert("edward-1117", win != NULL);
30805+ assert("edward-1118", win->stat == HOLE_WINDOW);
30806+ assert("edward-1119", new_logical_cluster(clust, inode));
30807+ }
30808+#endif
30809+ if (new_logical_cluster(clust, inode)) {
30810+ /*
30811+		   a new page cluster is about to be written; there is nothing to read
30812+ */
30813+ assert("edward-734", reiser4_schedulable());
30814+ assert("edward-735", clust->hint->lh.owner == NULL);
30815+
30816+ if (clust->nr_pages) {
30817+ int off;
30818+ struct page * pg;
30819+ assert("edward-1419", clust->pages != NULL);
30820+ pg = clust->pages[clust->nr_pages - 1];
30821+ assert("edward-1420", pg != NULL);
30822+ off = off_to_pgoff(win->off+win->count+win->delta);
30823+ if (off) {
30824+ lock_page(pg);
30825+ zero_user_page(pg, off, PAGE_CACHE_SIZE - off,
30826+ KM_USER0);
30827+ unlock_page(pg);
30828+ }
30829+ }
30830+ clust->dstat = FAKE_DISK_CLUSTER;
30831+ return 0;
30832+ }
30833+ /*
30834+	   Here we should search for the disk cluster to figure out its real
30835+	   state. There is also one more important reason to do the disk
30836+	   search: we need to make the disk cluster _dirty_ if it exists.
30837+ */
30838+
30839+	/* if a window is specified, read only the pages
30840+	   that will be modified partially */
30841+
30842+ for (i = 0; i < clust->nr_pages; i++) {
30843+ struct page *pg = clust->pages[i];
30844+
30845+ lock_page(pg);
30846+ if (PageUptodate(pg)) {
30847+ unlock_page(pg);
30848+ continue;
30849+ }
30850+ unlock_page(pg);
30851+
30852+ if (win &&
30853+ i >= size_in_pages(win->off) &&
30854+ i < off_to_pg(win->off + win->count + win->delta))
30855+ /* page will be completely overwritten */
30856+ continue;
30857+
30858+ if (win && (i == clust->nr_pages - 1) &&
30859+ /* the last page is
30860+ partially modified,
30861+ not uptodate .. */
30862+ (size_in_pages(i_size_read(inode)) <= pg->index)) {
30863+ /* .. and appended,
30864+ so set zeroes to the rest */
30865+ int offset;
30866+ lock_page(pg);
30867+ assert("edward-1260",
30868+ size_in_pages(win->off + win->count +
30869+ win->delta) - 1 == i);
30870+
30871+ offset =
30872+ off_to_pgoff(win->off + win->count + win->delta);
30873+ zero_user_page(pg, offset, PAGE_CACHE_SIZE - offset,
30874+ KM_USER0);
30875+ unlock_page(pg);
30876+ /* still not uptodate */
30877+ break;
30878+ }
30879+ lock_page(pg);
30880+ result = do_readpage_ctail(inode, clust, pg, mode);
30881+
30882+ assert("edward-1526", ergo(!result, PageUptodate(pg)));
30883+ unlock_page(pg);
30884+ if (result) {
30885+ warning("edward-219", "do_readpage_ctail failed");
30886+ goto out;
30887+ }
30888+ }
30889+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
30890+		/* the disk cluster is unclaimed, but we need to make its
30891+		 * znodes dirty so that flush will convert its content
30892+ */
30893+ result = find_disk_cluster(clust, inode,
30894+ 0 /* do not read items */,
30895+ mode);
30896+ }
30897+ out:
30898+ tfm_cluster_clr_uptodate(&clust->tc);
30899+ return result;
30900+}
30901+
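+/*
+ * To recap read_some_cluster_pages(): a brand new logical cluster needs
+ * no disk I/O at all (only the tail of its last page is zeroed); with a
+ * window set, pages that will be completely overwritten are skipped; the
+ * remaining pages go through do_readpage_ctail(); and if the tfm cluster
+ * did not become uptodate along the way, a final find_disk_cluster() call
+ * (reading no items) dirties the znodes so that flush will convert them.
+ */
+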
30902+static int should_create_unprepped_cluster(struct cluster_handle * clust,
30903+ struct inode * inode)
30904+{
30905+ assert("edward-737", clust != NULL);
30906+
30907+ switch (clust->dstat) {
30908+ case PREP_DISK_CLUSTER:
30909+ case UNPR_DISK_CLUSTER:
30910+ return 0;
30911+ case FAKE_DISK_CLUSTER:
30912+ if (clust->win &&
30913+ clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
30914+ assert("edward-1172",
30915+ new_logical_cluster(clust, inode));
30916+ return 0;
30917+ }
30918+ return 1;
30919+ default:
30920+ impossible("edward-1173", "bad disk cluster state");
30921+ return 0;
30922+ }
30923+}
30924+
30925+static int cryptcompress_make_unprepped_cluster(struct cluster_handle * clust,
30926+ struct inode *inode)
30927+{
30928+ int result;
30929+
30930+ assert("edward-1123", reiser4_schedulable());
30931+ assert("edward-737", clust != NULL);
30932+ assert("edward-738", inode != NULL);
30933+ assert("edward-739", cryptcompress_inode_ok(inode));
30934+ assert("edward-1053", clust->hint != NULL);
30935+
30936+ if (!should_create_unprepped_cluster(clust, inode)) {
30937+ if (clust->reserved) {
30938+ cluster_reserved2free(estimate_insert_cluster(inode));
30939+#if REISER4_DEBUG
30940+ assert("edward-1267",
30941+ clust->reserved_unprepped ==
30942+ estimate_insert_cluster(inode));
30943+ clust->reserved_unprepped -=
30944+ estimate_insert_cluster(inode);
30945+#endif
30946+ }
30947+ return 0;
30948+ }
30949+ assert("edward-1268", clust->reserved);
30950+ cluster_reserved2grabbed(estimate_insert_cluster(inode));
30951+#if REISER4_DEBUG
30952+ assert("edward-1441",
30953+ clust->reserved_unprepped == estimate_insert_cluster(inode));
30954+ clust->reserved_unprepped -= estimate_insert_cluster(inode);
30955+#endif
30956+ result = ctail_insert_unprepped_cluster(clust, inode);
30957+ if (result)
30958+ return result;
30959+
30960+ inode_add_bytes(inode, inode_cluster_size(inode));
30961+
30962+ assert("edward-743", cryptcompress_inode_ok(inode));
30963+ assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
30964+
30965+ clust->dstat = UNPR_DISK_CLUSTER;
30966+ return 0;
30967+}
30968+
30969+/* . Grab page cluster for read, write, setattr, etc. operations;
30970+ * . Truncate its complete pages, if needed;
30971+ */
30972+int prepare_page_cluster(struct inode * inode, struct cluster_handle * clust,
30973+ rw_op rw)
30974+{
30975+ assert("edward-177", inode != NULL);
30976+ assert("edward-741", cryptcompress_inode_ok(inode));
30977+ assert("edward-740", clust->pages != NULL);
30978+
30979+ set_cluster_nrpages(clust, inode);
30980+ reset_cluster_pgset(clust, cluster_nrpages(inode));
30981+ return grab_page_cluster(inode, clust, rw);
30982+}
30983+
30984+/* Truncate complete page cluster of index @index.
30985+ * This is called by ->kill_hook() method of item
30986+ * plugin when deleting a disk cluster of such index.
30987+ */
30988+void truncate_complete_page_cluster(struct inode *inode, cloff_t index,
30989+ int even_cows)
30990+{
30991+ int found;
30992+ int nr_pages;
30993+ jnode *node;
30994+ struct page *pages[MAX_CLUSTER_NRPAGES];
30995+
30996+ node = jlookup(current_tree, get_inode_oid(inode),
30997+ clust_to_pg(index, inode));
30998+ nr_pages = size_in_pages(lbytes(index, inode));
30999+ assert("edward-1483", nr_pages != 0);
31000+ if (!node)
31001+ goto truncate;
31002+ found = find_get_pages(inode->i_mapping,
31003+ clust_to_pg(index, inode),
31004+ cluster_nrpages(inode), pages);
31005+ if (!found) {
31006+ assert("edward-1484", jnode_truncate_ok(inode, index));
31007+ return;
31008+ }
31009+ lock_cluster(node);
31010+
31011+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS)
31012+ && index == 0)
31013+ /* converting to unix_file is in progress */
31014+ JF_CLR(node, JNODE_CLUSTER_PAGE);
31015+ if (JF_ISSET(node, JNODE_DIRTY)) {
31016+ /*
31017+ * @nr_pages were checked in, but not yet checked out -
31018+ * we need to release them. (also there can be pages
31019+ * attached to page cache by read(), etc. - don't take
31020+ * them into account).
31021+ */
31022+ assert("edward-1198", found >= nr_pages);
31023+
31024+ /* free disk space grabbed for disk cluster converting */
31025+ cluster_reserved2grabbed(estimate_update_cluster(inode));
31026+ grabbed2free(get_current_context(),
31027+ get_current_super_private(),
31028+ estimate_update_cluster(inode));
31029+ __put_page_cluster(0, nr_pages, pages, inode);
31030+
31031+ /* This will clear dirty bit, uncapture and unlock jnode */
31032+ unlock_cluster_uncapture(node);
31033+ } else
31034+ unlock_cluster(node);
31035+ jput(node); /* jlookup */
31036+ put_found_pages(pages, found); /* find_get_pages */
31037+ truncate:
31038+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) &&
31039+ index == 0)
31040+ return;
31041+ truncate_page_cluster_range(inode, pages, index, 0,
31042+ cluster_nrpages(inode),
31043+ even_cows);
31044+ assert("edward-1201",
31045+ ergo(!reiser4_inode_get_flag(inode,
31046+ REISER4_FILE_CONV_IN_PROGRESS),
31047+ jnode_truncate_ok(inode, index)));
31048+ return;
31049+}
31050+
31051+/*
31052+ * Set cluster handle @clust of a logical cluster before
31053+ * modifications which are supposed to be committed.
31054+ *
31055+ * . grab cluster pages;
31056+ * . reserve disk space;
31057+ * . maybe read pages from disk and set the disk cluster dirty;
31058+ * . maybe write hole and check in (partially zeroed) logical cluster;
31059+ * . create 'unprepped' disk cluster for new or fake logical one.
31060+ */
31061+static int prepare_logical_cluster(struct inode *inode,
31062+ loff_t file_off, /* write position
31063+ in the file */
31064+				   loff_t to_file, /* bytes of user's data
31065+ to write to the file */
31066+ struct cluster_handle * clust,
31067+ logical_cluster_op op)
31068+{
31069+ int result = 0;
31070+ struct reiser4_slide * win = clust->win;
31071+
31072+ reset_cluster_params(clust);
31073+ cluster_set_tfm_act(&clust->tc, TFMA_READ);
31074+#if REISER4_DEBUG
31075+ clust->ctx = get_current_context();
31076+#endif
31077+ assert("edward-1190", op != LC_INVAL);
31078+
31079+ clust->op = op;
31080+
31081+ result = prepare_page_cluster(inode, clust, WRITE_OP);
31082+ if (result)
31083+ return result;
31084+ assert("edward-1447",
31085+ ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
31086+ assert("edward-1448",
31087+ ergo(clust->nr_pages != 0,
31088+ jnode_is_cluster_page(jprivate(clust->pages[0]))));
31089+
31090+ result = reserve4cluster(inode, clust);
31091+ if (result)
31092+ goto err1;
31093+ result = read_some_cluster_pages(inode, clust);
31094+ if (result) {
31095+ free_reserved4cluster(inode,
31096+ clust,
31097+ estimate_update_cluster(inode) +
31098+ estimate_insert_cluster(inode));
31099+ goto err1;
31100+ }
31101+ assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
31102+
31103+ result = cryptcompress_make_unprepped_cluster(clust, inode);
31104+ if (result)
31105+ goto err2;
31106+ if (win && win->stat == HOLE_WINDOW) {
31107+ result = write_hole(inode, clust, file_off, to_file);
31108+ if (result)
31109+ goto err2;
31110+ }
31111+ return 0;
31112+ err2:
31113+ free_reserved4cluster(inode, clust,
31114+ estimate_update_cluster(inode));
31115+ err1:
31116+ put_page_cluster(clust, inode, WRITE_OP);
31117+ assert("edward-1125", result == -ENOSPC);
31118+ return result;
31119+}
31120+
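+/*
+ * Reservation discipline in prepare_logical_cluster() (a reading of the
+ * error paths above): err2 frees only the update reservation, since the
+ * insert reservation has already been consumed or released by
+ * cryptcompress_make_unprepped_cluster(); err1 itself frees no
+ * reservation and only drops the page references - when
+ * read_some_cluster_pages() fails, both reservations are freed right
+ * before the jump.
+ */
+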
31121+/* set window by two offsets */
31122+static void set_window(struct cluster_handle * clust,
31123+ struct reiser4_slide * win, struct inode *inode,
31124+ loff_t o1, loff_t o2)
31125+{
31126+ assert("edward-295", clust != NULL);
31127+ assert("edward-296", inode != NULL);
31128+ assert("edward-1071", win != NULL);
31129+ assert("edward-297", o1 <= o2);
31130+
31131+ clust->index = off_to_clust(o1, inode);
31132+
31133+ win->off = off_to_cloff(o1, inode);
31134+ win->count = min((loff_t)(inode_cluster_size(inode) - win->off),
31135+ o2 - o1);
31136+ win->delta = 0;
31137+
31138+ clust->win = win;
31139+}
31140+
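+/*
+ * Worked example (assuming a 64K logical cluster): for a 100-byte write
+ * at offset 65530, set_window(clust, win, inode, 65530, 65630) yields
+ * clust->index = 0, win->off = 65530 and win->count = min(6, 100) = 6;
+ * the remaining 94 bytes belong to the next window iteration.
+ */
+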
31141+static int set_cluster_by_window(struct inode *inode,
31142+ struct cluster_handle * clust,
31143+ struct reiser4_slide * win, size_t length,
31144+ loff_t file_off)
31145+{
31146+ int result;
31147+
31148+ assert("edward-197", clust != NULL);
31149+ assert("edward-1072", win != NULL);
31150+ assert("edward-198", inode != NULL);
31151+
31152+ result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
31153+ if (result)
31154+ return result;
31155+
31156+ if (file_off > i_size_read(inode)) {
31157+ /* Uhmm, hole in cryptcompress file... */
31158+ loff_t hole_size;
31159+ hole_size = file_off - inode->i_size;
31160+
31161+ set_window(clust, win, inode, inode->i_size, file_off);
31162+ win->stat = HOLE_WINDOW;
31163+ if (win->off + hole_size < inode_cluster_size(inode))
31164+ /* there is also user's data to append to the hole */
31165+ win->delta = min(inode_cluster_size(inode) -
31166+ (win->off + win->count), length);
31167+ return 0;
31168+ }
31169+ set_window(clust, win, inode, file_off, file_off + length);
31170+ win->stat = DATA_WINDOW;
31171+ return 0;
31172+}
31173+
31174+int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
31175+ int count)
31176+{
31177+ int result = 0;
31178+ int (*setting_actor)(struct cluster_handle * clust, int count);
31179+
31180+ assert("edward-1358", clust != NULL);
31181+ assert("edward-1359", page != NULL);
31182+ assert("edward-1360", page->mapping != NULL);
31183+ assert("edward-1361", page->mapping->host != NULL);
31184+
31185+ setting_actor =
31186+ (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
31187+ result = setting_actor(clust, count);
31188+ clust->index = pg_to_clust(page->index, page->mapping->host);
31189+ return result;
31190+}
31191+
31192+/* reset all the params that don't get updated */
31193+void reset_cluster_params(struct cluster_handle * clust)
31194+{
31195+ assert("edward-197", clust != NULL);
31196+
31197+ clust->dstat = INVAL_DISK_CLUSTER;
31198+ clust->tc.uptodate = 0;
31199+ clust->tc.len = 0;
31200+}
31201+
31202+/* the heart of write_cryptcompress */
31203+static loff_t do_write_cryptcompress(struct file *file, struct inode *inode,
31204+ const char __user *buf, size_t to_write,
31205+ loff_t pos, struct psched_context *cont)
31206+{
31207+ int i;
31208+ hint_t *hint;
31209+ int result = 0;
31210+ size_t count;
31211+ struct reiser4_slide win;
31212+ struct cluster_handle clust;
31213+ struct cryptcompress_info * info;
31214+
31215+ assert("edward-154", buf != NULL);
31216+ assert("edward-161", reiser4_schedulable());
31217+ assert("edward-748", cryptcompress_inode_ok(inode));
31218+ assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
31219+ assert("edward-1274", get_current_context()->grabbed_blocks == 0);
31220+
31221+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31222+ if (hint == NULL)
31223+ return RETERR(-ENOMEM);
31224+
31225+ result = load_file_hint(file, hint);
31226+ if (result) {
31227+ kfree(hint);
31228+ return result;
31229+ }
31230+ count = to_write;
31231+
31232+ reiser4_slide_init(&win);
31233+ cluster_init_read(&clust, &win);
31234+ clust.hint = hint;
31235+ info = cryptcompress_inode_data(inode);
31236+
31237+ mutex_lock(&info->checkin_mutex);
31238+
31239+ result = set_cluster_by_window(inode, &clust, &win, to_write, pos);
31240+ if (result)
31241+ goto out;
31242+
31243+ if (next_window_stat(&win) == HOLE_WINDOW) {
31244+		/* write the hole in this iteration,
31245+		   separately from the loop below */
31246+ result = write_pschedule_hook(file, inode,
31247+ pos,
31248+ &clust,
31249+ cont);
31250+ if (result)
31251+ goto out;
31252+ result = prepare_logical_cluster(inode, pos, count, &clust,
31253+ LC_APPOV);
31254+ if (result)
31255+ goto out;
31256+ }
31257+ do {
31258+ const char __user * src;
31259+ unsigned page_off, to_page;
31260+
31261+ assert("edward-750", reiser4_schedulable());
31262+
31263+ result = write_pschedule_hook(file, inode,
31264+ pos + to_write - count,
31265+ &clust,
31266+ cont);
31267+ if (result)
31268+ goto out;
31269+ if (cont->state == PSCHED_ASSIGNED_NEW)
31270+ goto out_no_release;
31271+
31272+ result = prepare_logical_cluster(inode, pos, count, &clust,
31273+ LC_APPOV);
31274+ if (result)
31275+ goto out;
31276+
31277+ assert("edward-751", cryptcompress_inode_ok(inode));
31278+ assert("edward-204", win.stat == DATA_WINDOW);
31279+ assert("edward-1288", hint_is_valid(clust.hint));
31280+ assert("edward-752",
31281+ znode_is_write_locked(hint->ext_coord.coord.node));
31282+ put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
31283+
31284+ /* set write position in page */
31285+ page_off = off_to_pgoff(win.off);
31286+
31287+ /* copy user's data to cluster pages */
31288+ for (i = off_to_pg(win.off), src = buf;
31289+ i < size_in_pages(win.off + win.count);
31290+ i++, src += to_page) {
31291+ to_page = __mbp(win.off + win.count, i) - page_off;
31292+ assert("edward-1039",
31293+ page_off + to_page <= PAGE_CACHE_SIZE);
31294+ assert("edward-287", clust.pages[i] != NULL);
31295+
31296+ fault_in_pages_readable(src, to_page);
31297+
31298+ lock_page(clust.pages[i]);
31299+ result =
31300+ __copy_from_user((char *)kmap(clust.pages[i]) +
31301+ page_off, src, to_page);
31302+ kunmap(clust.pages[i]);
31303+ if (unlikely(result)) {
31304+ unlock_page(clust.pages[i]);
31305+ result = -EFAULT;
31306+ goto err2;
31307+ }
31308+ SetPageUptodate(clust.pages[i]);
31309+ reiser4_set_page_dirty_internal(clust.pages[i]);
31310+ flush_dcache_page(clust.pages[i]);
31311+ mark_page_accessed(clust.pages[i]);
31312+ unlock_page(clust.pages[i]);
31313+ page_off = 0;
31314+ }
31315+ assert("edward-753", cryptcompress_inode_ok(inode));
31316+
31317+ result = checkin_logical_cluster(&clust, inode);
31318+ if (result)
31319+ goto err2;
31320+
31321+ buf += win.count;
31322+ count -= win.count;
31323+
31324+ result = balance_dirty_page_cluster(&clust, inode, 0, count,
31325+ win_count_to_nrpages(&win));
31326+ if (result)
31327+ goto err1;
31328+ assert("edward-755", hint->lh.owner == NULL);
31329+ reset_cluster_params(&clust);
31330+ continue;
31331+ err2:
31332+ put_page_cluster(&clust, inode, WRITE_OP);
31333+ err1:
31334+ if (clust.reserved)
31335+ free_reserved4cluster(inode,
31336+ &clust,
31337+ estimate_update_cluster(inode));
31338+ break;
31339+ } while (count);
31340+ out:
31341+ done_lh(&hint->lh);
31342+ mutex_unlock(&info->checkin_mutex);
31343+ save_file_hint(file, hint);
31344+ out_no_release:
31345+ kfree(hint);
31346+ put_cluster_handle(&clust);
31347+ assert("edward-195",
31348+ ergo((to_write == count),
31349+ (result < 0 || cont->state == PSCHED_ASSIGNED_NEW)));
31350+ return (to_write - count) ? (to_write - count) : result;
31351+}
31352+
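+/*
+ * Note the return convention above: if at least one byte was written,
+ * the (possibly short) byte count is returned and any error from the
+ * failed iteration is dropped - the usual short-write semantics; the
+ * error code is returned only when nothing was written at all.
+ */
+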
31353+/**
31354+ * plugin->write()
31355+ * @file: file to write to
31356+ * @buf: address of user-space buffer
31357+ * @count: number of bytes to write
31358+ * @off: position in file to write to
31359+ */
31360+ssize_t write_cryptcompress(struct file *file, const char __user *buf,
31361+ size_t count, loff_t *off,
31362+ struct psched_context *cont)
31363+{
31364+ ssize_t result;
31365+ struct inode *inode;
31366+ reiser4_context *ctx;
31367+ loff_t pos = *off;
31368+ struct cryptcompress_info *info;
31369+
31370+ assert("edward-1449", cont->state == PSCHED_INVAL_STATE);
31371+
31372+ inode = file->f_dentry->d_inode;
31373+ assert("edward-196", cryptcompress_inode_ok(inode));
31374+
31375+ info = cryptcompress_inode_data(inode);
31376+ ctx = get_current_context();
31377+
31378+ result = generic_write_checks(file, &pos, &count, 0);
31379+ if (unlikely(result != 0)) {
31380+ context_set_commit_async(ctx);
31381+ return result;
31382+ }
31383+ if (unlikely(count == 0))
31384+ return 0;
31385+ result = remove_suid(file->f_dentry);
31386+ if (unlikely(result != 0)) {
31387+ context_set_commit_async(ctx);
31388+ return result;
31389+ }
31390+ /* remove_suid might create a transaction */
31391+ reiser4_txn_restart(ctx);
31392+
31393+ result = do_write_cryptcompress(file, inode, buf, count, pos, cont);
31394+
31395+ if (unlikely(result < 0)) {
31396+ context_set_commit_async(ctx);
31397+ return result;
31398+ }
31399+ /* update position in a file */
31400+ *off = pos + result;
31401+ return result;
31402+}
31403+
31404+/* plugin->readpages */
31405+int readpages_cryptcompress(struct file *file, struct address_space *mapping,
31406+ struct list_head *pages, unsigned nr_pages)
31407+{
31408+ reiser4_context * ctx;
31409+ int ret;
31410+
31411+ ctx = reiser4_init_context(mapping->host->i_sb);
31412+ if (IS_ERR(ctx)) {
31413+ ret = PTR_ERR(ctx);
31414+ goto err;
31415+ }
31416+ /* cryptcompress file can be built of ctail items only */
31417+ ret = readpages_ctail(file, mapping, pages);
31418+ reiser4_txn_restart(ctx);
31419+ reiser4_exit_context(ctx);
31420+ if (ret) {
31421+err:
31422+ put_pages_list(pages);
31423+ }
31424+ return ret;
31425+}
31426+
31427+static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
31428+{
31429+ /* reserve one block to update stat data item */
31430+ assert("edward-1193",
31431+ inode_file_plugin(inode)->estimate.update ==
31432+ estimate_update_common);
31433+ return estimate_update_common(inode);
31434+}
31435+
31436+/**
31437+ * plugin->read
31438+ * @file: file to read from
31439+ * @buf: address of user-space buffer
31440+ * @size: number of bytes to read
31441+ * @off: position in file to read from
31442+ */
31443+ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
31444+ loff_t * off)
31445+{
31446+ ssize_t result;
31447+ struct inode *inode;
31448+ reiser4_context *ctx;
31449+ struct cryptcompress_info *info;
31450+ reiser4_block_nr needed;
31451+
31452+ inode = file->f_dentry->d_inode;
31453+ assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
31454+
31455+ ctx = reiser4_init_context(inode->i_sb);
31456+ if (IS_ERR(ctx))
31457+ return PTR_ERR(ctx);
31458+
31459+ info = cryptcompress_inode_data(inode);
31460+ needed = cryptcompress_estimate_read(inode);
31461+
31462+ result = reiser4_grab_space(needed, BA_CAN_COMMIT);
31463+ if (result != 0) {
31464+ reiser4_exit_context(ctx);
31465+ return result;
31466+ }
31467+ result = do_sync_read(file, buf, size, off);
31468+
31469+ context_set_commit_async(ctx);
31470+ reiser4_exit_context(ctx);
31471+
31472+ return result;
31473+}
31474+
31475+/* Look for a disk cluster and keep lookup result in @found.
31476+ * If @index > 0, then find disk cluster of the index (@index - 1);
31477+ * If @index == 0, then find the rightmost disk cluster.
31478+ * Keep incremented index of the found disk cluster in @found.
31479+ * @found == 0 means that the disk cluster was not found (in the latter
31480+ * case (@index == 0) it means that the file doesn't have disk clusters).
31481+ */
31482+static int lookup_disk_cluster(struct inode *inode, cloff_t * found,
31483+ cloff_t index)
31484+{
31485+ int result;
31486+ reiser4_key key;
31487+ loff_t offset;
31488+ hint_t *hint;
31489+ lock_handle *lh;
31490+ lookup_bias bias;
31491+ coord_t *coord;
31492+ item_plugin *iplug;
31493+
31494+ assert("edward-1131", inode != NULL);
31495+ assert("edward-95", cryptcompress_inode_ok(inode));
31496+
31497+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31498+ if (hint == NULL)
31499+ return RETERR(-ENOMEM);
31500+ hint_init_zero(hint);
31501+ lh = &hint->lh;
31502+
31503+ bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
31504+ offset =
31505+ (index ? clust_to_off(index, inode) -
31506+ 1 : get_key_offset(reiser4_max_key()));
31507+
31508+ key_by_inode_cryptcompress(inode, offset, &key);
31509+
31510+ /* find the last item of this object */
31511+ result =
31512+ find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
31513+ bias, 0);
31514+ if (cbk_errored(result)) {
31515+ done_lh(lh);
31516+ kfree(hint);
31517+ return result;
31518+ }
31519+ if (result == CBK_COORD_NOTFOUND) {
31520+ /* no real disk clusters */
31521+ done_lh(lh);
31522+ kfree(hint);
31523+ *found = 0;
31524+ return 0;
31525+ }
31526+ /* disk cluster is found */
31527+ coord = &hint->ext_coord.coord;
31528+ coord_clear_iplug(coord);
31529+ result = zload(coord->node);
31530+ if (unlikely(result)) {
31531+ done_lh(lh);
31532+ kfree(hint);
31533+ return result;
31534+ }
31535+ iplug = item_plugin_by_coord(coord);
31536+ assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
31537+ assert("edward-1202", ctail_ok(coord));
31538+
31539+ item_key_by_coord(coord, &key);
31540+ *found = off_to_clust(get_key_offset(&key), inode) + 1;
31541+
31542+ assert("edward-1132", ergo(index, index == *found));
31543+
31544+ zrelse(coord->node);
31545+ done_lh(lh);
31546+ kfree(hint);
31547+ return 0;
31548+}
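+/*
+ * A minimal usage sketch of the contract above (illustrative only;
+ * it assumes a file whose rightmost disk cluster has index 7):
+ *
+ *	cloff_t found;
+ *	lookup_disk_cluster(inode, &found, 0);	sets found == 8 (rightmost)
+ *	lookup_disk_cluster(inode, &found, 8);	sets found == 8 (cluster 7)
+ *	lookup_disk_cluster(inode, &found, 3);	sets found == 3, or 0 if
+ *						cluster 2 is a hole
+ *
+ * @found is always "index of the found cluster + 1", which leaves 0
+ * free to mean "nothing found".
+ */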
31549+
31550+static int find_fake_appended(struct inode *inode, cloff_t * index)
31551+{
31552+ return lookup_disk_cluster(inode, index,
31553+ 0 /* find last real one */ );
31554+}
31555+
31556+/* Set left coord when unit is not found after node_lookup()
31557+ This takes into account that there can be holes in a sequence
31558+ of disk clusters */
31559+
31560+static void adjust_left_coord(coord_t * left_coord)
31561+{
31562+ switch (left_coord->between) {
31563+ case AFTER_UNIT:
31564+ left_coord->between = AFTER_ITEM;
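+		/* fall through */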
31565+ case AFTER_ITEM:
31566+ case BEFORE_UNIT:
31567+ break;
31568+ default:
31569+ impossible("edward-1204", "bad left coord to cut");
31570+ }
31571+ return;
31572+}
31573+
31574+#define CRC_CUT_TREE_MIN_ITERATIONS 64
31575+
31576+/* plugin->cut_tree_worker */
31577+int cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
31578+ const reiser4_key * to_key,
31579+ reiser4_key * smallest_removed,
31580+ struct inode *object, int truncate,
31581+ int *progress)
31582+{
31583+ lock_handle next_node_lock;
31584+ coord_t left_coord;
31585+ int result;
31586+
31587+ assert("edward-1158", tap->coord->node != NULL);
31588+ assert("edward-1159", znode_is_write_locked(tap->coord->node));
31589+ assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
31590+
31591+ *progress = 0;
31592+ init_lh(&next_node_lock);
31593+
31594+ while (1) {
31595+ znode *node; /* node from which items are cut */
31596+ node_plugin *nplug; /* node plugin for @node */
31597+
31598+ node = tap->coord->node;
31599+
31600+ /* Move next_node_lock to the next node on the left. */
31601+ result =
31602+ reiser4_get_left_neighbor(&next_node_lock, node,
31603+ ZNODE_WRITE_LOCK,
31604+ GN_CAN_USE_UPPER_LEVELS);
31605+ if (result != 0 && result != -E_NO_NEIGHBOR)
31606+ break;
31607+		/* FIXME-EDWARD: Check whether we can delete the node as a whole. */
31608+ result = reiser4_tap_load(tap);
31609+ if (result)
31610+ return result;
31611+
31612+ /* Prepare the second (right) point for cut_node() */
31613+ if (*progress)
31614+ coord_init_last_unit(tap->coord, node);
31615+
31616+ else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
31617+ /* set rightmost unit for the items without lookup method */
31618+ tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
31619+
31620+ nplug = node->nplug;
31621+
31622+ assert("edward-1161", nplug);
31623+ assert("edward-1162", nplug->lookup);
31624+
31625+ /* left_coord is leftmost unit cut from @node */
31626+ result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
31627+
31628+ if (IS_CBKERR(result))
31629+ break;
31630+
31631+ if (result == CBK_COORD_NOTFOUND)
31632+ adjust_left_coord(&left_coord);
31633+
31634+ /* adjust coordinates so that they are set to existing units */
31635+ if (coord_set_to_right(&left_coord)
31636+ || coord_set_to_left(tap->coord)) {
31637+ result = 0;
31638+ break;
31639+ }
31640+
31641+ if (coord_compare(&left_coord, tap->coord) ==
31642+ COORD_CMP_ON_RIGHT) {
31643+ /* keys from @from_key to @to_key are not in the tree */
31644+ result = 0;
31645+ break;
31646+ }
31647+
31648+ /* cut data from one node */
31649+ *smallest_removed = *reiser4_min_key();
31650+ result = kill_node_content(&left_coord,
31651+ tap->coord,
31652+ from_key,
31653+ to_key,
31654+ smallest_removed,
31655+ next_node_lock.node,
31656+ object, truncate);
31657+ reiser4_tap_relse(tap);
31658+
31659+ if (result)
31660+ break;
31661+
31662+ ++(*progress);
31663+
31664+ /* Check whether all items with keys >= from_key were removed
31665+ * from the tree. */
31666+ if (keyle(smallest_removed, from_key))
31667+ /* result = 0; */
31668+ break;
31669+
31670+ if (next_node_lock.node == NULL)
31671+ break;
31672+
31673+ result = reiser4_tap_move(tap, &next_node_lock);
31674+ done_lh(&next_node_lock);
31675+ if (result)
31676+ break;
31677+
31678+ /* Break long cut_tree operation (deletion of a large file) if
31679+ * atom requires commit. */
31680+ if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
31681+ && current_atom_should_commit()) {
31682+ result = -E_REPEAT;
31683+ break;
31684+ }
31685+ }
31686+ done_lh(&next_node_lock);
31687+ return result;
31688+}
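+/*
+ * A rough sketch of how a caller is expected to drive the worker above:
+ * -E_REPEAT is not an error but a request to commit and retry, so the
+ * surrounding deletion loop (cf. cut_file_items() in file.c) looks
+ * approximately like
+ *
+ *	do {
+ *		result = reiser4_cut_tree_object(...);
+ *		if (result == -E_REPEAT)
+ *			update_actor(...);	update sd, allow commit
+ *	} while (result == -E_REPEAT);
+ */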
31689+
31690+/* Append or expand hole in two steps:
31691+ * 1) set zeroes to the rightmost page of the rightmost non-fake
31692+ * logical cluster;
31693+ * 2) expand hole via fake logical clusters (just increase i_size)
31694+ */
31695+static int cryptcompress_append_hole(struct inode *inode /* with old size */,
31696+ loff_t new_size)
31697+{
31698+ int result = 0;
31699+ hint_t *hint;
31700+ lock_handle *lh;
31701+ loff_t hole_size;
31702+ int nr_zeroes;
31703+ struct reiser4_slide win;
31704+ struct cluster_handle clust;
31705+
31706+ assert("edward-1133", inode->i_size < new_size);
31707+ assert("edward-1134", reiser4_schedulable());
31708+ assert("edward-1135", cryptcompress_inode_ok(inode));
31709+ assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
31710+ assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
31711+
31712+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31713+ if (hint == NULL)
31714+ return RETERR(-ENOMEM);
31715+ hint_init_zero(hint);
31716+ lh = &hint->lh;
31717+
31718+ reiser4_slide_init(&win);
31719+ cluster_init_read(&clust, &win);
31720+ clust.hint = hint;
31721+
31722+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
31723+ if (result)
31724+ goto out;
31725+ if (off_to_cloff(inode->i_size, inode) == 0)
31726+ goto append_fake;
31727+ hole_size = new_size - inode->i_size;
31728+ nr_zeroes =
31729+ inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
31730+ if (hole_size < nr_zeroes)
31731+ nr_zeroes = hole_size;
31732+ set_window(&clust, &win, inode, inode->i_size,
31733+ inode->i_size + nr_zeroes);
31734+ win.stat = HOLE_WINDOW;
31735+
31736+ assert("edward-1137",
31737+ clust.index == off_to_clust(inode->i_size, inode));
31738+
31739+ result = prepare_logical_cluster(inode, 0, 0, &clust, LC_APPOV);
31740+
31741+ assert("edward-1271", !result || result == -ENOSPC);
31742+ if (result)
31743+ goto out;
31744+ assert("edward-1139",
31745+ clust.dstat == PREP_DISK_CLUSTER ||
31746+ clust.dstat == UNPR_DISK_CLUSTER);
31747+
31748+ assert("edward-1431", hole_size >= nr_zeroes);
31749+ if (hole_size == nr_zeroes)
31750+ /* nothing to append anymore */
31751+ goto out;
31752+ append_fake:
31753+ INODE_SET_SIZE(inode, new_size);
31754+ out:
31755+ done_lh(lh);
31756+ kfree(hint);
31757+ put_cluster_handle(&clust);
31758+ return result;
31759+}
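+/*
+ * A worked example of the two steps above (illustrative numbers;
+ * 64K logical cluster assumed): with i_size == 70000 and
+ * new_size == 1000000,
+ *
+ *	off_to_cloff(i_size) == 70000 - 65536   == 4464
+ *	nr_zeroes            == 65536 - 4464    == 61072
+ *	hole_size            == 1000000 - 70000 == 930000
+ *
+ * step 1 zeroes bytes [70000, 131072) of cluster 1, and step 2 covers
+ * the remaining 868928 bytes by merely raising i_size.
+ */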
31760+
31761+static int update_cryptcompress_size(struct inode *inode, loff_t new_size,
31762+ int update_sd)
31763+{
31764+ return (new_size & ((loff_t) (inode_cluster_size(inode)) - 1)
31765+ ? 0 : reiser4_update_file_size(inode, new_size, update_sd));
31766+}
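+/*
+ * In other words, the size is written back only when @new_size is
+ * aligned to a logical cluster boundary: with a 64K cluster (mask
+ * 0xffff), new_size == 131072 triggers reiser4_update_file_size(),
+ * while new_size == 131073 is skipped, since a partially truncated
+ * cluster is finished off (and i_size set) by the caller.
+ */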
31767+
31768+/* Prune cryptcompress file in two steps:
31769+ * 1) cut all nominated logical clusters except the leftmost one which
31770+ * is to be partially truncated. Note that there can be "holes"
31771+ * represented by fake logical clusters.
31772+ * 2) set zeroes and capture leftmost partially truncated logical
31773+ * cluster, if it is not fake; otherwise prune fake logical cluster
31774+ * (just decrease i_size).
31775+ */
31776+static int prune_cryptcompress(struct inode *inode, loff_t new_size,
31777+ int update_sd, cloff_t aidx)
31778+{
31779+ int result = 0;
31780+ unsigned nr_zeroes;
31781+ loff_t to_prune;
31782+ loff_t old_size;
31783+ cloff_t ridx;
31784+
31785+ hint_t *hint;
31786+ lock_handle *lh;
31787+ struct reiser4_slide win;
31788+ struct cluster_handle clust;
31789+
31790+ assert("edward-1140", inode->i_size >= new_size);
31791+ assert("edward-1141", reiser4_schedulable());
31792+ assert("edward-1142", cryptcompress_inode_ok(inode));
31793+ assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
31794+
31795+ old_size = inode->i_size;
31796+
31797+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31798+ if (hint == NULL)
31799+ return RETERR(-ENOMEM);
31800+ hint_init_zero(hint);
31801+ lh = &hint->lh;
31802+
31803+ reiser4_slide_init(&win);
31804+ cluster_init_read(&clust, &win);
31805+ clust.hint = hint;
31806+
31807+ /* calculate index of the rightmost logical cluster
31808+ that will be completely truncated */
31809+ ridx = size_in_lc(new_size, inode);
31810+
31811+ /* truncate all disk clusters starting from @ridx */
31812+ assert("edward-1174", ridx <= aidx);
31813+ old_size = inode->i_size;
31814+ if (ridx != aidx) {
31815+ struct cryptcompress_info * info;
31816+ info = cryptcompress_inode_data(inode);
31817+ result = cut_file_items(inode,
31818+ clust_to_off(ridx, inode),
31819+ update_sd,
31820+ clust_to_off(aidx, inode),
31821+ update_cryptcompress_size);
31822+ info->trunc_index = ULONG_MAX;
31823+ if (result)
31824+ goto out;
31825+ }
31826+ /*
31827+ * there can be pages of fake logical clusters, truncate them
31828+ */
31829+ truncate_inode_pages(inode->i_mapping, clust_to_off(ridx, inode));
31830+ assert("edward-1524",
31831+ pages_truncate_ok(inode, clust_to_pg(ridx, inode)));
31832+ /*
31833+ * now perform partial truncate of last logical cluster
31834+ */
31835+ if (!off_to_cloff(new_size, inode)) {
31836+ /* no partial truncate is needed */
31837+ assert("edward-1145", inode->i_size == new_size);
31838+ goto truncate_fake;
31839+ }
31840+ assert("edward-1146", new_size < inode->i_size);
31841+
31842+ to_prune = inode->i_size - new_size;
31843+
31844+ /* check if the last logical cluster is fake */
31845+ result = lookup_disk_cluster(inode, &aidx, ridx);
31846+ if (result)
31847+ goto out;
31848+ if (!aidx)
31849+ /* yup, this is fake one */
31850+ goto truncate_fake;
31851+
31852+ assert("edward-1148", aidx == ridx);
31853+
31854+ /* do partial truncate of the last page cluster,
31855+ and try to capture this one */
31856+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
31857+ if (result)
31858+ goto out;
31859+ nr_zeroes = (off_to_pgoff(new_size) ?
31860+ PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
31861+ set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
31862+ win.stat = HOLE_WINDOW;
31863+
31864+ assert("edward-1149", clust.index == ridx - 1);
31865+
31866+ result = prepare_logical_cluster(inode, 0, 0, &clust, LC_TRUNC);
31867+ if (result)
31868+ goto out;
31869+ assert("edward-1151",
31870+ clust.dstat == PREP_DISK_CLUSTER ||
31871+ clust.dstat == UNPR_DISK_CLUSTER);
31872+
31873+ assert("edward-1191", inode->i_size == new_size);
31874+ assert("edward-1206", body_truncate_ok(inode, ridx));
31875+ truncate_fake:
31876+ /* drop all the pages that don't have jnodes (i.e. pages
31877+ which can not be truncated by cut_file_items() because
31878+ of holes represented by fake disk clusters) including
31879+ the pages of partially truncated cluster which was
31880+ released by prepare_logical_cluster() */
31881+ INODE_SET_SIZE(inode, new_size);
31882+ truncate_inode_pages(inode->i_mapping, new_size);
31883+ out:
31884+ assert("edward-1334", !result || result == -ENOSPC);
31885+ assert("edward-1497",
31886+ pages_truncate_ok(inode, size_in_pages(new_size)));
31887+
31888+ done_lh(lh);
31889+ kfree(hint);
31890+ put_cluster_handle(&clust);
31891+ return result;
31892+}
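+/*
+ * Index arithmetic of the above, by example (illustrative; 64K
+ * clusters and 4K pages assumed): truncating from i_size == 300000
+ * down to new_size == 70000 gives ridx == 2, so disk clusters 2 and up
+ * are cut as a whole, while cluster 1 is only zeroed from offset 70000
+ * to the end of its page ([70000, 73728)) and captured again.
+ */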
31893+
31894+/* Prepare cryptcompress file for truncate:
31895+ * prune or append rightmost fake logical clusters (if any)
31896+ */
31897+static int start_truncate_fake(struct inode *inode, cloff_t aidx,
31898+ loff_t new_size, int update_sd)
31899+{
31900+ int result = 0;
31901+ int bytes;
31902+
31903+ if (new_size > inode->i_size) {
31904+ /* append */
31905+ if (inode->i_size < clust_to_off(aidx, inode))
31906+ /* no fake bytes */
31907+ return 0;
31908+ bytes = new_size - inode->i_size;
31909+ INODE_SET_SIZE(inode, inode->i_size + bytes);
31910+ } else {
31911+ /* prune */
31912+ if (inode->i_size <= clust_to_off(aidx, inode))
31913+ /* no fake bytes */
31914+ return 0;
31915+ bytes = inode->i_size -
31916+ max(new_size, clust_to_off(aidx, inode));
31917+ if (!bytes)
31918+ return 0;
31919+ INODE_SET_SIZE(inode, inode->i_size - bytes);
31920+ /* In the case of fake prune we need to drop page cluster.
31921+ There are only 2 cases for partially truncated page:
31922+	   1. If it is dirty, then it is anonymous
31923+	   (it was dirtied via mmap), and will be captured
31924+	   later via ->capture().
31925+	   2. If it is clean, then it is filled with zeroes.
31926+ In both cases we don't need to make it dirty and
31927+ capture here.
31928+ */
31929+ truncate_inode_pages(inode->i_mapping, inode->i_size);
31930+ }
31931+ if (update_sd)
31932+ result = update_sd_cryptcompress(inode);
31933+ return result;
31934+}
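+/*
+ * Illustrative numbers for the fake-bytes logic above (64K clusters
+ * assumed): if the last real disk cluster is 1 (aidx == 2, real data
+ * ends below 131072) and i_size == 200000, then bytes [131072, 200000)
+ * exist only as i_size. Pruning to new_size == 150000 drops 50000 fake
+ * bytes by merely lowering i_size; no tree operation is needed.
+ */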
31935+
31936+/**
31937+ * This is called in setattr_cryptcompress when it is used to truncate,
31938+ * and in delete_object_cryptcompress
31939+ */
31940+static int cryptcompress_truncate(struct inode *inode, /* old size */
31941+ loff_t new_size, /* new size */
31942+ int update_sd)
31943+{
31944+ int result;
31945+ cloff_t aidx;
31946+
31947+ result = find_fake_appended(inode, &aidx);
31948+ if (result)
31949+ return result;
31950+ assert("edward-1208",
31951+ ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
31952+
31953+ result = start_truncate_fake(inode, aidx, new_size, update_sd);
31954+ if (result)
31955+ return result;
31956+ if (inode->i_size == new_size)
31957+ /* nothing to truncate anymore */
31958+ return 0;
31959+ result = (inode->i_size < new_size ?
31960+ cryptcompress_append_hole(inode, new_size) :
31961+ prune_cryptcompress(inode, new_size, update_sd, aidx));
31962+ if (!result && update_sd)
31963+ result = update_sd_cryptcompress(inode);
31964+ return result;
31965+}
31966+
31967+/* Capture an anonymous page cluster. (A page cluster is
31968+ * anonymous if it contains at least one anonymous page.)
31969+ */
31970+static int capture_anon_page_cluster(struct cluster_handle * clust,
31971+ struct inode * inode)
31972+{
31973+ int result;
31974+
31975+ assert("edward-1073", clust != NULL);
31976+ assert("edward-1074", inode != NULL);
31977+ assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
31978+
31979+ result = prepare_logical_cluster(inode, 0, 0, clust, LC_APPOV);
31980+ if (result)
31981+ return result;
31982+ set_cluster_pages_dirty(clust, inode);
31983+ result = checkin_logical_cluster(clust, inode);
31984+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
31985+ if (unlikely(result))
31986+ put_page_cluster(clust, inode, WRITE_OP);
31987+ return result;
31988+}
31989+
31990+/* Starting from @index find tagged pages of the same page cluster.
31991+ * Clear the tag for each of them. Return number of found pages.
31992+ */
31993+static int find_anon_page_cluster(struct address_space * mapping,
31994+ pgoff_t * index, struct page ** pages)
31995+{
31996+ int i = 0;
31997+ int found;
31998+ write_lock_irq(&mapping->tree_lock);
31999+ do {
32000+ /* looking for one page */
32001+ found = radix_tree_gang_lookup_tag(&mapping->page_tree,
32002+ (void **)&pages[i],
32003+ *index, 1,
32004+ PAGECACHE_TAG_REISER4_MOVED);
32005+ if (!found)
32006+ break;
32007+ if (!same_page_cluster(pages[0], pages[i]))
32008+ break;
32009+
32010+ /* found */
32011+ page_cache_get(pages[i]);
32012+ *index = pages[i]->index + 1;
32013+
32014+ radix_tree_tag_clear(&mapping->page_tree,
32015+ pages[i]->index,
32016+ PAGECACHE_TAG_REISER4_MOVED);
32017+ if (last_page_in_cluster(pages[i++]))
32018+ break;
32019+ } while (1);
32020+ write_unlock_irq(&mapping->tree_lock);
32021+ return i;
32022+}
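+/*
+ * A minimal sketch of the intended calling pattern (illustrative):
+ *
+ *	pgoff_t index = 0;
+ *	struct page *pages[MAX_CLUSTER_NRPAGES];
+ *	int found;
+ *
+ *	while ((found = find_anon_page_cluster(mapping, &index, pages))) {
+ *		pages[0 .. found-1] belong to one page cluster and are
+ *		referenced here; process them, then drop the references
+ *		with put_found_pages(pages, found);
+ *	}
+ *
+ * capture_anon_pages() below does exactly this, with the capturing
+ * step in between.
+ */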
32023+
32024+#define MAX_PAGES_TO_CAPTURE (1024)
32025+
32026+/* Capture anonymous page clusters */
32027+static int capture_anon_pages(struct address_space * mapping, pgoff_t * index,
32028+ int to_capture)
32029+{
32030+ int count = 0;
32031+ int found = 0;
32032+ int result = 0;
32033+ hint_t *hint;
32034+ lock_handle *lh;
32035+ struct inode * inode;
32036+ struct cluster_handle clust;
32037+ struct page * pages[MAX_CLUSTER_NRPAGES];
32038+
32039+ assert("edward-1127", mapping != NULL);
32040+ assert("edward-1128", mapping->host != NULL);
32041+ assert("edward-1440", mapping->host->i_mapping == mapping);
32042+
32043+ inode = mapping->host;
32044+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
32045+ if (hint == NULL)
32046+ return RETERR(-ENOMEM);
32047+ hint_init_zero(hint);
32048+ lh = &hint->lh;
32049+
32050+ cluster_init_read(&clust, NULL);
32051+ clust.hint = hint;
32052+
32053+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
32054+ if (result)
32055+ goto out;
32056+
32057+ while (to_capture > 0) {
32058+ found = find_anon_page_cluster(mapping, index, pages);
32059+ if (!found) {
32060+ *index = (pgoff_t) - 1;
32061+ break;
32062+ }
32063+ move_cluster_forward(&clust, inode, pages[0]->index);
32064+ result = capture_anon_page_cluster(&clust, inode);
32065+
32066+ put_found_pages(pages, found); /* find_anon_page_cluster */
32067+ if (result)
32068+ break;
32069+ to_capture -= clust.nr_pages;
32070+ count += clust.nr_pages;
32071+ }
32072+ if (result) {
32073+ warning("edward-1077",
32074+ "Capture failed (inode %llu, result=%i, captured=%d)\n",
32075+ (unsigned long long)get_inode_oid(inode), result, count);
32076+ } else {
32077+ assert("edward-1078", ergo(found > 0, count > 0));
32078+ if (to_capture <= 0)
32079+			/* there may be more pages left */
32080+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
32081+ result = count;
32082+ }
32083+ out:
32084+ done_lh(lh);
32085+ kfree(hint);
32086+ put_cluster_handle(&clust);
32087+ return result;
32088+}
32089+
32090+/* Returns true if inode's mapping has dirty pages
32091+ which do not belong to any atom */
32092+static int cryptcompress_inode_has_anon_pages(struct inode *inode)
32093+{
32094+ int result;
32095+ read_lock_irq(&inode->i_mapping->tree_lock);
32096+ result = radix_tree_tagged(&inode->i_mapping->page_tree,
32097+ PAGECACHE_TAG_REISER4_MOVED);
32098+ read_unlock_irq(&inode->i_mapping->tree_lock);
32099+ return result;
32100+}
32101+
32102+/* plugin->writepages */
32103+int writepages_cryptcompress(struct address_space *mapping,
32104+ struct writeback_control *wbc)
32105+{
32106+ int result = 0;
32107+ long to_capture;
32108+ pgoff_t nrpages;
32109+ pgoff_t index = 0;
32110+ struct inode *inode;
32111+ struct cryptcompress_info *info;
32112+
32113+ inode = mapping->host;
32114+ if (!cryptcompress_inode_has_anon_pages(inode))
32115+ goto end;
32116+ info = cryptcompress_inode_data(inode);
32117+ nrpages = size_in_pages(i_size_read(inode));
32118+
32119+ if (wbc->sync_mode != WB_SYNC_ALL)
32120+ to_capture = min(wbc->nr_to_write, (long)MAX_PAGES_TO_CAPTURE);
32121+ else
32122+ to_capture = MAX_PAGES_TO_CAPTURE;
32123+ do {
32124+ reiser4_context *ctx;
32125+
32126+ ctx = reiser4_init_context(inode->i_sb);
32127+ if (IS_ERR(ctx)) {
32128+ result = PTR_ERR(ctx);
32129+ break;
32130+ }
32131+ /* avoid recursive calls to ->sync_inodes */
32132+ ctx->nobalance = 1;
32133+
32134+ assert("edward-1079",
32135+ lock_stack_isclean(get_current_lock_stack()));
32136+
32137+ reiser4_txn_restart_current();
32138+
32139+ if (get_current_context()->entd) {
32140+ if (mutex_trylock(&info->checkin_mutex) == 0) {
32141+ /* the mutex might be occupied by
32142+ entd caller */
32143+ result = RETERR(-EBUSY);
32144+ reiser4_exit_context(ctx);
32145+ break;
32146+ }
32147+ } else
32148+ mutex_lock(&info->checkin_mutex);
32149+
32150+ result = capture_anon_pages(inode->i_mapping, &index,
32151+ to_capture);
32152+ mutex_unlock(&info->checkin_mutex);
32153+
32154+ if (result < 0) {
32155+ reiser4_exit_context(ctx);
32156+ break;
32157+ }
32158+ wbc->nr_to_write -= result;
32159+ if (wbc->sync_mode != WB_SYNC_ALL) {
32160+ reiser4_exit_context(ctx);
32161+ break;
32162+ }
32163+ result = txnmgr_force_commit_all(inode->i_sb, 0);
32164+ reiser4_exit_context(ctx);
32165+ } while (result >= 0 && index < nrpages);
32166+
32167+ end:
32168+ if (is_in_reiser4_context()) {
32169+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
32170+ /* there are already pages to flush, flush them out,
32171+ do not delay until end of reiser4_sync_inodes */
32172+ reiser4_writeout(inode->i_sb, wbc);
32173+ get_current_context()->nr_captured = 0;
32174+ }
32175+ }
32176+ return result;
32177+}
32178+
32179+/* plugin->ioctl */
32180+int ioctl_cryptcompress(struct inode *inode, struct file *filp,
32181+ unsigned int cmd, unsigned long arg)
32182+{
32183+ return RETERR(-ENOSYS);
32184+}
32185+
32186+/* plugin->mmap */
32187+int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
32188+{
32189+ int result;
32190+ struct inode *inode;
32191+ reiser4_context *ctx;
32192+
32193+ inode = file->f_dentry->d_inode;
32194+ ctx = reiser4_init_context(inode->i_sb);
32195+ if (IS_ERR(ctx))
32196+ return PTR_ERR(ctx);
32197+ /*
32198+ * generic_file_mmap will do update_atime. Grab space for stat data
32199+ * update.
32200+ */
32201+ result = reiser4_grab_space_force
32202+ (inode_file_plugin(inode)->estimate.update(inode),
32203+ BA_CAN_COMMIT);
32204+ if (result) {
32205+ reiser4_exit_context(ctx);
32206+ return result;
32207+ }
32208+ result = generic_file_mmap(file, vma);
32209+ reiser4_exit_context(ctx);
32210+ return result;
32211+}
32212+
32213+/* plugin->delete_object */
32214+int delete_object_cryptcompress(struct inode *inode)
32215+{
32216+ int result;
32217+ struct cryptcompress_info * info;
32218+
32219+ assert("edward-429", inode->i_nlink == 0);
32220+
32221+ reiser4_txn_restart_current();
32222+ info = cryptcompress_inode_data(inode);
32223+
32224+ mutex_lock(&info->checkin_mutex);
32225+ result = cryptcompress_truncate(inode, 0, 0);
32226+ mutex_unlock(&info->checkin_mutex);
32227+
32228+ if (result) {
32229+ warning("edward-430",
32230+ "cannot truncate cryptcompress file %lli: %i",
32231+ (unsigned long long)get_inode_oid(inode),
32232+ result);
32233+ }
32234+ truncate_inode_pages(inode->i_mapping, 0);
32235+ assert("edward-1487", pages_truncate_ok(inode, 0));
32236+ /* and remove stat data */
32237+ return reiser4_delete_object_common(inode);
32238+}
32239+
32240+/*
32241+ * plugin->setattr
32242+ * This implements actual truncate (see comments in reiser4/page_cache.c)
32243+ */
32244+int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
32245+{
32246+ int result;
32247+ struct inode *inode;
32248+ struct cryptcompress_info * info;
32249+
32250+ inode = dentry->d_inode;
32251+ info = cryptcompress_inode_data(inode);
32252+
32253+ if (attr->ia_valid & ATTR_SIZE) {
32254+ if (i_size_read(inode) != attr->ia_size) {
32255+ reiser4_context *ctx;
32256+ loff_t old_size;
32257+
32258+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
32259+ if (IS_ERR(ctx))
32260+ return PTR_ERR(ctx);
32261+ result = setattr_pschedule_hook(inode);
32262+ if (result) {
32263+ context_set_commit_async(ctx);
32264+ reiser4_exit_context(ctx);
32265+ return result;
32266+ }
32267+ old_size = i_size_read(inode);
32268+ inode_check_scale(inode, old_size, attr->ia_size);
32269+
32270+ mutex_lock(&info->checkin_mutex);
32271+ result = cryptcompress_truncate(inode,
32272+ attr->ia_size,
32273+ 1/* update sd */);
32274+ mutex_unlock(&info->checkin_mutex);
32275+ if (result) {
32276+ warning("edward-1192",
32277+ "truncate_cryptcompress failed: oid %lli, "
32278+ "old size %lld, new size %lld, retval %d",
32279+ (unsigned long long)
32280+ get_inode_oid(inode), old_size,
32281+ attr->ia_size, result);
32282+ }
32283+ context_set_commit_async(ctx);
32284+ reiser4_exit_context(ctx);
32285+ } else
32286+ result = 0;
32287+ } else
32288+ result = reiser4_setattr_common(dentry, attr);
32289+ return result;
32290+}
32291+
32292+/* plugin->release */
32293+int release_cryptcompress(struct inode *inode, struct file *file)
32294+{
32295+ reiser4_context *ctx = reiser4_init_context(inode->i_sb);
32296+
32297+ if (IS_ERR(ctx))
32298+ return PTR_ERR(ctx);
32299+ reiser4_free_file_fsdata(file);
32300+ reiser4_exit_context(ctx);
32301+ return 0;
32302+}
32303+
32304+/* plugin->prepare_write */
32305+int prepare_write_cryptcompress(struct file *file, struct page *page,
32306+ unsigned from, unsigned to)
32307+{
32308+ return -EINVAL;
32309+}
32310+
32311+/* plugin->commit_write */
32312+int commit_write_cryptcompress(struct file *file, struct page *page,
32313+ unsigned from, unsigned to)
32314+{
32315+ BUG();
32316+ return 0;
32317+}
32318+
32319+/* plugin->bmap */
32320+sector_t bmap_cryptcompress(struct address_space *mapping, sector_t lblock)
32321+{
32322+ return -EINVAL;
32323+}
32324+
32325+/*
32326+ Local variables:
32327+ c-indentation-style: "K&R"
32328+ mode-name: "LC"
32329+ c-basic-offset: 8
32330+ tab-width: 8
32331+ fill-column: 80
32332+ scroll-step: 1
32333+ End:
32334+*/
32335diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/cryptcompress.h linux-2.6.24/fs/reiser4/plugin/file/cryptcompress.h
32336--- linux-2.6.24.orig/fs/reiser4/plugin/file/cryptcompress.h 1970-01-01 03:00:00.000000000 +0300
32337+++ linux-2.6.24/fs/reiser4/plugin/file/cryptcompress.h 2008-01-25 11:39:06.980219023 +0300
32338@@ -0,0 +1,616 @@
32339+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
32340+/* See http://www.namesys.com/cryptcompress_design.html */
32341+
32342+#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
32343+#define __FS_REISER4_CRYPTCOMPRESS_H__
32344+
32345+#include "../../page_cache.h"
32346+#include "../compress/compress.h"
32347+#include "../crypto/cipher.h"
32348+
32349+#include <linux/pagemap.h>
32350+
32351+#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT
32352+#define MAX_CLUSTER_SHIFT 16
32353+#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT)
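+/* with 4K pages (PAGE_CACHE_SHIFT == 12) this comes to
+ * 1U << 16 >> 12 == 16 pages per maximal 64K logical cluster */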
32354+#define DC_CHECKSUM_SIZE 4
32355+
32356+#define MIN_LATTICE_FACTOR 1
32357+#define MAX_LATTICE_FACTOR 32
32358+
32359+/* this mask contains all non-standard plugins that might
32360+ be present in reiser4-specific part of inode managed by
32361+ cryptcompress file plugin */
32362+#define cryptcompress_mask \
32363+ ((1 << PSET_FILE) | \
32364+ (1 << PSET_CLUSTER) | \
32365+ (1 << PSET_CIPHER) | \
32366+ (1 << PSET_DIGEST) | \
32367+ (1 << PSET_COMPRESSION) | \
32368+ (1 << PSET_COMPRESSION_MODE))
32369+
32370+#if REISER4_DEBUG
32371+static inline int cluster_shift_ok(int shift)
32372+{
32373+ return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
32374+}
32375+#endif
32376+
32377+#if REISER4_DEBUG
32378+#define INODE_PGCOUNT(inode) \
32379+({ \
32380+ assert("edward-1530", inode_file_plugin(inode) == \
32381+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \
32382+ atomic_read(&cryptcompress_inode_data(inode)->pgcount); \
32383+ })
32384+#define INODE_PGCOUNT_INC(inode) \
32385+do { \
32386+ assert("edward-1531", inode_file_plugin(inode) == \
32387+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \
32388+ atomic_inc(&cryptcompress_inode_data(inode)->pgcount); \
32389+} while (0)
32390+#define INODE_PGCOUNT_DEC(inode) \
32391+do { \
32392+ if (inode_file_plugin(inode) == \
32393+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) \
32394+ atomic_dec(&cryptcompress_inode_data(inode)->pgcount); \
32395+} while (0)
32396+#else
32397+#define INODE_PGCOUNT(inode) (0)
32398+#define INODE_PGCOUNT_INC(inode)
32399+#define INODE_PGCOUNT_DEC(inode)
32400+#endif /* REISER4_DEBUG */
32401+
32402+struct tfm_stream {
32403+ __u8 *data;
32404+ size_t size;
32405+};
32406+
32407+typedef enum {
32408+ INPUT_STREAM,
32409+ OUTPUT_STREAM,
32410+ LAST_STREAM
32411+} tfm_stream_id;
32412+
32413+typedef struct tfm_stream * tfm_unit[LAST_STREAM];
32414+
32415+static inline __u8 *ts_data(struct tfm_stream * stm)
32416+{
32417+ assert("edward-928", stm != NULL);
32418+ return stm->data;
32419+}
32420+
32421+static inline size_t ts_size(struct tfm_stream * stm)
32422+{
32423+ assert("edward-929", stm != NULL);
32424+ return stm->size;
32425+}
32426+
32427+static inline void set_ts_size(struct tfm_stream * stm, size_t size)
32428+{
32429+ assert("edward-930", stm != NULL);
32430+
32431+ stm->size = size;
32432+}
32433+
32434+static inline int alloc_ts(struct tfm_stream ** stm)
32435+{
32436+ assert("edward-931", stm);
32437+ assert("edward-932", *stm == NULL);
32438+
32439+ *stm = kzalloc(sizeof(**stm), reiser4_ctx_gfp_mask_get());
32440+ if (!*stm)
32441+ return -ENOMEM;
32442+ return 0;
32443+}
32444+
32445+static inline void free_ts(struct tfm_stream * stm)
32446+{
32447+ assert("edward-933", !ts_data(stm));
32448+ assert("edward-934", !ts_size(stm));
32449+
32450+ kfree(stm);
32451+}
32452+
32453+static inline int alloc_ts_data(struct tfm_stream * stm, size_t size)
32454+{
32455+ assert("edward-935", !ts_data(stm));
32456+ assert("edward-936", !ts_size(stm));
32457+ assert("edward-937", size != 0);
32458+
32459+ stm->data = reiser4_vmalloc(size);
32460+ if (!stm->data)
32461+ return -ENOMEM;
32462+ set_ts_size(stm, size);
32463+ return 0;
32464+}
32465+
32466+static inline void free_ts_data(struct tfm_stream * stm)
32467+{
32468+ assert("edward-938", equi(ts_data(stm), ts_size(stm)));
32469+
32470+ if (ts_data(stm))
32471+ vfree(ts_data(stm));
32472+ memset(stm, 0, sizeof *stm);
32473+}
32474+
32475+/* Write modes for item conversion in flush convert phase */
32476+typedef enum {
32477+ CRC_APPEND_ITEM = 1,
32478+ CRC_OVERWRITE_ITEM = 2,
32479+ CRC_CUT_ITEM = 3
32480+} cryptcompress_write_mode_t;
32481+
32482+typedef enum {
32483+ LC_INVAL = 0, /* invalid value */
32484+ LC_APPOV = 1, /* append and/or overwrite */
32485+ LC_TRUNC = 2 /* truncate */
32486+} logical_cluster_op;
32487+
32488+/* Transform cluster.
32489+ * An intermediate state between page cluster and disk cluster.
32490+ * It is used for data transforms (compression/encryption).
32491+ */
32492+struct tfm_cluster {
32493+ coa_set coa; /* compression algorithms info */
32494+ tfm_unit tun; /* plain and transformed streams */
32495+ tfm_action act;
32496+ int uptodate;
32497+ int lsize; /* number of bytes in logical cluster */
32498+ int len; /* length of the transform stream */
32499+};
32500+
32501+static inline coa_t get_coa(struct tfm_cluster * tc, reiser4_compression_id id,
32502+ tfm_action act)
32503+{
32504+ return tc->coa[id][act];
32505+}
32506+
32507+static inline void set_coa(struct tfm_cluster * tc, reiser4_compression_id id,
32508+ tfm_action act, coa_t coa)
32509+{
32510+ tc->coa[id][act] = coa;
32511+}
32512+
32513+static inline int alloc_coa(struct tfm_cluster * tc, compression_plugin * cplug)
32514+{
32515+ coa_t coa;
32516+
32517+ coa = cplug->alloc(tc->act);
32518+ if (IS_ERR(coa))
32519+ return PTR_ERR(coa);
32520+ set_coa(tc, cplug->h.id, tc->act, coa);
32521+ return 0;
32522+}
32523+
32524+static inline int
32525+grab_coa(struct tfm_cluster * tc, compression_plugin * cplug)
32526+{
32527+ return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
32528+ alloc_coa(tc, cplug) : 0);
32529+}
32530+
32531+static inline void free_coa_set(struct tfm_cluster * tc)
32532+{
32533+ tfm_action j;
32534+ reiser4_compression_id i;
32535+ compression_plugin *cplug;
32536+
32537+ assert("edward-810", tc != NULL);
32538+
32539+ for (j = 0; j < TFMA_LAST; j++)
32540+ for (i = 0; i < LAST_COMPRESSION_ID; i++) {
32541+ if (!get_coa(tc, i, j))
32542+ continue;
32543+ cplug = compression_plugin_by_id(i);
32544+ assert("edward-812", cplug->free != NULL);
32545+ cplug->free(get_coa(tc, i, j), j);
32546+ set_coa(tc, i, j, 0);
32547+ }
32548+ return;
32549+}
32550+
32551+static inline struct tfm_stream * get_tfm_stream(struct tfm_cluster * tc,
32552+ tfm_stream_id id)
32553+{
32554+ return tc->tun[id];
32555+}
32556+
32557+static inline void set_tfm_stream(struct tfm_cluster * tc,
32558+ tfm_stream_id id, struct tfm_stream * ts)
32559+{
32560+ tc->tun[id] = ts;
32561+}
32562+
32563+static inline __u8 *tfm_stream_data(struct tfm_cluster * tc, tfm_stream_id id)
32564+{
32565+ return ts_data(get_tfm_stream(tc, id));
32566+}
32567+
32568+static inline void set_tfm_stream_data(struct tfm_cluster * tc,
32569+ tfm_stream_id id, __u8 * data)
32570+{
32571+ get_tfm_stream(tc, id)->data = data;
32572+}
32573+
32574+static inline size_t tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id)
32575+{
32576+ return ts_size(get_tfm_stream(tc, id));
32577+}
32578+
32579+static inline void
32580+set_tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id, size_t size)
32581+{
32582+ get_tfm_stream(tc, id)->size = size;
32583+}
32584+
32585+static inline int
32586+alloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
32587+{
32588+ assert("edward-939", tc != NULL);
32589+ assert("edward-940", !get_tfm_stream(tc, id));
32590+
32591+ tc->tun[id] = kzalloc(sizeof(struct tfm_stream),
32592+ reiser4_ctx_gfp_mask_get());
32593+ if (!tc->tun[id])
32594+ return -ENOMEM;
32595+ return alloc_ts_data(get_tfm_stream(tc, id), size);
32596+}
32597+
32598+static inline int
32599+realloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
32600+{
32601+ assert("edward-941", tfm_stream_size(tc, id) < size);
32602+ free_ts_data(get_tfm_stream(tc, id));
32603+ return alloc_ts_data(get_tfm_stream(tc, id), size);
32604+}
32605+
32606+static inline void free_tfm_stream(struct tfm_cluster * tc, tfm_stream_id id)
32607+{
32608+ free_ts_data(get_tfm_stream(tc, id));
32609+ free_ts(get_tfm_stream(tc, id));
32610+ set_tfm_stream(tc, id, 0);
32611+}
32612+
32613+static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
32614+{
32615+ return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
32616+}
32617+
32618+static inline void free_tfm_unit(struct tfm_cluster * tc)
32619+{
32620+ tfm_stream_id id;
32621+ for (id = 0; id < LAST_STREAM; id++) {
32622+ if (!get_tfm_stream(tc, id))
32623+ continue;
32624+ free_tfm_stream(tc, id);
32625+ }
32626+}
32627+
32628+static inline void put_tfm_cluster(struct tfm_cluster * tc)
32629+{
32630+ assert("edward-942", tc != NULL);
32631+ free_coa_set(tc);
32632+ free_tfm_unit(tc);
32633+}
32634+
32635+static inline int tfm_cluster_is_uptodate(struct tfm_cluster * tc)
32636+{
32637+ assert("edward-943", tc != NULL);
32638+ assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
32639+ return (tc->uptodate == 1);
32640+}
32641+
32642+static inline void tfm_cluster_set_uptodate(struct tfm_cluster * tc)
32643+{
32644+ assert("edward-945", tc != NULL);
32645+ assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
32646+ tc->uptodate = 1;
32647+ return;
32648+}
32649+
32650+static inline void tfm_cluster_clr_uptodate(struct tfm_cluster * tc)
32651+{
32652+ assert("edward-947", tc != NULL);
32653+ assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
32654+ tc->uptodate = 0;
32655+ return;
32656+}
32657+
32658+static inline int tfm_stream_is_set(struct tfm_cluster * tc, tfm_stream_id id)
32659+{
32660+ return (get_tfm_stream(tc, id) &&
32661+ tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
32662+}
32663+
32664+static inline int tfm_cluster_is_set(struct tfm_cluster * tc)
32665+{
32666+ int i;
32667+ for (i = 0; i < LAST_STREAM; i++)
32668+ if (!tfm_stream_is_set(tc, i))
32669+ return 0;
32670+ return 1;
32671+}
32672+
32673+static inline void alternate_streams(struct tfm_cluster * tc)
32674+{
32675+ struct tfm_stream *tmp = get_tfm_stream(tc, INPUT_STREAM);
32676+
32677+ set_tfm_stream(tc, INPUT_STREAM, get_tfm_stream(tc, OUTPUT_STREAM));
32678+ set_tfm_stream(tc, OUTPUT_STREAM, tmp);
32679+}
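+/*
+ * A hedged sketch of why the swap above is useful: multi-stage
+ * transforms feed the output of one stage to the input of the next,
+ * e.g.
+ *
+ *	compress:	INPUT (plain)      -> OUTPUT (compressed)
+ *	alternate_streams(tc);
+ *	encrypt:	INPUT (compressed) -> OUTPUT (ciphertext)
+ *
+ * so no intermediate buffer is needed between the stages.
+ */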
32680+
32681+/* Set of states to indicate a kind of data
32682+ * that will be written to the window */
32683+typedef enum {
32684+ DATA_WINDOW, /* user's data */
32685+	HOLE_WINDOW  /* zeroes (this kind of data is written
32686+		      * when writing starts from an offset > i_size) */
32687+} window_stat;
32688+
32689+/* Window (of logical cluster size) discretely sliding along a file.
32690+ * It is used to locate a hole region in a logical cluster, so that the
32691+ * hole can be properly represented on disk.
32692+ * We split a write to cryptcompress file into writes to its logical
32693+ * clusters. Before writing to a logical cluster we set a window, i.e.
32694+ * calculate values of the following fields:
32695+ */
32696+struct reiser4_slide {
32697+ unsigned off; /* offset to write from */
32698+ unsigned count; /* number of bytes to write */
32699+ unsigned delta; /* number of bytes to append to the hole */
32700+ window_stat stat; /* what kind of data will be written starting
32701+ from @off */
32702+};
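+/*
+ * Example values (illustrative; 64K logical cluster assumed): writing
+ * 100 bytes at file offset 66000 into cluster 1 sets roughly
+ *
+ *	win.off   == 66000 - 65536 == 464
+ *	win.count == 100
+ *	win.delta == 0
+ *	win.stat  == DATA_WINDOW
+ *
+ * whereas appending past i_size first uses a HOLE_WINDOW that covers
+ * the zeroed gap (see cryptcompress_append_hole()).
+ */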
32703+
32704+/* Possible states of a disk cluster */
32705+typedef enum {
32706+ INVAL_DISK_CLUSTER, /* unknown state */
32707+ PREP_DISK_CLUSTER, /* disk cluster got converted by flush
32708+ * at least 1 time */
32709+ UNPR_DISK_CLUSTER, /* disk cluster just created and should be
32710+ * converted by flush */
32711+ FAKE_DISK_CLUSTER, /* disk cluster doesn't exist neither in memory
32712+ * nor on disk */
32713+ TRNC_DISK_CLUSTER /* disk cluster is partially truncated */
32714+} disk_cluster_stat;
32715+
32716+/* The following structure represents various stages of the same logical
32717+ * cluster of index @index:
32718+ * . fixed slide
32719+ * . page cluster (stage in primary cache)
32720+ * . transform cluster (transition stage)
32721+ * . disk cluster (stage in secondary cache)
32722+ * This structure is used in transition and synchronizing operations, e.g.
32723+ * transform cluster is a transition state when synchronizing page cluster
32724+ * and disk cluster.
32725+ * FIXME: Encapsulate page cluster, disk cluster.
32726+ */
32727+struct cluster_handle {
32728+ cloff_t index; /* offset in a file (unit is a cluster size) */
32729+ int index_valid; /* for validating the index above, if needed */
32730+ struct file *file; /* host file */
32731+
32732+ /* logical cluster */
32733+ struct reiser4_slide *win; /* sliding window to locate holes */
32734+ logical_cluster_op op; /* logical cluster operation (truncate or
32735+ append/overwrite) */
32736+ /* transform cluster */
32737+ struct tfm_cluster tc; /* contains all needed info to synchronize
32738+				   page cluster and disk cluster */
32739+ /* page cluster */
32740+ int nr_pages; /* number of pages of current checkin action */
32741+ int old_nrpages; /* number of pages of last checkin action */
32742+ struct page **pages; /* attached pages */
32743+ jnode * node; /* jnode for capture */
32744+
32745+ /* disk cluster */
32746+ hint_t *hint; /* current position in the tree */
32747+ disk_cluster_stat dstat; /* state of the current disk cluster */
32748+ int reserved; /* is space for disk cluster reserved */
32749+#if REISER4_DEBUG
32750+ reiser4_context *ctx;
32751+ int reserved_prepped;
32752+ int reserved_unprepped;
32753+#endif
32754+
32755+};
32756+
32757+static inline __u8 * tfm_input_data (struct cluster_handle * clust)
32758+{
32759+ return tfm_stream_data(&clust->tc, INPUT_STREAM);
32760+}
32761+
32762+static inline __u8 * tfm_output_data (struct cluster_handle * clust)
32763+{
32764+ return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
32765+}
32766+
32767+static inline int reset_cluster_pgset(struct cluster_handle * clust,
32768+ int nrpages)
32769+{
32770+ assert("edward-1057", clust->pages != NULL);
32771+ memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
32772+ return 0;
32773+}
32774+
32775+static inline int alloc_cluster_pgset(struct cluster_handle * clust,
32776+ int nrpages)
32777+{
32778+ assert("edward-949", clust != NULL);
32779+ assert("edward-1362", clust->pages == NULL);
32780+ assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
32781+
32782+ clust->pages = kzalloc(sizeof(*clust->pages) * nrpages,
32783+ reiser4_ctx_gfp_mask_get());
32784+ if (!clust->pages)
32785+ return RETERR(-ENOMEM);
32786+ return 0;
32787+}
32788+
32789+static inline void move_cluster_pgset(struct cluster_handle *clust,
32790+ struct page ***pages, int * nr_pages)
32791+{
32792+ assert("edward-1545", clust != NULL && clust->pages != NULL);
32793+ assert("edward-1546", pages != NULL && *pages == NULL);
32794+ *pages = clust->pages;
32795+ *nr_pages = clust->nr_pages;
32796+ clust->pages = NULL;
32797+}
32798+
32799+static inline void free_cluster_pgset(struct cluster_handle * clust)
32800+{
32801+ assert("edward-951", clust->pages != NULL);
32802+ kfree(clust->pages);
32803+ clust->pages = NULL;
32804+}
32805+
32806+static inline void put_cluster_handle(struct cluster_handle * clust)
32807+{
32808+ assert("edward-435", clust != NULL);
32809+
32810+ put_tfm_cluster(&clust->tc);
32811+ if (clust->pages)
32812+ free_cluster_pgset(clust);
32813+ memset(clust, 0, sizeof *clust);
32814+}
32815+
32816+static inline void inc_keyload_count(struct reiser4_crypto_info * data)
32817+{
32818+ assert("edward-1410", data != NULL);
32819+ data->keyload_count++;
32820+}
32821+
32822+static inline void dec_keyload_count(struct reiser4_crypto_info * data)
32823+{
32824+ assert("edward-1411", data != NULL);
32825+ assert("edward-1412", data->keyload_count > 0);
32826+ data->keyload_count--;
32827+}
32828+
32829+static inline int capture_cluster_jnode(jnode * node)
32830+{
32831+ return reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
32832+}
32833+
32834+/* cryptcompress specific part of reiser4_inode */
32835+struct cryptcompress_info {
32836+ struct mutex checkin_mutex; /* This is to serialize
32837+ * checkin_logical_cluster operations */
32838+ cloff_t trunc_index; /* Index of the leftmost truncated disk
32839+ * cluster (to resolve races with read) */
32840+ struct reiser4_crypto_info *crypt;
32841+ /*
32842+ * the following 2 fields are controlled by compression mode plugin
32843+ */
32844+ int compress_toggle; /* Current status of compressibility */
32845+ int lattice_factor; /* Factor of dynamic lattice. FIXME: Have
32846+ * a compression_toggle to keep the factor
32847+ */
32848+#if REISER4_DEBUG
32849+ atomic_t pgcount; /* number of grabbed pages */
32850+#endif
32851+};
32852+
32853+static inline void set_compression_toggle (struct cryptcompress_info * info, int val)
32854+{
32855+ info->compress_toggle = val;
32856+}
32857+
32858+static inline int get_compression_toggle (struct cryptcompress_info * info)
32859+{
32860+ return info->compress_toggle;
32861+}
32862+
32863+static inline int compression_is_on(struct cryptcompress_info * info)
32864+{
32865+ return get_compression_toggle(info) == 1;
32866+}
32867+
32868+static inline void turn_on_compression(struct cryptcompress_info * info)
32869+{
32870+ set_compression_toggle(info, 1);
32871+}
32872+
32873+static inline void turn_off_compression(struct cryptcompress_info * info)
32874+{
32875+ set_compression_toggle(info, 0);
32876+}
32877+
32878+static inline void set_lattice_factor(struct cryptcompress_info * info, int val)
32879+{
32880+ info->lattice_factor = val;
32881+}
32882+
32883+static inline int get_lattice_factor(struct cryptcompress_info * info)
32884+{
32885+ return info->lattice_factor;
32886+}
32887+
32888+struct cryptcompress_info *cryptcompress_inode_data(const struct inode *);
32889+int equal_to_rdk(znode *, const reiser4_key *);
32890+int goto_right_neighbor(coord_t *, lock_handle *);
32891+int cryptcompress_inode_ok(struct inode *inode);
32892+int coord_is_unprepped_ctail(const coord_t * coord);
32893+extern int do_readpage_ctail(struct inode *, struct cluster_handle *,
32894+ struct page * page, znode_lock_mode mode);
32895+extern int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
32896+ struct inode * inode);
32897+extern int readpages_cryptcompress(struct file*, struct address_space*,
32898+ struct list_head*, unsigned);
32899+int bind_cryptcompress(struct inode *child, struct inode *parent);
32900+void destroy_inode_cryptcompress(struct inode * inode);
32901+int grab_page_cluster(struct inode *inode, struct cluster_handle * clust,
32902+ rw_op rw);
32903+int write_pschedule_hook(struct file *file, struct inode * inode,
32904+ loff_t pos, struct cluster_handle * clust,
32905+ struct psched_context * cont);
32906+int setattr_pschedule_hook(struct inode * inode);
32907+struct reiser4_crypto_info * inode_crypto_info(struct inode * inode);
32908+void inherit_crypto_info_common(struct inode * parent, struct inode * object,
32909+ int (*can_inherit)(struct inode * child,
32910+ struct inode * parent));
32911+void reiser4_attach_crypto_info(struct inode * inode,
32912+ struct reiser4_crypto_info * info);
32913+void change_crypto_info(struct inode * inode, struct reiser4_crypto_info * new);
32914+struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode);
32915+
32916+static inline struct crypto_blkcipher * info_get_cipher(struct reiser4_crypto_info * info)
32917+{
32918+ return info->cipher;
32919+}
32920+
32921+static inline void info_set_cipher(struct reiser4_crypto_info * info,
32922+ struct crypto_blkcipher * tfm)
32923+{
32924+ info->cipher = tfm;
32925+}
32926+
32927+static inline struct crypto_hash * info_get_digest(struct reiser4_crypto_info * info)
32928+{
32929+ return info->digest;
32930+}
32931+
32932+static inline void info_set_digest(struct reiser4_crypto_info * info,
32933+ struct crypto_hash * tfm)
32934+{
32935+ info->digest = tfm;
32936+}
32937+
32938+static inline void put_cluster_page(struct page * page)
32939+{
32940+ page_cache_release(page);
32941+}
32942+
32943+#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
32944+
32945+/* Make Linus happy.
32946+ Local variables:
32947+ c-indentation-style: "K&R"
32948+ mode-name: "LC"
32949+ c-basic-offset: 8
32950+ tab-width: 8
32951+ fill-column: 120
32952+ scroll-step: 1
32953+ End:
32954+*/
32955diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/file.c linux-2.6.24/fs/reiser4/plugin/file/file.c
32956--- linux-2.6.24.orig/fs/reiser4/plugin/file/file.c 1970-01-01 03:00:00.000000000 +0300
32957+++ linux-2.6.24/fs/reiser4/plugin/file/file.c 2008-01-25 11:40:16.694168755 +0300
32958@@ -0,0 +1,2724 @@
32959+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
32960+ * reiser4/README */
32961+
32962+/*
32963+ * this file contains implementations of inode/file/address_space/file plugin
32964+ * operations specific for "unix file plugin" (plugin id is
32965+ * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
32966+ * (FORMATTING_ID), or of extent items only (EXTENT_POINTER_ID), or is empty
32967+ * (has no items but stat data)
32968+ */
32969+
32970+#include "../../inode.h"
32971+#include "../../super.h"
32972+#include "../../tree_walk.h"
32973+#include "../../carry.h"
32974+#include "../../page_cache.h"
32975+#include "../../ioctl.h"
32976+#include "../object.h"
32977+#include "../cluster.h"
32978+#include "../../safe_link.h"
32979+
32980+#include <linux/writeback.h>
32981+#include <linux/pagevec.h>
32982+#include <linux/syscalls.h>
32983+
32984+
32985+static int unpack(struct file *file, struct inode *inode, int forever);
32986+static void drop_access(struct unix_file_info *);
32987+static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
32988+ znode_lock_mode lock_mode);
32989+
32990+/* Get exclusive access and make sure that the file is not partially
32991+ * converted (it may happen that another process is doing tail
32992+ * conversion; if so, wait until it completes).
32993+ */
32994+static inline void get_exclusive_access_careful(struct unix_file_info * uf_info,
32995+ struct inode *inode)
32996+{
32997+ do {
32998+ get_exclusive_access(uf_info);
32999+ if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))
33000+ break;
33001+ drop_exclusive_access(uf_info);
33002+ schedule();
33003+ } while (1);
33004+}
33005+
33006+/* get unix file plugin specific portion of inode */
33007+struct unix_file_info *unix_file_inode_data(const struct inode *inode)
33008+{
33009+ return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
33010+}
33011+
33012+/**
33013+ * equal_to_rdk - compare key and znode's right delimiting key
33014+ * @node: node whose right delimiting key to compare with @key
33015+ * @key: key to compare with @node's right delimiting key
33016+ *
33017+ * Returns true if @key is equal to right delimiting key of @node.
33018+ */
33019+int equal_to_rdk(znode *node, const reiser4_key *key)
33020+{
33021+ int result;
33022+
33023+ read_lock_dk(znode_get_tree(node));
33024+ result = keyeq(key, znode_get_rd_key(node));
33025+ read_unlock_dk(znode_get_tree(node));
33026+ return result;
33027+}
33028+
33029+#if REISER4_DEBUG
33030+
33031+/**
33032+ * equal_to_ldk - compare key and znode's left delimiting key
33033+ * @node: node whose left delimiting key to compare with @key
33034+ * @key: key to compare with @node's left delimiting key
33035+ *
33036+ * Returns true if @key is equal to left delimiting key of @node.
33037+ */
33038+int equal_to_ldk(znode *node, const reiser4_key *key)
33039+{
33040+ int result;
33041+
33042+ read_lock_dk(znode_get_tree(node));
33043+ result = keyeq(key, znode_get_ld_key(node));
33044+ read_unlock_dk(znode_get_tree(node));
33045+ return result;
33046+}
33047+
33048+/**
33049+ * check_coord - check whether coord corresponds to key
33050+ * @coord: coord to check
33051+ * @key: key @coord has to correspond to
33052+ *
33053+ * Returns true if @coord is set as if it was set as result of lookup with @key
33054+ * in coord->node.
33055+ */
33056+static int check_coord(const coord_t *coord, const reiser4_key *key)
33057+{
33058+ coord_t twin;
33059+
33060+ node_plugin_by_node(coord->node)->lookup(coord->node, key,
33061+ FIND_MAX_NOT_MORE_THAN, &twin);
33062+ return coords_equal(coord, &twin);
33063+}
33064+
33065+#endif /* REISER4_DEBUG */
33066+
33067+/**
33068+ * init_uf_coord - initialize extended coord
33069+ * @uf_coord: extended coordinate to initialize
33070+ * @lh: lock handle to attach to @uf_coord
33071+ *
33072+ * Zeroes the coordinate, attaches @lh and marks the extension invalid.
33073+ */
33074+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
33075+{
33076+ coord_init_zero(&uf_coord->coord);
33077+ coord_clear_iplug(&uf_coord->coord);
33078+ uf_coord->lh = lh;
33079+ init_lh(lh);
33080+ memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
33081+ uf_coord->valid = 0;
33082+}
33083+
33084+static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
33085+{
33086+ assert("vs-1333", uf_coord->valid == 0);
33087+
33088+ if (coord_is_between_items(&uf_coord->coord))
33089+ return;
33090+
33091+ assert("vs-1348",
33092+ item_plugin_by_coord(&uf_coord->coord)->s.file.
33093+ init_coord_extension);
33094+
33095+ item_body_by_coord(&uf_coord->coord);
33096+ item_plugin_by_coord(&uf_coord->coord)->s.file.
33097+ init_coord_extension(uf_coord, offset);
33098+}
33099+
33100+/**
33101+ * goto_right_neighbor - lock right neighbor, drop current node lock
33102+ * @coord: coordinate in the current node
33103+ * @lh: lock handle on the current node
33104+ *
33105+ * Obtain lock on right neighbor and drop lock on current node.
33106+ */
33107+int goto_right_neighbor(coord_t *coord, lock_handle *lh)
33108+{
33109+ int result;
33110+ lock_handle lh_right;
33111+
33112+ assert("vs-1100", znode_is_locked(coord->node));
33113+
33114+ init_lh(&lh_right);
33115+ result = reiser4_get_right_neighbor(&lh_right, coord->node,
33116+ znode_is_wlocked(coord->node) ?
33117+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
33118+ GN_CAN_USE_UPPER_LEVELS);
33119+ if (result) {
33120+ done_lh(&lh_right);
33121+ return result;
33122+ }
33123+
33124+ /*
33125+ * we hold two longterm locks on neighboring nodes. Unlock left of
33126+ * them
33127+ */
33128+ done_lh(lh);
33129+
33130+ coord_init_first_unit_nocheck(coord, lh_right.node);
33131+ move_lh(lh, &lh_right);
33132+
33133+ return 0;
33134+
33135+}
33136+
33137+/**
33138+ * set_file_state - deduce file container state from a lookup result
33139+ * @uf_info: unix file plugin specific part of the inode
33140+ * @cbk_result: result of the tree lookup
33141+ * @level: tree level the lookup stopped at
33142+ *
33143+ * This is used by find_file_item and find_file_state to
33144+ * determine the real state of the file
33145+ */
33146+static void set_file_state(struct unix_file_info *uf_info, int cbk_result,
33147+ tree_level level)
33148+{
33149+ if (cbk_errored(cbk_result))
33150+ /* error happened in find_file_item */
33151+ return;
33152+
33153+ assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
33154+
33155+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
33156+ if (cbk_result == CBK_COORD_NOTFOUND)
33157+ uf_info->container = UF_CONTAINER_EMPTY;
33158+ else if (level == LEAF_LEVEL)
33159+ uf_info->container = UF_CONTAINER_TAILS;
33160+ else
33161+ uf_info->container = UF_CONTAINER_EXTENTS;
33162+ } else {
33163+ /*
33164+ * file state is known, check whether it is set correctly if
33165+ * file is not being tail converted
33166+ */
33167+ if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
33168+ REISER4_PART_IN_CONV)) {
33169+ assert("vs-1162",
33170+ ergo(level == LEAF_LEVEL &&
33171+ cbk_result == CBK_COORD_FOUND,
33172+ uf_info->container == UF_CONTAINER_TAILS));
33173+ assert("vs-1165",
33174+ ergo(level == TWIG_LEVEL &&
33175+ cbk_result == CBK_COORD_FOUND,
33176+ uf_info->container == UF_CONTAINER_EXTENTS));
33177+ }
33178+ }
33179+}
33180+
33181+int find_file_item_nohint(coord_t *coord, lock_handle *lh,
33182+ const reiser4_key *key, znode_lock_mode lock_mode,
33183+ struct inode *inode)
33184+{
33185+ return reiser4_object_lookup(inode, key, coord, lh, lock_mode,
33186+ FIND_MAX_NOT_MORE_THAN,
33187+ TWIG_LEVEL, LEAF_LEVEL,
33188+ (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
33189+ (CBK_UNIQUE | CBK_FOR_INSERT),
33190+ NULL /* ra_info */ );
33191+}
33192+
33193+/**
33194+ * find_file_item - look for file item in the tree
33195+ * @hint: provides coordinate, lock handle, seal
33196+ * @key: key for search
33197+ * @lock_mode: mode of lock to put on returned node
33198+ * @inode: inode of the file being looked up
33200+ *
33201+ * This finds position in the tree corresponding to @key. It first tries to use
33202+ * @hint's seal if it is set.
33203+ */
33204+int find_file_item(hint_t *hint, const reiser4_key *key,
33205+ znode_lock_mode lock_mode,
33206+ struct inode *inode)
33207+{
33208+ int result;
33209+ coord_t *coord;
33210+ lock_handle *lh;
33211+
33212+ assert("nikita-3030", reiser4_schedulable());
33213+ assert("vs-1707", hint != NULL);
33214+ assert("vs-47", inode != NULL);
33215+
33216+ coord = &hint->ext_coord.coord;
33217+ lh = hint->ext_coord.lh;
33218+ init_lh(lh);
33219+
33220+ result = hint_validate(hint, key, 1 /* check key */, lock_mode);
33221+ if (!result) {
33222+ if (coord->between == AFTER_UNIT &&
33223+ equal_to_rdk(coord->node, key)) {
33224+ result = goto_right_neighbor(coord, lh);
33225+ if (result == -E_NO_NEIGHBOR)
33226+ return RETERR(-EIO);
33227+ if (result)
33228+ return result;
33229+ assert("vs-1152", equal_to_ldk(coord->node, key));
33230+ /*
33231+ * we moved to different node. Invalidate coord
33232+ * extension, zload is necessary to init it again
33233+ */
33234+ hint->ext_coord.valid = 0;
33235+ }
33236+
33237+ set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
33238+ znode_get_level(coord->node));
33239+
33240+ return CBK_COORD_FOUND;
33241+ }
33242+
33243+ coord_init_zero(coord);
33244+ result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
33245+ set_file_state(unix_file_inode_data(inode), result,
33246+ znode_get_level(coord->node));
33247+
33248+ /* FIXME: we might already have coord extension initialized */
33249+ hint->ext_coord.valid = 0;
33250+ return result;
33251+}
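+/*
+ * The flow above, roughly: if hint_validate() succeeds, the cached
+ * coord is reused (possibly after stepping to the right neighbor when
+ * @key equals the node's right delimiting key); otherwise a full tree
+ * lookup is done via find_file_item_nohint(). Callers should treat
+ * CBK_COORD_FOUND/CBK_COORD_NOTFOUND as valid outcomes and use
+ * cbk_errored() to detect real errors.
+ */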
33252+
33253+/* plugin->u.file.write_flow = NULL
33254+ plugin->u.file.read_flow = NULL */
33255+
33256+void hint_init_zero(hint_t * hint)
33257+{
33258+ memset(hint, 0, sizeof(*hint));
33259+ init_lh(&hint->lh);
33260+ hint->ext_coord.lh = &hint->lh;
33261+}
33262+
33263+static int find_file_state(struct inode *inode, struct unix_file_info *uf_info)
33264+{
33265+ int result;
33266+ reiser4_key key;
33267+ coord_t coord;
33268+ lock_handle lh;
33269+
33270+ assert("vs-1628", ea_obtained(uf_info));
33271+
33272+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
33273+ key_by_inode_and_offset_common(inode, 0, &key);
33274+ init_lh(&lh);
33275+ result = find_file_item_nohint(&coord, &lh, &key,
33276+ ZNODE_READ_LOCK, inode);
33277+ set_file_state(uf_info, result, znode_get_level(coord.node));
33278+ done_lh(&lh);
33279+ if (!cbk_errored(result))
33280+ result = 0;
33281+ } else
33282+ result = 0;
33283+ assert("vs-1074",
33284+ ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
33285+ reiser4_txn_restart_current();
33286+ return result;
33287+}
33288+
33289+/**
33290+ * Estimate and reserve the space needed to truncate a page
33291+ * which gets partially truncated: one block for the page
33292+ * itself, a stat-data update (estimate_one_insert_into_item)
33293+ * and one item insertion (estimate_one_insert_into_item),
33294+ * which may happen if the page corresponds to a hole extent
33295+ * and an unallocated one has to be created
33296+ */
33297+static int reserve_partial_page(reiser4_tree * tree)
33298+{
33299+ grab_space_enable();
33300+ return reiser4_grab_reserved(reiser4_get_current_sb(),
33301+ 1 +
33302+ 2 * estimate_one_insert_into_item(tree),
33303+ BA_CAN_COMMIT);
33304+}
33305+
33306+/* estimate and reserve space needed to cut one item and update one stat data */
33307+static int reserve_cut_iteration(reiser4_tree * tree)
33308+{
33309+ __u64 estimate = estimate_one_item_removal(tree)
33310+ + estimate_one_insert_into_item(tree);
33311+
33312+ assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
33313+
33314+ grab_space_enable();
33315+ /* We need to double our estimate now that we can delete more than one
33316+ node. */
33317+ return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
33318+ BA_CAN_COMMIT);
33319+}
33320+
33321+int reiser4_update_file_size(struct inode *inode, loff_t new_size,
33322+ int update_sd)
33323+{
33324+ int result = 0;
33325+
33326+ INODE_SET_SIZE(inode, new_size);
33327+ if (update_sd) {
33328+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
33329+ result = reiser4_update_sd(inode);
33330+ }
33331+ return result;
33332+}
33333+
33334+/**
33335+ * Cut file items one by one starting from the last one until
33336+ * new file size (inode->i_size) is reached. Reserve space
33337+ * and update file stat data on every single cut from the tree
33338+ */
33339+int cut_file_items(struct inode *inode, loff_t new_size,
33340+ int update_sd, loff_t cur_size,
33341+ int (*update_actor) (struct inode *, loff_t, int))
33342+{
33343+ reiser4_key from_key, to_key;
33344+ reiser4_key smallest_removed;
33345+ file_plugin *fplug = inode_file_plugin(inode);
33346+ int result;
33347+ int progress = 0;
33348+
33349+ assert("vs-1248",
33350+ fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
33351+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
33352+
33353+ fplug->key_by_inode(inode, new_size, &from_key);
33354+ to_key = from_key;
33355+ set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ );
33356+ /* this loop normally runs just once */
33357+ while (1) {
33358+ result = reserve_cut_iteration(reiser4_tree_by_inode(inode));
33359+ if (result)
33360+ break;
33361+
33362+ result = reiser4_cut_tree_object(current_tree, &from_key, &to_key,
33363+ &smallest_removed, inode, 1,
33364+ &progress);
33365+ if (result == -E_REPEAT) {
33366+ /**
33367+ * -E_REPEAT is a signal to interrupt a long
33368+ * file truncation process
33369+ */
33370+ if (progress) {
33371+ result = update_actor(inode,
33372+ get_key_offset(&smallest_removed),
33373+ update_sd);
33374+ if (result)
33375+ break;
33376+ }
33377+ /* the below does up(sbinfo->delete_mutex).
33378+ * Do not get fooled */
33379+ reiser4_release_reserved(inode->i_sb);
33380+ /**
33381+ * reiser4_cut_tree_object() was interrupted probably
33382+ * because current atom requires commit, we have to
33383+ * release transaction handle to allow atom commit.
33384+ */
33385+ reiser4_txn_restart_current();
33386+ continue;
33387+ }
33388+ if (result
33389+ && !(result == CBK_COORD_NOTFOUND && new_size == 0
33390+ && inode->i_size == 0))
33391+ break;
33392+
33393+ set_key_offset(&smallest_removed, new_size);
33394+ /* Final sd update after the file gets its correct size */
33395+ result = update_actor(inode, get_key_offset(&smallest_removed),
33396+ update_sd);
33397+ break;
33398+ }
33399+
33400+ /* the below does up(sbinfo->delete_mutex). Do not get fooled */
33401+ reiser4_release_reserved(inode->i_sb);
33402+
33403+ return result;
33404+}
33405+
33406+int find_or_create_extent(struct page *page);
33407+
33408+/* part of truncate_file_body: it is called when truncate is used to make file
33409+ shorter */
33410+static int shorten_file(struct inode *inode, loff_t new_size)
33411+{
33412+ int result;
33413+ struct page *page;
33414+ int padd_from;
33415+ unsigned long index;
33416+ struct unix_file_info *uf_info;
33417+
33418+ /*
33419+ * all items of an ordinary reiser4 file are grouped together. That is why
33420+ * we can use reiser4_cut_tree. Plan B files (for instance) cannot be
33421+ * truncated that simply
33422+ */
33423+ result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
33424+ get_key_offset(reiser4_max_key()),
33425+ reiser4_update_file_size);
33426+ if (result)
33427+ return result;
33428+
33429+ uf_info = unix_file_inode_data(inode);
33430+ assert("vs-1105", new_size == inode->i_size);
33431+ if (new_size == 0) {
33432+ uf_info->container = UF_CONTAINER_EMPTY;
33433+ return 0;
33434+ }
33435+
33436+ result = find_file_state(inode, uf_info);
33437+ if (result)
33438+ return result;
33439+ if (uf_info->container == UF_CONTAINER_TAILS)
33440+ /*
33441+ * No need to worry about zeroing last page after new file
33442+ * end
33443+ */
33444+ return 0;
33445+
33446+ padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
33447+ if (!padd_from)
33448+ /* file is truncated to page boundary */
33449+ return 0;
33450+
33451+ result = reserve_partial_page(reiser4_tree_by_inode(inode));
33452+ if (result) {
33453+ reiser4_release_reserved(inode->i_sb);
33454+ return result;
33455+ }
33456+
33457+ /* last page is partially truncated - zero its content */
33458+ index = (inode->i_size >> PAGE_CACHE_SHIFT);
33459+ page = read_mapping_page(inode->i_mapping, index, NULL);
33460+ if (IS_ERR(page)) {
33461+ /*
33462+ * the below does up(sbinfo->delete_mutex). Do not get
33463+ * confused
33464+ */
33465+ reiser4_release_reserved(inode->i_sb);
33466+ if (likely(PTR_ERR(page) == -EINVAL)) {
33467+ /* looks like file is built of tail items */
33468+ return 0;
33469+ }
33470+ return PTR_ERR(page);
33471+ }
33472+ wait_on_page_locked(page);
33473+ if (!PageUptodate(page)) {
33474+ page_cache_release(page);
33475+ /*
33476+ * the below does up(sbinfo->delete_mutex). Do not get
33477+ * confused
33478+ */
33479+ reiser4_release_reserved(inode->i_sb);
33480+ return RETERR(-EIO);
33481+ }
33482+
33483+ /*
33484+ * if the page corresponds to a hole extent unit, an unallocated one
33485+ * will be created here. This is not necessary
33486+ */
33487+ result = find_or_create_extent(page);
33488+
33489+ /*
33490+ * FIXME: cut_file_items has already updated inode. Probably it would
33491+ * be better to update it here when file is really truncated
33492+ */
33493+ if (result) {
33494+ page_cache_release(page);
33495+ /*
33496+ * the below does up(sbinfo->delete_mutex). Do not get
33497+ * confused
33498+ */
33499+ reiser4_release_reserved(inode->i_sb);
33500+ return result;
33501+ }
33502+
33503+ lock_page(page);
33504+ assert("vs-1066", PageLocked(page));
33505+ zero_user_page(page, padd_from, PAGE_CACHE_SIZE - padd_from, KM_USER0);
33506+ unlock_page(page);
33507+ page_cache_release(page);
33508+ /* the below does up(sbinfo->delete_mutex). Do not get confused */
33509+ reiser4_release_reserved(inode->i_sb);
33510+ return 0;
33511+}
33512+
33513+/**
33514+ * should_have_notail
33515+ * @uf_info:
33516+ * @new_size:
33517+ *
33518+ * Calls the formatting plugin to see whether a file of size @new_size has to
33519+ * be stored in unformatted nodes or in tail items. 0 is returned in the latter case.
33520+ */
33521+static int should_have_notail(const struct unix_file_info *uf_info, loff_t new_size)
33522+{
33523+ if (!uf_info->tplug)
33524+ return 1;
33525+ return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
33526+ new_size);
33527+
33528+}
33529+
33530+/**
33531+ * truncate_file_body - change length of file
33532+ * @inode: inode of file
33533+ * @new_size: new file length
33534+ *
33535+ * Adjusts the items file @inode is built of to match @new_size. It may either cut
33536+ * items or add them to represent a hole at the end of file. The caller has to
33537+ * obtain exclusive access to the file.
33538+ */
33539+static int truncate_file_body(struct inode *inode, struct iattr *attr)
33540+{
33541+ int result;
33542+ loff_t new_size = attr->ia_size;
33543+
33544+ if (inode->i_size < new_size) {
33545+ /* expanding truncate */
33546+ struct unix_file_info *uf_info = unix_file_inode_data(inode);
33547+
33548+ result = find_file_state(inode, uf_info);
33549+ if (result)
33550+ return result;
33551+
33552+ if (should_have_notail(uf_info, new_size)) {
33553+ /*
33554+ * file of size @new_size has to be built of
33555+ * extents. If it is built of tails - convert to
33556+ * extents
33557+ */
33558+ if (uf_info->container == UF_CONTAINER_TAILS) {
33559+ /*
33560+ * if the file is being converted by another
33561+ * process - wait until it completes
33562+ */
33563+ while (1) {
33564+ if (reiser4_inode_get_flag(inode,
33565+ REISER4_PART_IN_CONV)) {
33566+ drop_exclusive_access(uf_info);
33567+ schedule();
33568+ get_exclusive_access(uf_info);
33569+ continue;
33570+ }
33571+ break;
33572+ }
33573+
33574+ if (uf_info->container == UF_CONTAINER_TAILS) {
33575+ result = tail2extent(uf_info);
33576+ if (result)
33577+ return result;
33578+ }
33579+ }
33580+ result = reiser4_write_extent(NULL, inode, NULL,
33581+ 0, &new_size);
33582+ if (result)
33583+ return result;
33584+ uf_info->container = UF_CONTAINER_EXTENTS;
33585+ } else {
33586+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
33587+ result = reiser4_write_extent(NULL, inode, NULL,
33588+ 0, &new_size);
33589+ if (result)
33590+ return result;
33591+ } else {
33592+ result = reiser4_write_tail(NULL, inode, NULL,
33593+ 0, &new_size);
33594+ if (result)
33595+ return result;
33596+ uf_info->container = UF_CONTAINER_TAILS;
33597+ }
33598+ }
33599+ BUG_ON(result > 0);
33600+ result = reiser4_update_file_size(inode, new_size, 1);
33601+ BUG_ON(result != 0);
33602+ } else
33603+ result = shorten_file(inode, new_size);
33604+ return result;
33605+}
33606+
33607+/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
33608+
33609+/**
33610+ * load_file_hint - copy hint from struct file to local variable
33611+ * @file: file to get hint from
33612+ * @hint: structure to fill
33613+ *
33614+ * Reiser4 specific portion of struct file may contain information (hint)
33615+ * stored on exiting from previous read or write. That information includes
33616+ * seal of znode and coord within that znode where previous read or write
33617+ * stopped. This function copies that information to @hint if it was stored or
33618+ * initializes @hint by 0s otherwise.
33619+ */
33620+int load_file_hint(struct file *file, hint_t *hint)
33621+{
33622+ reiser4_file_fsdata *fsdata;
33623+
33624+ if (file) {
33625+ fsdata = reiser4_get_file_fsdata(file);
33626+ if (IS_ERR(fsdata))
33627+ return PTR_ERR(fsdata);
33628+
33629+ spin_lock_inode(file->f_dentry->d_inode);
33630+ if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) {
33631+ *hint = fsdata->reg.hint;
33632+ init_lh(&hint->lh);
33633+ hint->ext_coord.lh = &hint->lh;
33634+ spin_unlock_inode(file->f_dentry->d_inode);
33635+ /*
33636+ * force re-validation of the coord on the first
33637+ * iteration of the read/write loop.
33638+ */
33639+ hint->ext_coord.valid = 0;
33640+ assert("nikita-19892", coords_equal(&hint->seal.coord1,
33641+ &hint->ext_coord.
33642+ coord));
33643+ return 0;
33644+ }
33645+ memset(&fsdata->reg.hint, 0, sizeof(hint_t));
33646+ spin_unlock_inode(file->f_dentry->d_inode);
33647+ }
33648+ hint_init_zero(hint);
33649+ return 0;
33650+}
33651+
33652+/**
33653+ * save_file_hint - copy hint to reiser4 private struct file's part
33654+ * @file: file to save hint in
33655+ * @hint: hint to save
33656+ *
33657+ * This copies @hint to the reiser4 private part of struct file. It can help
33658+ * speed up future accesses to the file.
33659+ */
33660+void save_file_hint(struct file *file, const hint_t *hint)
33661+{
33662+ reiser4_file_fsdata *fsdata;
33663+
33664+ assert("edward-1337", hint != NULL);
33665+
33666+ if (!file || !reiser4_seal_is_set(&hint->seal))
33667+ return;
33668+ fsdata = reiser4_get_file_fsdata(file);
33669+ assert("vs-965", !IS_ERR(fsdata));
33670+ assert("nikita-19891",
33671+ coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
33672+ assert("vs-30", hint->lh.owner == NULL);
33673+ spin_lock_inode(file->f_dentry->d_inode);
33674+ fsdata->reg.hint = *hint;
33675+ spin_unlock_inode(file->f_dentry->d_inode);
33676+ return;
33677+}
33678+
33679+void reiser4_unset_hint(hint_t * hint)
33680+{
33681+ assert("vs-1315", hint);
33682+ hint->ext_coord.valid = 0;
33683+ reiser4_seal_done(&hint->seal);
33684+ done_lh(&hint->lh);
33685+}
33686+
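+ /*
+ * The hint life cycle as used in this file: reiser4_set_hint() seals the
+ * current coord/key pair, hint_validate() later checks the seal and, if
+ * it is still valid, re-acquires the coord without a full tree lookup,
+ * and reiser4_unset_hint() discards the seal once the cached position
+ * can no longer be trusted.
+ */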
33687+/* the coord must already be set properly, so that reiser4_set_hint
33688+ has nothing left to do but seal it */
33689+void reiser4_set_hint(hint_t * hint, const reiser4_key * key,
33690+ znode_lock_mode mode)
33691+{
33692+ ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
33693+ assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
33694+
33695+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key);
33696+ hint->offset = get_key_offset(key);
33697+ hint->mode = mode;
33698+ done_lh(&hint->lh);
33699+}
33700+
33701+int hint_is_set(const hint_t * hint)
33702+{
33703+ return reiser4_seal_is_set(&hint->seal);
33704+}
33705+
33706+#if REISER4_DEBUG
33707+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
33708+{
33709+ return (get_key_locality(k1) == get_key_locality(k2) &&
33710+ get_key_type(k1) == get_key_type(k2) &&
33711+ get_key_band(k1) == get_key_band(k2) &&
33712+ get_key_ordering(k1) == get_key_ordering(k2) &&
33713+ get_key_objectid(k1) == get_key_objectid(k2));
33714+}
33715+#endif
33716+
33717+static int
33718+hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
33719+ znode_lock_mode lock_mode)
33720+{
33721+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
33722+ /* hint either not set or set by different operation */
33723+ return RETERR(-E_REPEAT);
33724+
33725+ assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
33726+
33727+ if (check_key && get_key_offset(key) != hint->offset)
33728+ /* hint is set for different key */
33729+ return RETERR(-E_REPEAT);
33730+
33731+ assert("vs-31", hint->ext_coord.lh == &hint->lh);
33732+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key,
33733+ hint->ext_coord.lh, lock_mode,
33734+ ZNODE_LOCK_LOPRI);
33735+}
33736+
33737+/**
33738+ * Look for place at twig level for extent corresponding to page,
33739+ * call extent's writepage method to create unallocated extent if
33740+ * it does not exist yet, initialize jnode, capture page
33741+ */
33742+int find_or_create_extent(struct page *page)
33743+{
33744+ int result;
33745+ struct inode *inode;
33746+ int plugged_hole;
33747+
33748+ jnode *node;
33749+
33750+ assert("vs-1065", page->mapping && page->mapping->host);
33751+ inode = page->mapping->host;
33752+
33753+ lock_page(page);
33754+ node = jnode_of_page(page);
33755+ if (IS_ERR(node)) {
33756+ unlock_page(page);
33757+ return PTR_ERR(node);
33758+ }
33759+ JF_SET(node, JNODE_WRITE_PREPARED);
33760+ unlock_page(page);
33761+
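+ /*
+ * blocknr == 0 is taken here to mean that no block is bound to the
+ * jnode yet (the page may sit over a hole), so an unallocated extent
+ * has to be created for it; otherwise the node only needs to be
+ * captured and dirtied
+ */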
33762+ if (node->blocknr == 0) {
33763+ plugged_hole = 0;
33764+ result = reiser4_update_extent(inode, node, page_offset(page),
33765+ &plugged_hole);
33766+ if (result) {
33767+ JF_CLR(node, JNODE_WRITE_PREPARED);
33768+ jput(node);
33769+ warning("edward-1549",
33770+ "reiser4_update_extent failed: %d", result);
33771+ return result;
33772+ }
33773+ if (plugged_hole)
33774+ reiser4_update_sd(inode);
33775+ } else {
33776+ spin_lock_jnode(node);
33777+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
33778+ BUG_ON(result != 0);
33779+ jnode_make_dirty_locked(node);
33780+ spin_unlock_jnode(node);
33781+ }
33782+
33783+ BUG_ON(node->atom == NULL);
33784+ JF_CLR(node, JNODE_WRITE_PREPARED);
33785+ jput(node);
33786+
33787+ if (get_current_context()->entd) {
33788+ entd_context *ent = get_entd_context(node->tree->super);
33789+
33790+ if (ent->cur_request->page == page)
33791+ ent->cur_request->node = node;
33792+ }
33793+ return 0;
33794+}
33795+
33796+/**
33797+ * has_anonymous_pages - check whether inode has pages dirtied via mmap
33798+ * @inode: inode to check
33799+ *
33800+ * Returns true if inode's mapping has dirty pages which do not belong to any
33801+ * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page
33802+ * tree or were eflushed and can be found via jnodes tagged
33803+ * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes.
33804+ */
33805+static int has_anonymous_pages(struct inode *inode)
33806+{
33807+ int result;
33808+
33809+ read_lock_irq(&inode->i_mapping->tree_lock);
33810+ result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
33811+ read_unlock_irq(&inode->i_mapping->tree_lock);
33812+ return result;
33813+}
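+ /*
+ * Note that only the PAGECACHE_TAG_REISER4_MOVED tag is consulted above;
+ * the eflushed jnodes mentioned in the comment are handled separately by
+ * capture_anonymous_jnodes(), which is a stub in this version (see below)
+ */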
33814+
33815+/**
33816+ * capture_page_and_create_extent -
33817+ * @page: page to be captured
33818+ *
33819+ * Grabs space for extent creation and stat data update and calls function to
33820+ * do actual work.
33821+ */
33822+static int capture_page_and_create_extent(struct page *page)
33823+{
33824+ int result;
33825+ struct inode *inode;
33826+
33827+ assert("vs-1084", page->mapping && page->mapping->host);
33828+ inode = page->mapping->host;
33829+ assert("vs-1139",
33830+ unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
33831+ /* page belongs to file */
33832+ assert("vs-1393",
33833+ inode->i_size > page_offset(page));
33834+
33835+ /* page capture may require extent creation (if it does not exist yet)
33836+ and stat data's update (number of blocks changes on extent
33837+ creation) */
33838+ grab_space_enable();
33839+ result = reiser4_grab_space(2 * estimate_one_insert_into_item
33840+ (reiser4_tree_by_inode(inode)),
33841+ BA_CAN_COMMIT);
33842+ if (likely(!result))
33843+ result = find_or_create_extent(page);
33844+
33845+ if (result != 0)
33846+ SetPageError(page);
33847+ return result;
33848+}
33849+
33850+/* this is the implementation of the commit_write method of struct
33851+ address_space_operations for the unix file plugin */
33852+int
33853+commit_write_unix_file(struct file *file, struct page *page,
33854+ unsigned from, unsigned to)
33855+{
33856+ reiser4_context *ctx;
33857+ struct inode *inode;
33858+ int result;
33859+
33860+ assert("umka-3101", file != NULL);
33861+ assert("umka-3102", page != NULL);
33862+ assert("umka-3093", PageLocked(page));
33863+
33864+ SetPageUptodate(page);
33865+
33866+ inode = page->mapping->host;
33867+ ctx = reiser4_init_context(page->mapping->host->i_sb);
33868+ if (IS_ERR(ctx))
33869+ return PTR_ERR(ctx);
33870+ page_cache_get(page);
33871+ unlock_page(page);
33872+ result = capture_page_and_create_extent(page);
33873+ lock_page(page);
33874+ page_cache_release(page);
33875+
33876+ /* don't commit transaction under inode semaphore */
33877+ context_set_commit_async(ctx);
33878+ reiser4_exit_context(ctx);
33879+ return result;
33880+}
33881+
33882+/*
33883+ * Support for "anonymous" pages and jnodes.
33884+ *
33885+ * When a file is write-accessed through mmap, pages can be dirtied from user
33886+ * level. In this case the kernel is not notified until one of the following happens:
33887+ *
33888+ * (1) msync()
33889+ *
33890+ * (2) truncate() (either explicit or through unlink)
33891+ *
33892+ * (3) VM scanner starts reclaiming mapped pages, dirtying them before
33893+ * starting write-back.
33894+ *
33895+ * As a result of (3) ->writepage may be called on a dirty page without
33896+ * jnode. Such page is called "anonymous" in reiser4. Certain work-loads
33897+ * (iozone) generate a huge number of anonymous pages. Emergency flush handles
33898+ * this situation by creating jnode for anonymous page, starting IO on the
33899+ * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of
33900+ * memory. Such jnode is also called anonymous.
33901+ *
33902+ * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
33903+ * tree. This is done by capture_anonymous_*() functions below.
33904+ */
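+ /*
+ * For illustration only, a userspace sketch of case (1) above (not part
+ * of this patch):
+ *
+ * char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ * p[0] = 'x';             page dirtied behind the kernel's back
+ * msync(p, len, MS_SYNC); the kernel finally learns about the change
+ */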
33905+
33906+/**
33907+ * capture_anonymous_page - involve page into transaction
33908+ * @pg: page to deal with
33909+ *
33910+ * Takes care that @page has corresponding metadata in the tree, creates jnode
33911+ * for @page and captures it. On success 1 is returned.
33912+ */
33913+static int capture_anonymous_page(struct page *page)
33914+{
33915+ int result;
33916+
33917+ if (PageWriteback(page))
33918+ /* FIXME: do nothing? */
33919+ return 0;
33920+
33921+ result = capture_page_and_create_extent(page);
33922+ if (result == 0) {
33923+ result = 1;
33924+ } else
33925+ warning("nikita-3329",
33926+ "Cannot capture anon page: %i", result);
33927+
33928+ return result;
33929+}
33930+
33931+/**
33932+ * capture_anonymous_pages - find and capture pages dirtied via mmap
33933+ * @mapping: address space where to look for pages
33934+ * @index: start index
33935+ * @to_capture: maximum number of pages to capture
33936+ *
33937+ * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
33938+ * captures them (involves them into an atom), returns the number of captured pages,
33939+ * updates @index to next page after the last captured one.
33940+ */
33941+static int
33942+capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
33943+ unsigned int to_capture)
33944+{
33945+ int result;
33946+ struct pagevec pvec;
33947+ unsigned int i, count;
33948+ int nr;
33949+
33950+ pagevec_init(&pvec, 0);
33951+ count = min(pagevec_space(&pvec), to_capture);
33952+ nr = 0;
33953+
33954+ /* find pages tagged MOVED */
33955+ write_lock_irq(&mapping->tree_lock);
33956+ pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
33957+ (void **)pvec.pages, *index, count,
33958+ PAGECACHE_TAG_REISER4_MOVED);
33959+ if (pagevec_count(&pvec) == 0) {
33960+ /*
33961+ * there are no pages tagged MOVED in mapping->page_tree
33962+ * starting from *index
33963+ */
33964+ write_unlock_irq(&mapping->tree_lock);
33965+ *index = (pgoff_t)-1;
33966+ return 0;
33967+ }
33968+
33969+ /* clear MOVED tag for all found pages */
33970+ for (i = 0; i < pagevec_count(&pvec); i++) {
33971+ page_cache_get(pvec.pages[i]);
33972+ radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
33973+ PAGECACHE_TAG_REISER4_MOVED);
33974+ }
33975+ write_unlock_irq(&mapping->tree_lock);
33976+
33977+
33978+ *index = pvec.pages[i - 1]->index + 1;
33979+
33980+ for (i = 0; i < pagevec_count(&pvec); i++) {
33981+ /*
33982+ * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by
33983+ * reiser4_set_page_dirty_internal which is called when jnode is
33984+ * captured
33985+ */
33986+ result = capture_anonymous_page(pvec.pages[i]);
33987+ if (result == 1)
33988+ nr++;
33989+ else {
33990+ if (result < 0) {
33991+ warning("vs-1454",
33992+ "failed to capture page: "
33993+ "result=%d, captured=%d)\n",
33994+ result, i);
33995+
33996+ /*
33997+ * set the MOVED tag on all pages left
33998+ * uncaptured
33999+ */
34000+ write_lock_irq(&mapping->tree_lock);
34001+ for (; i < pagevec_count(&pvec); i ++) {
34002+ radix_tree_tag_set(&mapping->page_tree,
34003+ pvec.pages[i]->index,
34004+ PAGECACHE_TAG_REISER4_MOVED);
34005+ }
34006+ write_unlock_irq(&mapping->tree_lock);
34007+
34008+ pagevec_release(&pvec);
34009+ return result;
34010+ } else {
34011+ /*
34012+ * result == 0. capture_anonymous_page returns
34013+ * 0 for a page under writeback. Set the MOVED
34014+ * tag on that page
34015+ */
34016+ write_lock_irq(&mapping->tree_lock);
34017+ radix_tree_tag_set(&mapping->page_tree,
34018+ pvec.pages[i]->index,
34019+ PAGECACHE_TAG_REISER4_MOVED);
34020+ write_unlock_irq(&mapping->tree_lock);
34021+ if (i == 0)
34022+ *index = pvec.pages[0]->index;
34023+ else
34024+ *index = pvec.pages[i - 1]->index + 1;
34025+ }
34026+ }
34027+ }
34028+ pagevec_release(&pvec);
34029+ return nr;
34030+}
34031+
34032+/**
34033+ * capture_anonymous_jnodes - find and capture anonymous jnodes
34034+ * @mapping: address space where to look for jnodes
34035+ * @from: start index
34036+ * @to: end index
34037+ * @to_capture: maximum number of jnodes to capture
34038+ *
34039+ * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in
34040+ * the range of indexes @from-@to and captures them, returns number of captured
34041+ * jnodes, updates @from to next jnode after the last captured one.
34042+ */
34043+static int
34044+capture_anonymous_jnodes(struct address_space *mapping,
34045+ pgoff_t *from, pgoff_t to, int to_capture)
34046+{
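+ /*
+ * apparently a stub in this version: no anonymous jnodes are ever
+ * reported, the whole range is marked processed and zero captured
+ * jnodes are returned
+ */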
34047+ *from = to;
34048+ return 0;
34049+}
34050+
34051+/*
34052+ * Commit atom of the jnode of a page.
34053+ */
34054+static int sync_page(struct page *page)
34055+{
34056+ int result;
34057+ do {
34058+ jnode *node;
34059+ txn_atom *atom;
34060+
34061+ lock_page(page);
34062+ node = jprivate(page);
34063+ if (node != NULL) {
34064+ spin_lock_jnode(node);
34065+ atom = jnode_get_atom(node);
34066+ spin_unlock_jnode(node);
34067+ } else
34068+ atom = NULL;
34069+ unlock_page(page);
34070+ result = reiser4_sync_atom(atom);
34071+ } while (result == -E_REPEAT);
34072+ /*
34073+ * ZAM-FIXME-HANS: document the logic of this loop, is it just to
34074+ * handle the case where more pages get added to the atom while we are
34075+ * syncing it?
34076+ */
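+ /*
+ * (presumably yes: reiser4_sync_atom() returns -E_REPEAT when the atom
+ * changed under us, e.g. fused with another atom, so the jnode-to-atom
+ * lookup has to be redone)
+ */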
34077+ assert("nikita-3485", ergo(result == 0,
34078+ get_current_context()->trans->atom == NULL));
34079+ return result;
34080+}
34081+
34082+/*
34083+ * Commit atoms of pages on @pages list.
34084+ * call sync_page for each page from mapping's page tree
34085+ */
34086+static int sync_page_list(struct inode *inode)
34087+{
34088+ int result;
34089+ struct address_space *mapping;
34090+ unsigned long from; /* start index for radix_tree_gang_lookup */
34091+ unsigned int found; /* return value for radix_tree_gang_lookup */
34092+
34093+ mapping = inode->i_mapping;
34094+ from = 0;
34095+ result = 0;
34096+ read_lock_irq(&mapping->tree_lock);
34097+ while (result == 0) {
34098+ struct page *page;
34099+
34100+ found =
34101+ radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
34102+ from, 1);
34103+ assert("edward-1550", found < 2);
34104+ if (found == 0)
34105+ break;
34106+ /**
34107+ * the page cannot leave the radix tree because it is protected from
34108+ * truncation by inode->i_mutex, locked by sys_fsync
34109+ */
34110+ page_cache_get(page);
34111+ read_unlock_irq(&mapping->tree_lock);
34112+
34113+ from = page->index + 1;
34114+
34115+ result = sync_page(page);
34116+
34117+ page_cache_release(page);
34118+ read_lock_irq(&mapping->tree_lock);
34119+ }
34120+
34121+ read_unlock_irq(&mapping->tree_lock);
34122+ return result;
34123+}
34124+
34125+static int commit_file_atoms(struct inode *inode)
34126+{
34127+ int result;
34128+ struct unix_file_info *uf_info;
34129+
34130+ uf_info = unix_file_inode_data(inode);
34131+
34132+ get_exclusive_access(uf_info);
34133+ /*
34134+ * find what items file is made from
34135+ */
34136+ result = find_file_state(inode, uf_info);
34137+ drop_exclusive_access(uf_info);
34138+ if (result != 0)
34139+ return result;
34140+
34141+ /*
34142+ * file state cannot change because we are under ->i_mutex
34143+ */
34144+ switch (uf_info->container) {
34145+ case UF_CONTAINER_EXTENTS:
34146+ /* find_file_state might open or join an atom */
34147+ reiser4_txn_restart_current();
34148+ result =
34149+ /*
34150+ * when we are called by
34151+ * filemap_fdatawrite->
34152+ * do_writepages()->
34153+ * reiser4_writepages()
34154+ *
34155+ * inode->i_mapping->dirty_pages are spliced into
34156+ * ->io_pages, leaving ->dirty_pages dirty.
34157+ *
34158+ * When we are called from
34159+ * reiser4_fsync()->sync_unix_file(), we have to
34160+ * commit atoms of all pages on the ->dirty_list.
34161+ *
34162+ * So for simplicity we just commit ->io_pages and
34163+ * ->dirty_pages.
34164+ */
34165+ sync_page_list(inode);
34166+ break;
34167+ case UF_CONTAINER_TAILS:
34168+ /*
34169+ * NOTE-NIKITA probably we can be smarter for tails. For now
34170+ * just commit all existing atoms.
34171+ */
34172+ result = txnmgr_force_commit_all(inode->i_sb, 0);
34173+ break;
34174+ case UF_CONTAINER_EMPTY:
34175+ result = 0;
34176+ break;
34177+ case UF_CONTAINER_UNKNOWN:
34178+ default:
34179+ result = -EIO;
34180+ break;
34181+ }
34182+
34183+ /*
34184+ * commit current transaction: there can be captured nodes from
34185+ * find_file_state() and finish_conversion().
34186+ */
34187+ reiser4_txn_restart_current();
34188+ return result;
34189+}
34190+
34191+/**
34192+ * writepages_unix_file - writepages of struct address_space_operations
34193+ * @mapping:
34194+ * @wbc:
34195+ *
34196+ * This captures anonymous pages and anonymous jnodes. Anonymous pages are
34197+ * pages which are dirtied via mmap. Anonymous jnodes are ones which were
34198+ * created by reiser4_writepage.
34199+ */
34200+int writepages_unix_file(struct address_space *mapping,
34201+ struct writeback_control *wbc)
34202+{
34203+ int result;
34204+ struct unix_file_info *uf_info;
34205+ pgoff_t pindex, jindex, nr_pages;
34206+ long to_capture;
34207+ struct inode *inode;
34208+
34209+ inode = mapping->host;
34210+ if (!has_anonymous_pages(inode)) {
34211+ result = 0;
34212+ goto end;
34213+ }
34214+ jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT;
34215+ result = 0;
34216+ nr_pages = size_in_pages(i_size_read(inode));
34217+
34218+ uf_info = unix_file_inode_data(inode);
34219+
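+ /*
+ * The loop below walks the file in bursts of at most CAPTURE_APAGE_BURST
+ * pages per transaction: @pindex tracks progress through anonymous
+ * pages, @jindex through anonymous jnodes, with the invariant
+ * jindex <= pindex (asserted below). Each burst runs in its own reiser4
+ * context and, for WB_SYNC_ALL, ends with a commit of the file's atoms.
+ */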
34220+ do {
34221+ reiser4_context *ctx;
34222+
34223+ if (wbc->sync_mode != WB_SYNC_ALL)
34224+ to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
34225+ else
34226+ to_capture = CAPTURE_APAGE_BURST;
34227+
34228+ ctx = reiser4_init_context(inode->i_sb);
34229+ if (IS_ERR(ctx)) {
34230+ result = PTR_ERR(ctx);
34231+ break;
34232+ }
34233+ /* avoid recursive calls to ->sync_inodes */
34234+ ctx->nobalance = 1;
34235+ assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
34236+ assert("edward-1551", LOCK_CNT_NIL(inode_sem_w));
34237+ assert("edward-1552", LOCK_CNT_NIL(inode_sem_r));
34238+
34239+ reiser4_txn_restart_current();
34240+
34241+ /* we have to get nonexclusive access to the file */
34242+ if (get_current_context()->entd) {
34243+ /*
34244+ * use nonblocking version of nonexclusive_access to
34245+ * avoid deadlock which might look like the following:
34246+ * process P1 holds NEA on file F1 and called entd to
34247+ * reclaim some memory. Entd works for P1 and is going
34248+ * to capture pages of file F2. To do that entd has to
34249+ * get NEA to F2. F2 is held by process P2 which also
34250+ * called entd. But entd is serving P1 at the moment
34251+ * and P2 has to wait. Process P3 is trying to get EA to
34252+ * file F2. The existence of a pending EA request to file F2
34253+ * makes it impossible for entd to get NEA to file
34254+ * F2. None of these processes can continue. Using the
34255+ * nonblocking version of getting NEA is supposed to
34256+ * avoid this deadlock.
34257+ */
34258+ if (try_to_get_nonexclusive_access(uf_info) == 0) {
34259+ result = RETERR(-EBUSY);
34260+ reiser4_exit_context(ctx);
34261+ break;
34262+ }
34263+ } else
34264+ get_nonexclusive_access(uf_info);
34265+
34266+ while (to_capture > 0) {
34267+ pgoff_t start;
34268+
34269+ assert("vs-1727", jindex <= pindex);
34270+ if (pindex == jindex) {
34271+ start = pindex;
34272+ result =
34273+ capture_anonymous_pages(inode->i_mapping,
34274+ &pindex,
34275+ to_capture);
34276+ if (result <= 0)
34277+ break;
34278+ to_capture -= result;
34279+ wbc->nr_to_write -= result;
34280+ if (start + result == pindex) {
34281+ jindex = pindex;
34282+ continue;
34283+ }
34284+ if (to_capture <= 0)
34285+ break;
34286+ }
34287+ /* deal with anonymous jnodes between jindex and pindex */
34288+ result =
34289+ capture_anonymous_jnodes(inode->i_mapping, &jindex,
34290+ pindex, to_capture);
34291+ if (result < 0)
34292+ break;
34293+ to_capture -= result;
34294+ get_current_context()->nr_captured += result;
34295+
34296+ if (jindex == (pgoff_t) - 1) {
34297+ assert("vs-1728", pindex == (pgoff_t) - 1);
34298+ break;
34299+ }
34300+ }
34301+ if (to_capture <= 0)
34302+ /* there may be left more pages */
34303+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
34304+
34305+ drop_nonexclusive_access(uf_info);
34306+ if (result < 0) {
34307+ /* error happened */
34308+ reiser4_exit_context(ctx);
34309+ return result;
34310+ }
34311+ if (wbc->sync_mode != WB_SYNC_ALL) {
34312+ reiser4_exit_context(ctx);
34313+ return 0;
34314+ }
34315+ result = commit_file_atoms(inode);
34316+ reiser4_exit_context(ctx);
34317+ if (pindex >= nr_pages && jindex == pindex)
34318+ break;
34319+ } while (1);
34320+
34321+ end:
34322+ if (is_in_reiser4_context()) {
34323+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
34324+ /*
34325+ * there are already pages to flush, flush them out, do
34326+ * not delay until end of reiser4_sync_inodes
34327+ */
34328+ reiser4_writeout(inode->i_sb, wbc);
34329+ get_current_context()->nr_captured = 0;
34330+ }
34331+ }
34332+ return result;
34333+}
34334+
34335+/**
34336+ * readpage_unix_file - readpage of struct address_space_operations
34337+ * @file:
34338+ * @page:
34339+ *
34340+ * Compose a key and search for item containing information about @page
34341+ * data. If item is found - its readpage method is called.
34342+ */
34343+int readpage_unix_file(struct file *file, struct page *page)
34344+{
34345+ reiser4_context *ctx;
34346+ int result;
34347+ struct inode *inode;
34348+ reiser4_key key;
34349+ item_plugin *iplug;
34350+ hint_t *hint;
34351+ lock_handle *lh;
34352+ coord_t *coord;
34353+
34354+ assert("vs-1062", PageLocked(page));
34355+ assert("vs-976", !PageUptodate(page));
34356+ assert("vs-1061", page->mapping && page->mapping->host);
34357+
34358+ if (page->mapping->host->i_size <= page_offset(page)) {
34359+ /* page is out of file */
34360+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
34361+ SetPageUptodate(page);
34362+ unlock_page(page);
34363+ return 0;
34364+ }
34365+
34366+ inode = page->mapping->host;
34367+ ctx = reiser4_init_context(inode->i_sb);
34368+ if (IS_ERR(ctx)) {
34369+ unlock_page(page);
34370+ return PTR_ERR(ctx);
34371+ }
34372+
34373+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34374+ if (hint == NULL) {
34375+ unlock_page(page);
34376+ reiser4_exit_context(ctx);
34377+ return RETERR(-ENOMEM);
34378+ }
34379+
34380+ result = load_file_hint(file, hint);
34381+ if (result) {
34382+ kfree(hint);
34383+ unlock_page(page);
34384+ reiser4_exit_context(ctx);
34385+ return result;
34386+ }
34387+ lh = &hint->lh;
34388+
34389+ /* get key of first byte of the page */
34390+ key_by_inode_and_offset_common(inode, page_offset(page), &key);
34391+
34392+ /* look for file metadata corresponding to first byte of page */
34393+ page_cache_get(page);
34394+ unlock_page(page);
34395+ result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
34396+ lock_page(page);
34397+ page_cache_release(page);
34398+
34399+ if (page->mapping == NULL) {
34400+ /*
34401+ * readpage allows truncate to run concurrently. Page was
34402+ * truncated while it was not locked
34403+ */
34404+ done_lh(lh);
34405+ kfree(hint);
34406+ unlock_page(page);
34407+ reiser4_txn_restart(ctx);
34408+ reiser4_exit_context(ctx);
34409+ return -EINVAL;
34410+ }
34411+
34412+ if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
34413+ if (result == CBK_COORD_FOUND &&
34414+ hint->ext_coord.coord.between != AT_UNIT)
34415+ /* file is truncated */
34416+ result = -EINVAL;
34417+ done_lh(lh);
34418+ kfree(hint);
34419+ unlock_page(page);
34420+ reiser4_txn_restart(ctx);
34421+ reiser4_exit_context(ctx);
34422+ return result;
34423+ }
34424+
34425+ /*
34426+ * item corresponding to page is found. It can not be removed because
34427+ * znode lock is held
34428+ */
34429+ if (PageUptodate(page)) {
34430+ done_lh(lh);
34431+ kfree(hint);
34432+ unlock_page(page);
34433+ reiser4_txn_restart(ctx);
34434+ reiser4_exit_context(ctx);
34435+ return 0;
34436+ }
34437+
34438+ coord = &hint->ext_coord.coord;
34439+ result = zload(coord->node);
34440+ if (result) {
34441+ done_lh(lh);
34442+ kfree(hint);
34443+ unlock_page(page);
34444+ reiser4_txn_restart(ctx);
34445+ reiser4_exit_context(ctx);
34446+ return result;
34447+ }
34448+
34449+ validate_extended_coord(&hint->ext_coord, page_offset(page));
34450+
34451+ if (!coord_is_existing_unit(coord)) {
34452+ /* this indicates corruption */
34453+ warning("vs-280",
34454+ "Looking for page %lu of file %llu (size %lli). "
34455+ "No file items found (%d). File is corrupted?\n",
34456+ page->index, (unsigned long long)get_inode_oid(inode),
34457+ inode->i_size, result);
34458+ zrelse(coord->node);
34459+ done_lh(lh);
34460+ kfree(hint);
34461+ unlock_page(page);
34462+ reiser4_txn_restart(ctx);
34463+ reiser4_exit_context(ctx);
34464+ return RETERR(-EIO);
34465+ }
34466+
34467+ /*
34468+ * get the plugin of the found item; it is expected to provide
34469+ * a readpage method
34470+ */
34471+ iplug = item_plugin_by_coord(coord);
34472+ if (iplug->s.file.readpage)
34473+ result = iplug->s.file.readpage(coord, page);
34474+ else
34475+ result = RETERR(-EINVAL);
34476+
34477+ if (!result) {
34478+ set_key_offset(&key,
34479+ (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
34480+ /* FIXME should call reiser4_set_hint() */
34481+ reiser4_unset_hint(hint);
34482+ } else {
34483+ unlock_page(page);
34484+ reiser4_unset_hint(hint);
34485+ }
34486+ assert("vs-979",
34487+ ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
34488+ assert("vs-9791", ergo(result != 0, !PageLocked(page)));
34489+
34490+ zrelse(coord->node);
34491+ done_lh(lh);
34492+
34493+ save_file_hint(file, hint);
34494+ kfree(hint);
34495+
34496+ /*
34497+ * FIXME: explain why this is needed. HINT: page allocation in write
34498+ * cannot be done when the atom is not NULL, because reiser4_writepage
34499+ * cannot kick entd and has to eflush
34500+ */
34501+ reiser4_txn_restart(ctx);
34502+ reiser4_exit_context(ctx);
34503+ return result;
34504+}
34505+
34506+struct uf_readpages_context {
34507+ lock_handle lh;
34508+ coord_t coord;
34509+};
34510+
34511+/* A callback function for readpages_unix_file/read_cache_pages.
34512+ * If the file is built of tails, then return an error (-EIO).
34513+ *
34514+ * @data -- a pointer to a struct uf_readpages_context object,
34515+ * to save the twig lock and the coord between
34516+ * read_cache_page iterations.
34517+ * @page -- page to start read.
34518+ */
34519+static int uf_readpages_filler(void * data, struct page * page)
34520+{
34521+ struct uf_readpages_context *rc = data;
34522+ jnode * node;
34523+ int ret = 0;
34524+ reiser4_extent *ext;
34525+ __u64 ext_index;
34526+ int cbk_done = 0;
34527+ struct address_space * mapping = page->mapping;
34528+
34529+ if (PageUptodate(page)) {
34530+ unlock_page(page);
34531+ return 0;
34532+ }
34533+ page_cache_get(page);
34534+
34535+ if (rc->lh.node == 0) {
34536+ /* no twig lock - have to do tree search. */
34537+ reiser4_key key;
34538+ repeat:
34539+ unlock_page(page);
34540+ key_by_inode_and_offset_common(
34541+ mapping->host, page_offset(page), &key);
34542+ ret = coord_by_key(
34543+ &get_super_private(mapping->host->i_sb)->tree,
34544+ &key, &rc->coord, &rc->lh,
34545+ ZNODE_READ_LOCK, FIND_EXACT,
34546+ TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL);
34547+ if (unlikely(ret))
34548+ goto exit;
34549+ lock_page(page);
34550+ if (PageUptodate(page))
34551+ goto unlock;
34552+ cbk_done = 1;
34553+ }
34554+ ret = zload(rc->coord.node);
34555+ if (unlikely(ret))
34556+ goto unlock;
34557+ if (!coord_is_existing_item(&rc->coord) ||
34558+ !item_is_extent(&rc->coord)) {
34559+ zrelse(rc->coord.node);
34560+ ret = RETERR(-EIO);
34561+ goto unlock;
34562+ }
34563+ ext = extent_by_coord(&rc->coord);
34564+ ext_index = extent_unit_index(&rc->coord);
34565+ if (page->index < ext_index ||
34566+ page->index >= ext_index + extent_get_width(ext)) {
34567+ /* the page index doesn't belong to the extent unit
34568+ which the coord points to - release the lock and
34569+ repeat with tree search. */
34570+ zrelse(rc->coord.node);
34571+ done_lh(&rc->lh);
34572+ /* we can be here after a CBK call only in case of
34573+ corruption of the tree or a bug in the tree lookup algorithm. */
34574+ if (unlikely(cbk_done)) {
34575+ ret = RETERR(-EIO);
34576+ goto unlock;
34577+ }
34578+ goto repeat;
34579+ }
34580+ node = jnode_of_page(page);
34581+ if (unlikely(IS_ERR(node))) {
34582+ zrelse(rc->coord.node);
34583+ ret = PTR_ERR(node);
34584+ goto unlock;
34585+ }
34586+ ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page);
34587+ jput(node);
34588+ zrelse(rc->coord.node);
34589+ if (likely(!ret))
34590+ goto exit;
34591+ unlock:
34592+ unlock_page(page);
34593+ exit:
34594+ page_cache_release(page);
34595+ return ret;
34596+}
34597+
34598+/**
34599+ * readpages_unix_file - called by the readahead code, starts reading for each
34600+ * page of the given list of pages
34601+ */
34602+int readpages_unix_file(
34603+ struct file *file, struct address_space *mapping,
34604+ struct list_head *pages, unsigned nr_pages)
34605+{
34606+ reiser4_context *ctx;
34607+ struct uf_readpages_context rc;
34608+ int ret;
34609+
34610+ ctx = reiser4_init_context(mapping->host->i_sb);
34611+ if (IS_ERR(ctx)) {
34612+ put_pages_list(pages);
34613+ return PTR_ERR(ctx);
34614+ }
34615+ init_lh(&rc.lh);
34616+ ret = read_cache_pages(mapping, pages, uf_readpages_filler, &rc);
34617+ done_lh(&rc.lh);
34618+ context_set_commit_async(ctx);
34619+ /* close the transaction to protect further page allocation from deadlocks */
34620+ reiser4_txn_restart(ctx);
34621+ reiser4_exit_context(ctx);
34622+ return ret;
34623+}
34624+
34625+static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
34626+ loff_t count UNUSED_ARG)
34627+{
34628+ /* We should reserve one block for the update of the stat data
34629+ item */
34630+ assert("vs-1249",
34631+ inode_file_plugin(inode)->estimate.update ==
34632+ estimate_update_common);
34633+ return estimate_update_common(inode);
34634+}
34635+
34636+/* this is called with nonexclusive access obtained; the file's container cannot change */
34637+static ssize_t read_file(hint_t *hint, struct file *file, /* file to read from */
34638+ char __user *buf, /* address of user-space buffer */
34639+ size_t count, /* number of bytes to read */
34640+ loff_t *off)
34641+{
34642+ int result;
34643+ struct inode *inode;
34644+ flow_t flow;
34645+ int (*read_f) (struct file *, flow_t *, hint_t *);
34646+ coord_t *coord;
34647+ znode *loaded;
34648+
34649+ inode = file->f_dentry->d_inode;
34650+
34651+ /* build flow */
34652+ assert("vs-1250",
34653+ inode_file_plugin(inode)->flow_by_inode ==
34654+ flow_by_inode_unix_file);
34655+ result =
34656+ flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
34657+ *off, READ_OP, &flow);
34658+ if (unlikely(result))
34659+ return result;
34660+
34661+ /* get seal and coord sealed with it from reiser4 private data
34662+ of struct file. The coord will tell us where our last read
34663+ of this file finished, and the seal will help to determine
34664+ if that location is still valid.
34665+ */
34666+ coord = &hint->ext_coord.coord;
34667+ while (flow.length && result == 0) {
34668+ result =
34669+ find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
34670+ if (cbk_errored(result))
34671+ /* error happened */
34672+ break;
34673+
34674+ if (coord->between != AT_UNIT) {
34675+ /* there were no items corresponding to given offset */
34676+ done_lh(hint->ext_coord.lh);
34677+ break;
34678+ }
34679+
34680+ loaded = coord->node;
34681+ result = zload(loaded);
34682+ if (unlikely(result)) {
34683+ done_lh(hint->ext_coord.lh);
34684+ break;
34685+ }
34686+
34687+ if (hint->ext_coord.valid == 0)
34688+ validate_extended_coord(&hint->ext_coord,
34689+ get_key_offset(&flow.key));
34690+
34691+ assert("vs-4", hint->ext_coord.valid == 1);
34692+ assert("vs-33", hint->ext_coord.lh == &hint->lh);
34693+ /* call item's read method */
34694+ read_f = item_plugin_by_coord(coord)->s.file.read;
34695+ result = read_f(file, &flow, hint);
34696+ zrelse(loaded);
34697+ done_lh(hint->ext_coord.lh);
34698+ }
34699+
34700+ return (count - flow.length) ? (count - flow.length) : result;
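+ /*
+ * if anything was copied, return the number of bytes read so far;
+ * otherwise propagate the last result (0 or a negative error)
+ */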
34701+}
34702+
34703+static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*);
34704+
34705+/**
34706+ * read_unix_file - read of struct file_operations
34707+ * @file: file to read from
34708+ * @buf: address of user-space buffer
34709+ * @read_amount: number of bytes to read
34710+ * @off: position in file to read from
34711+ *
34712+ * This is the implementation of the vfs read method of struct file_operations
34713+ * for the unix file plugin.
34714+ */
34715+ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
34716+ loff_t *off)
34717+{
34718+ reiser4_context *ctx;
34719+ ssize_t result;
34720+ struct inode *inode;
34721+ struct unix_file_info *uf_info;
34722+
34723+ if (unlikely(read_amount == 0))
34724+ return 0;
34725+
34726+ assert("umka-072", file != NULL);
34727+ assert("umka-074", off != NULL);
34728+ inode = file->f_dentry->d_inode;
34729+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
34730+
34731+ ctx = reiser4_init_context(inode->i_sb);
34732+ if (IS_ERR(ctx))
34733+ return PTR_ERR(ctx);
34734+ uf_info = unix_file_inode_data(inode);
34735+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
34736+ get_exclusive_access(uf_info);
34737+ result = find_file_state(inode, uf_info);
34738+ if (unlikely(result != 0))
34739+ goto out;
34740+ } else
34741+ get_nonexclusive_access(uf_info);
34742+ result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount),
34743+ BA_CAN_COMMIT);
34744+ if (unlikely(result != 0))
34745+ goto out;
34746+ if (uf_info->container == UF_CONTAINER_EXTENTS){
34747+ result = do_sync_read(file, buf, read_amount, off);
34748+ } else if (uf_info->container == UF_CONTAINER_TAILS ||
34749+ reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) ||
34750+ reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
34751+ result = read_unix_file_container_tails(file, buf, read_amount, off);
34752+ } else {
34753+ assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY);
34754+ result = 0;
34755+ }
34756+out:
34757+ drop_access(uf_info);
34758+ context_set_commit_async(ctx);
34759+ reiser4_exit_context(ctx);
34760+ return result;
34761+}
34762+
34763+static ssize_t read_unix_file_container_tails(
34764+ struct file *file, char __user *buf, size_t read_amount, loff_t *off)
34765+{
34766+ int result;
34767+ struct inode *inode;
34768+ hint_t *hint;
34769+ struct unix_file_info *uf_info;
34770+ size_t count, read, left;
34771+ loff_t size;
34772+
34773+ assert("umka-072", file != NULL);
34774+ assert("umka-074", off != NULL);
34775+ inode = file->f_dentry->d_inode;
34776+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
34777+
34778+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34779+ if (hint == NULL)
34780+ return RETERR(-ENOMEM);
34781+
34782+ result = load_file_hint(file, hint);
34783+ if (result) {
34784+ kfree(hint);
34785+ return result;
34786+ }
34787+
34788+ left = read_amount;
34789+ count = 0;
34790+ uf_info = unix_file_inode_data(inode);
34791+ while (left > 0) {
34792+ reiser4_txn_restart_current();
34793+ size = i_size_read(inode);
34794+ if (*off >= size)
34795+ /* position to read from is past the end of file */
34796+ break;
34797+ if (*off + left > size)
34798+ left = size - *off;
34799+ /* fault in the user page */
34800+ result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left);
34801+ if (result)
34802+ return RETERR(-EFAULT);
34803+
34804+ read = read_file(hint, file, buf,
34805+ left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
34806+ off);
34807+ if (read < 0) {
34808+ result = read;
34809+ break;
34810+ }
34811+ left -= read;
34812+ buf += read;
34813+
34814+ /* update position in a file */
34815+ *off += read;
34816+ /* total number of read bytes */
34817+ count += read;
34818+ }
34819+ done_lh(&hint->lh);
34820+ save_file_hint(file, hint);
34821+ kfree(hint);
34822+ if (count)
34823+ file_accessed(file);
34824+ /* return the number of bytes read, or an error code if nothing was read */
34825+ return count ? count : result;
34826+}
34827+
34828+/* This function takes care of @file's pages. First of all it checks whether
34829+ the filesystem is readonly and if so bails out. Otherwise, it throws out all
34830+ pages of the file if it was mapped for read, is going to be mapped for
34831+ write, and consists of tails. This is done in order not to keep several
34832+ copies of the data (one in the page cache and another in the tails
34833+ themselves) when mapping files built of tails.
34834+
34835+ Tail2extent conversion is also performed here if it is allowed and the file
34836+ is going to be written or mapped for write. This function may be called
34837+ from write_unix_file() or mmap_unix_file(). */
34838+static int check_pages_unix_file(struct file *file, struct inode *inode)
34839+{
34840+ reiser4_invalidate_pages(inode->i_mapping, 0,
34841+ (inode->i_size + PAGE_CACHE_SIZE -
34842+ 1) >> PAGE_CACHE_SHIFT, 0);
34843+ return unpack(file, inode, 0 /* not forever */ );
34844+}
34845+
34846+/**
34847+ * mmap_unix_file - mmap of struct file_operations
34848+ * @file: file to mmap
34849+ * @vma:
34850+ *
34851+ * This is the implementation of the vfs mmap method of struct file_operations
34852+ * for the unix file plugin. It converts the file to extents if necessary and
34853+ * sets reiser4_inode's REISER4_HAS_MMAP flag.
34854+ */
34855+int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
34856+{
34857+ reiser4_context *ctx;
34858+ int result;
34859+ struct inode *inode;
34860+ struct unix_file_info *uf_info;
34861+ reiser4_block_nr needed;
34862+
34863+ inode = file->f_dentry->d_inode;
34864+ ctx = reiser4_init_context(inode->i_sb);
34865+ if (IS_ERR(ctx))
34866+ return PTR_ERR(ctx);
34867+
34868+ uf_info = unix_file_inode_data(inode);
34869+
34870+ get_exclusive_access_careful(uf_info, inode);
34871+
34872+ if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
34873+ /*
34874+ * we need file built of extent items. If it is still built of
34875+ * tail items we have to convert it. Find what items the file
34876+ * is built of
34877+ */
34878+ result = find_file_state(inode, uf_info);
34879+ if (result != 0) {
34880+ drop_exclusive_access(uf_info);
34881+ reiser4_exit_context(ctx);
34882+ return result;
34883+ }
34884+
34885+ assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
34886+ uf_info->container == UF_CONTAINER_EXTENTS ||
34887+ uf_info->container == UF_CONTAINER_EMPTY));
34888+ if (uf_info->container == UF_CONTAINER_TAILS) {
34889+ /*
34890+ * invalidate all pages and convert file from tails to
34891+ * extents
34892+ */
34893+ result = check_pages_unix_file(file, inode);
34894+ if (result) {
34895+ drop_exclusive_access(uf_info);
34896+ reiser4_exit_context(ctx);
34897+ return result;
34898+ }
34899+ }
34900+ }
34901+
34902+ /*
34903+ * generic_file_mmap will do update_atime. Grab space for stat data
34904+ * update.
34905+ */
34906+ needed = inode_file_plugin(inode)->estimate.update(inode);
34907+ result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
34908+ if (result) {
34909+ drop_exclusive_access(uf_info);
34910+ reiser4_exit_context(ctx);
34911+ return result;
34912+ }
34913+
34914+ result = generic_file_mmap(file, vma);
34915+ if (result == 0) {
34916+ /* mark file as having mapping. */
34917+ reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
34918+ }
34919+
34920+ drop_exclusive_access(uf_info);
34921+ reiser4_exit_context(ctx);
34922+ return result;
34923+}
34924+
34925+/**
34926+ * find_first_item
34927+ * @inode:
34928+ *
34929+ * Finds file item which is responsible for first byte in the file.
34930+ */
34931+static int find_first_item(struct inode *inode)
34932+{
34933+ coord_t coord;
34934+ lock_handle lh;
34935+ reiser4_key key;
34936+ int result;
34937+
34938+ coord_init_zero(&coord);
34939+ init_lh(&lh);
34940+ inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
34941+ result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
34942+ inode);
34943+ if (result == CBK_COORD_FOUND) {
34944+ if (coord.between == AT_UNIT) {
34945+ result = zload(coord.node);
34946+ if (result == 0) {
34947+ result = item_id_by_coord(&coord);
34948+ zrelse(coord.node);
34949+ if (result != EXTENT_POINTER_ID &&
34950+ result != FORMATTING_ID)
34951+ result = RETERR(-EIO);
34952+ }
34953+ } else
34954+ result = RETERR(-EIO);
34955+ }
34956+ done_lh(&lh);
34957+ return result;
34958+}
34959+
34960+/**
34961+ * open_unix_file
34962+ * @inode:
34963+ * @file:
34964+ *
34965+ * If the filesystem is not readonly, complete an uncompleted tail conversion
34966+ * if there was one
34967+ */
34968+int open_unix_file(struct inode *inode, struct file *file)
34969+{
34970+ int result;
34971+ reiser4_context *ctx;
34972+ struct unix_file_info *uf_info;
34973+
34974+ if (IS_RDONLY(inode))
34975+ return 0;
34976+
34977+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))
34978+ return 0;
34979+
34980+ ctx = reiser4_init_context(inode->i_sb);
34981+ if (IS_ERR(ctx))
34982+ return PTR_ERR(ctx);
34983+
34984+ uf_info = unix_file_inode_data(inode);
34985+
34986+ get_exclusive_access_careful(uf_info, inode);
34987+
34988+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
34989+ /*
34990+ * other process completed the conversion
34991+ */
34992+ drop_exclusive_access(uf_info);
34993+ reiser4_exit_context(ctx);
34994+ return 0;
34995+ }
34996+
34997+ /*
34998+ * the file was left in a semi-converted state after an unclean shutdown,
34999+ * or another thread is doing the conversion and dropped exclusive access
35000+ * while balancing dirty pages. Complete the conversion
35001+ */
35002+ result = find_first_item(inode);
35003+ if (result == EXTENT_POINTER_ID)
35004+ /*
35005+ * first item is extent, therefore there was incomplete
35006+ * tail2extent conversion. Complete it
35007+ */
35008+ result = tail2extent(unix_file_inode_data(inode));
35009+ else if (result == FORMATTING_ID)
35010+ /*
35011+ * first item is formatting item, therefore there was
35012+ * incomplete extent2tail conversion. Complete it
35013+ */
35014+ result = extent2tail(file, unix_file_inode_data(inode));
35015+ else
35016+ result = -EIO;
35017+
35018+ assert("vs-1712",
35019+ ergo(result == 0,
35020+ (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) &&
35021+ !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))));
35022+ drop_exclusive_access(uf_info);
35023+ reiser4_exit_context(ctx);
35024+ return result;
35025+}
35026+
35027+#define NEITHER_OBTAINED 0
35028+#define EA_OBTAINED 1
35029+#define NEA_OBTAINED 2
35030+
35031+static void drop_access(struct unix_file_info *uf_info)
35032+{
35033+ if (uf_info->exclusive_use)
35034+ drop_exclusive_access(uf_info);
35035+ else
35036+ drop_nonexclusive_access(uf_info);
35037+}
35038+
35039+#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
35040+ __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
35041+
35042+/**
35043+ * write_unix_file - private ->write() method of unix_file plugin.
35044+ *
35045+ * @file: file to write to
35046+ * @buf: address of user-space buffer
35047+ * @count: number of bytes to write
35048+ * @pos: position in file to write to
35049+ * @cont: unused argument, as we don't perform plugin conversion when being
35050+ * managed by unix_file plugin.
35051+ */
35052+ssize_t write_unix_file(struct file *file, const char __user *buf,
35053+ size_t count, loff_t *pos, struct psched_context *cont)
35054+{
35055+ int result;
35056+ reiser4_context *ctx;
35057+ struct inode *inode;
35058+ struct unix_file_info *uf_info;
35059+ ssize_t written;
35060+ int try_free_space;
35061+ int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
35062+ size_t left;
35063+ ssize_t (*write_op)(struct file *, struct inode *,
35064+ const char __user *, size_t,
35065+ loff_t *pos);
35066+ int ea;
35067+ loff_t new_size;
35068+
35069+ ctx = get_current_context();
35070+ inode = file->f_dentry->d_inode;
35071+
35072+ assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
35073+ assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)));
35074+
35075+ /* check amount of bytes to write and writing position */
35076+ result = generic_write_checks(file, pos, &count, 0);
35077+ if (result) {
35078+ context_set_commit_async(ctx);
35079+ return result;
35080+ }
35081+
35082+ result = remove_suid(file->f_dentry);
35083+ if (result) {
35084+ context_set_commit_async(ctx);
35085+ return result;
35086+ }
35087+ /* remove_suid might create a transaction */
35088+ reiser4_txn_restart(ctx);
35089+
35090+ uf_info = unix_file_inode_data(inode);
35091+
35092+ current->backing_dev_info = inode->i_mapping->backing_dev_info;
35093+ written = 0;
35094+ try_free_space = 0;
35095+ left = count;
35096+ ea = NEITHER_OBTAINED;
35097+
35098+ new_size = i_size_read(inode);
35099+ if (*pos + count > new_size)
35100+ new_size = *pos + count;
35101+
35102+ while (left) {
35103+ if (left < to_write)
35104+ to_write = left;
35105+
35106+ if (uf_info->container == UF_CONTAINER_EMPTY) {
35107+ get_exclusive_access(uf_info);
35108+ ea = EA_OBTAINED;
35109+ if (uf_info->container != UF_CONTAINER_EMPTY) {
35110+ /* file is made not empty by another process */
35111+ drop_exclusive_access(uf_info);
35112+ ea = NEITHER_OBTAINED;
35113+ continue;
35114+ }
35115+ } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35116+ /*
35117+ * get exclusive access directly so as not to have to
35118+ * re-obtain it if the file turns out to be empty
35119+ */
35120+ get_exclusive_access(uf_info);
35121+ ea = EA_OBTAINED;
35122+ result = find_file_state(inode, uf_info);
35123+ if (result) {
35124+ drop_exclusive_access(uf_info);
35125+ ea = NEITHER_OBTAINED;
35126+ break;
35127+ }
35128+ } else {
35129+ get_nonexclusive_access(uf_info);
35130+ ea = NEA_OBTAINED;
35131+ }
35132+
35133+ /* either EA or NEA is obtained. Choose item write method */
35134+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
35135+ /* file is built of extent items */
35136+ write_op = reiser4_write_extent;
35137+ } else if (uf_info->container == UF_CONTAINER_EMPTY) {
35138+ /* file is empty */
35139+ if (should_have_notail(uf_info, new_size))
35140+ write_op = reiser4_write_extent;
35141+ else
35142+ write_op = reiser4_write_tail;
35143+ } else {
35144+ /* file is built of tail items */
35145+ if (should_have_notail(uf_info, new_size)) {
35146+ if (ea == NEA_OBTAINED) {
35147+ drop_nonexclusive_access(uf_info);
35148+ get_exclusive_access(uf_info);
35149+ ea = EA_OBTAINED;
35150+ }
35151+ if (uf_info->container == UF_CONTAINER_TAILS) {
35152+ /*
35153+ * if the file is being converted by another
35154+ * process - wait until it completes
35155+ */
35156+ while (1) {
35157+ if (reiser4_inode_get_flag(inode,
35158+ REISER4_PART_IN_CONV)) {
35159+ drop_exclusive_access(uf_info);
35160+ schedule();
35161+ get_exclusive_access(uf_info);
35162+ continue;
35163+ }
35164+ break;
35165+ }
35166+ if (uf_info->container == UF_CONTAINER_TAILS) {
35167+ result = tail2extent(uf_info);
35168+ if (result)
35169+ break;
35170+ }
35171+ }
35172+ drop_exclusive_access(uf_info);
35173+ ea = NEITHER_OBTAINED;
35174+ continue;
35175+ }
35176+ write_op = reiser4_write_tail;
35177+ }
35178+
35179+ written = write_op(file, inode, buf, to_write, pos);
35180+ if (written == -ENOSPC && try_free_space) {
35181+ drop_access(uf_info);
35182+ txnmgr_force_commit_all(inode->i_sb, 0);
35183+ try_free_space = 0;
35184+ continue;
35185+ }
35186+ if (written < 0) {
35187+ drop_access(uf_info);
35188+ result = written;
35189+ break;
35190+ }
35191+ /* something is written. */
35192+ if (uf_info->container == UF_CONTAINER_EMPTY) {
35193+ assert("edward-1553", ea == EA_OBTAINED);
35194+ uf_info->container =
35195+ (write_op == reiser4_write_extent) ?
35196+ UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
35197+ } else {
35198+ assert("edward-1554", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
35199+ write_op == reiser4_write_extent));
35200+ assert("edward-1555", ergo(uf_info->container == UF_CONTAINER_TAILS,
35201+ write_op == reiser4_write_tail));
35202+ }
35203+ if (*pos + written > inode->i_size)
35204+ INODE_SET_FIELD(inode, i_size, *pos + written);
35205+ file_update_time(file);
35206+ result = reiser4_update_sd(inode);
35207+ if (result) {
35208+ current->backing_dev_info = NULL;
35209+ drop_access(uf_info);
35210+ context_set_commit_async(ctx);
35211+ return result;
35212+ }
35213+ drop_access(uf_info);
35214+ ea = NEITHER_OBTAINED;
35215+ reiser4_txn_restart(ctx);
35216+ current->journal_info = NULL;
35217+ /*
35218+ * tell the VM how many pages were dirtied. Arguably, pages
35219+ * which were dirty already should not be counted
35220+ */
35221+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
35222+ (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
35223+ current->journal_info = ctx;
35224+
35225+ left -= written;
35226+ buf += written;
35227+ *pos += written;
35228+ }
35229+ if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
35230+ reiser4_txn_restart_current();
35231+ grab_space_enable();
35232+ result = reiser4_sync_file_common(file, file->f_dentry,
35233+ 0 /* data and stat data */);
35234+ if (result)
35235+ warning("reiser4-7", "failed to sync file %llu",
35236+ (unsigned long long)get_inode_oid(inode));
35237+ }
35238+
35239+ current->backing_dev_info = NULL;
35240+
35241+ /*
35242+ * return the number of bytes written, or an error code if nothing
35243+ * was written. Note that this does not work correctly when
35244+ * sync_unix_file returns an error
35245+ */
35246+ return (count - left) ? (count - left) : result;
35247+}
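
/*
 * An illustrative user-space sketch (not from the reiser4 sources) of the
 * chunking loop in write_unix_file() above: at most one chunk
 * (WRITE_GRANULARITY pages there) is written per iteration, buf and left
 * advance by the amount actually written, and the final return value is
 * "bytes written, or the error if nothing was written". CHUNK and the
 * plain write(2) call are stand-ins for the reiser4 internals.
 */
#include <unistd.h>
#include <sys/types.h>

#define CHUNK (32 * 4096)	/* cf. WRITE_GRANULARITY * PAGE_CACHE_SIZE */

static ssize_t write_in_chunks(int fd, const char *buf, size_t count)
{
	size_t left = count;
	ssize_t written = 0;

	while (left) {
		size_t to_write = left < CHUNK ? left : CHUNK;

		written = write(fd, buf, to_write);
		if (written < 0)
			break;	/* report the error only if nothing was written */
		left -= written;
		buf += written;
	}
	return (count - left) ? (ssize_t)(count - left) : written;
}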
35248+
35249+/**
35250+ * release_unix_file - release of struct file_operations
35251+ * @inode: inode of released file
35252+ * @file: file to release
35253+ *
35254+ * Implementation of the release method of struct file_operations for unix
35255+ * file plugin. If the last reference to the inode is released, convert all
35256+ * extent items into tail items if necessary. Frees reiser4-specific file data.
35257+ */
35258+int release_unix_file(struct inode *inode, struct file *file)
35259+{
35260+ reiser4_context *ctx;
35261+ struct unix_file_info *uf_info;
35262+ int result;
35263+ int in_reiser4;
35264+
35265+ in_reiser4 = is_in_reiser4_context();
35266+
35267+ ctx = reiser4_init_context(inode->i_sb);
35268+ if (IS_ERR(ctx))
35269+ return PTR_ERR(ctx);
35270+
35271+ result = 0;
35272+ if (in_reiser4 == 0) {
35273+ uf_info = unix_file_inode_data(inode);
35274+
35275+ get_exclusive_access_careful(uf_info, inode);
35276+ if (atomic_read(&file->f_dentry->d_count) == 1 &&
35277+ uf_info->container == UF_CONTAINER_EXTENTS &&
35278+ !should_have_notail(uf_info, inode->i_size) &&
35279+ !rofs_inode(inode)) {
35280+ result = extent2tail(file, uf_info);
35281+ if (result != 0) {
35282+ warning("nikita-3233",
35283+ "Failed (%d) to convert in %s (%llu)",
35284+ result, __FUNCTION__,
35285+ (unsigned long long)
35286+ get_inode_oid(inode));
35287+ }
35288+ }
35289+ drop_exclusive_access(uf_info);
35290+ } else {
35291+ /*
35292+ we are within reiser4 context already. How is that
35293+ possible? Simple:
35294+
35295+ (gdb) bt
35296+ #0 get_exclusive_access ()
35297+ #2 0xc01e56d3 in release_unix_file ()
35298+ #3 0xc01c3643 in reiser4_release ()
35299+ #4 0xc014cae0 in __fput ()
35300+ #5 0xc013ffc3 in remove_vm_struct ()
35301+ #6 0xc0141786 in exit_mmap ()
35302+ #7 0xc0118480 in mmput ()
35303+ #8 0xc0133205 in oom_kill ()
35304+ #9 0xc01332d1 in out_of_memory ()
35305+ #10 0xc013bc1d in try_to_free_pages ()
35306+ #11 0xc013427b in __alloc_pages ()
35307+ #12 0xc013f058 in do_anonymous_page ()
35308+ #13 0xc013f19d in do_no_page ()
35309+ #14 0xc013f60e in handle_mm_fault ()
35310+ #15 0xc01131e5 in do_page_fault ()
35311+ #16 0xc0104935 in error_code ()
35312+ #17 0xc025c0c6 in __copy_to_user_ll ()
35313+ #18 0xc01d496f in reiser4_read_tail ()
35314+ #19 0xc01e4def in read_unix_file ()
35315+ #20 0xc01c3504 in reiser4_read ()
35316+ #21 0xc014bd4f in vfs_read ()
35317+ #22 0xc014bf66 in sys_read ()
35318+ */
35319+ warning("vs-44", "out of memory?");
35320+ }
35321+
35322+ reiser4_free_file_fsdata(file);
35323+
35324+ reiser4_exit_context(ctx);
35325+ return result;
35326+}
35327+
35328+static void set_file_notail(struct inode *inode)
35329+{
35330+ reiser4_inode *state;
35331+ formatting_plugin *tplug;
35332+
35333+ state = reiser4_inode_data(inode);
35334+ tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
35335+ force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
35336+}
35337+
35338+/* if file is built of tails - convert it to extents */
35339+static int unpack(struct file *filp, struct inode *inode, int forever)
35340+{
35341+ int result = 0;
35342+ struct unix_file_info *uf_info;
35343+
35344+ uf_info = unix_file_inode_data(inode);
35345+ assert("vs-1628", ea_obtained(uf_info));
35346+
35347+ result = find_file_state(inode, uf_info);
35348+ if (result)
35349+ return result;
35350+ assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
35351+
35352+ if (uf_info->container == UF_CONTAINER_TAILS) {
35353+ /*
35354+ * if the file is being converted by another process - wait until it
35355+ * completes
35356+ */
35357+ while (1) {
35358+ if (reiser4_inode_get_flag(inode,
35359+ REISER4_PART_IN_CONV)) {
35360+ drop_exclusive_access(uf_info);
35361+ schedule();
35362+ get_exclusive_access(uf_info);
35363+ continue;
35364+ }
35365+ break;
35366+ }
35367+ if (uf_info->container == UF_CONTAINER_TAILS) {
35368+ result = tail2extent(uf_info);
35369+ if (result)
35370+ return result;
35371+ }
35372+ }
35373+ if (forever) {
35374+ /* save the new formatting plugin in stat data */
35375+ __u64 tograb;
35376+
35377+ set_file_notail(inode);
35378+
35379+ grab_space_enable();
35380+ tograb = inode_file_plugin(inode)->estimate.update(inode);
35381+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
35382+ result = reiser4_update_sd(inode);
35383+ }
35384+
35385+ return result;
35386+}
35387+
35388+/* implementation of vfs' ioctl method of struct file_operations for unix file
35389+ plugin
35390+*/
35391+int
35392+ioctl_unix_file(struct inode *inode, struct file *filp,
35393+ unsigned int cmd, unsigned long arg UNUSED_ARG)
35394+{
35395+ reiser4_context *ctx;
35396+ int result;
35397+
35398+ ctx = reiser4_init_context(inode->i_sb);
35399+ if (IS_ERR(ctx))
35400+ return PTR_ERR(ctx);
35401+
35402+ switch (cmd) {
35403+ case REISER4_IOC_UNPACK:
35404+ get_exclusive_access(unix_file_inode_data(inode));
35405+ result = unpack(filp, inode, 1 /* forever */ );
35406+ drop_exclusive_access(unix_file_inode_data(inode));
35407+ break;
35408+
35409+ default:
35410+ result = RETERR(-ENOSYS);
35411+ break;
35412+ }
35413+ reiser4_exit_context(ctx);
35414+ return result;
35415+}
35416+
35417+/* implementation of vfs' bmap method of struct address_space_operations for unix
35418+ file plugin
35419+*/
35420+sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
35421+{
35422+ reiser4_context *ctx;
35423+ sector_t result;
35424+ reiser4_key key;
35425+ coord_t coord;
35426+ lock_handle lh;
35427+ struct inode *inode;
35428+ item_plugin *iplug;
35429+ sector_t block;
35430+
35431+ inode = mapping->host;
35432+
35433+ ctx = reiser4_init_context(inode->i_sb);
35434+ if (IS_ERR(ctx))
35435+ return PTR_ERR(ctx);
35436+ key_by_inode_and_offset_common(inode,
35437+ (loff_t) lblock * current_blocksize,
35438+ &key);
35439+
35440+ init_lh(&lh);
35441+ result =
35442+ find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
35443+ if (cbk_errored(result)) {
35444+ done_lh(&lh);
35445+ reiser4_exit_context(ctx);
35446+ return result;
35447+ }
35448+
35449+ result = zload(coord.node);
35450+ if (result) {
35451+ done_lh(&lh);
35452+ reiser4_exit_context(ctx);
35453+ return result;
35454+ }
35455+
35456+ iplug = item_plugin_by_coord(&coord);
35457+ if (iplug->s.file.get_block) {
35458+ result = iplug->s.file.get_block(&coord, lblock, &block);
35459+ if (result == 0)
35460+ result = block;
35461+ } else
35462+ result = RETERR(-EINVAL);
35463+
35464+ zrelse(coord.node);
35465+ done_lh(&lh);
35466+ reiser4_exit_context(ctx);
35467+ return result;
35468+}
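
/*
 * For illustration: the mapping above is driven by byte offsets. With a
 * 4096-byte block size, bmap_unix_file(mapping, 10) builds the key for
 * file offset 10 * 4096 = 40960, finds the item covering it, and asks
 * the item plugin's get_block() for the physical block number.
 */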
35469+
35470+/**
35471+ * flow_by_inode_unix_file - initialize structure flow
35472+ * @inode: inode of file for which read or write is about to be performed
35473+ * @buf: buffer to perform read to or write from
35474+ * @user: flag showing whether @buf is user space or kernel space
35475+ * @size: size of buffer @buf
35476+ * @off: start offset for read or write
35477+ * @op: READ or WRITE
35478+ * @flow:
35479+ *
35480+ * Initializes fields of @flow: key, size of data, i/o mode (read or write).
35481+ */
35482+int flow_by_inode_unix_file(struct inode *inode,
35483+ const char __user *buf, int user,
35484+ loff_t size, loff_t off,
35485+ rw_op op, flow_t *flow)
35486+{
35487+ assert("nikita-1100", inode != NULL);
35488+
35489+ flow->length = size;
35490+ memcpy(&flow->data, &buf, sizeof(buf));
35491+ flow->user = user;
35492+ flow->op = op;
35493+ assert("nikita-1931", inode_file_plugin(inode) != NULL);
35494+ assert("nikita-1932",
35495+ inode_file_plugin(inode)->key_by_inode ==
35496+ key_by_inode_and_offset_common);
35497+ /* calculate key of write position and insert it into flow->key */
35498+ return key_by_inode_and_offset_common(inode, off, &flow->key);
35499+}
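
/*
 * For illustration: a 100-byte write at offset 4096 yields a flow with
 * length == 100, data pointing at the user buffer, op == WRITE_OP, and
 * key == the key of file offset 4096 as computed by
 * key_by_inode_and_offset_common().
 */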
35500+
35501+/* plugin->u.file.set_plug_in_sd = NULL
35502+ plugin->u.file.set_plug_in_inode = NULL
35503+ plugin->u.file.create_blank_sd = NULL */
35504+/* plugin->u.file.delete */
35505+/*
35506+ plugin->u.file.add_link = reiser4_add_link_common
35507+ plugin->u.file.rem_link = NULL */
35508+
35509+/* plugin->u.file.owns_item
35510+ this is common_file_owns_item with assertion */
35511+/* Audited by: green(2002.06.15) */
35512+int
35513+owns_item_unix_file(const struct inode *inode /* object to check against */ ,
35514+ const coord_t * coord /* coord to check */ )
35515+{
35516+ int result;
35517+
35518+ result = owns_item_common(inode, coord);
35519+ if (!result)
35520+ return 0;
35521+ if (!plugin_of_group(item_plugin_by_coord(coord),
35522+ UNIX_FILE_METADATA_ITEM_TYPE))
35523+ return 0;
35524+ assert("vs-547",
35525+ item_id_by_coord(coord) == EXTENT_POINTER_ID ||
35526+ item_id_by_coord(coord) == FORMATTING_ID);
35527+ return 1;
35528+}
35529+
35530+static int setattr_truncate(struct inode *inode, struct iattr *attr)
35531+{
35532+ int result;
35533+ int s_result;
35534+ loff_t old_size;
35535+ reiser4_tree *tree;
35536+
35537+ inode_check_scale(inode, inode->i_size, attr->ia_size);
35538+
35539+ old_size = inode->i_size;
35540+ tree = reiser4_tree_by_inode(inode);
35541+
35542+ result = safe_link_grab(tree, BA_CAN_COMMIT);
35543+ if (result == 0)
35544+ result = safe_link_add(inode, SAFE_TRUNCATE);
35545+ if (result == 0)
35546+ result = truncate_file_body(inode, attr);
35547+ if (result)
35548+ warning("vs-1588", "truncate_file failed: oid %lli, "
35549+ "old size %lld, new size %lld, retval %d",
35550+ (unsigned long long)get_inode_oid(inode),
35551+ old_size, attr->ia_size, result);
35552+
35553+ s_result = safe_link_grab(tree, BA_CAN_COMMIT);
35554+ if (s_result == 0)
35555+ s_result =
35556+ safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
35557+ if (s_result != 0) {
35558+ warning("nikita-3417", "Cannot kill safelink %lli: %i",
35559+ (unsigned long long)get_inode_oid(inode), s_result);
35560+ }
35561+ safe_link_release(tree);
35562+ return result;
35563+}
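
/*
 * Illustrative note (an editorial reading, not from the reiser4 sources):
 * the SAFE_TRUNCATE safe link above is inserted before the file body is
 * truncated and removed once truncation is complete, so that if the
 * system crashes in between, recovery can find the link and finish the
 * truncate.
 */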
35564+
35565+/* plugin->u.file.setattr method */
35566+/* This calls inode_setattr and if truncate is in effect it also takes
35567+ exclusive inode access to avoid races */
35568+int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */
35569+ struct iattr *attr /* change description */ )
35570+{
35571+ int result;
35572+
35573+ if (attr->ia_valid & ATTR_SIZE) {
35574+ reiser4_context *ctx;
35575+ struct unix_file_info *uf_info;
35576+
35577+ /* truncate does reservation itself and requires exclusive
35578+ access obtained */
35579+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
35580+ if (IS_ERR(ctx))
35581+ return PTR_ERR(ctx);
35582+
35583+ uf_info = unix_file_inode_data(dentry->d_inode);
35584+ get_exclusive_access_careful(uf_info, dentry->d_inode);
35585+ result = setattr_truncate(dentry->d_inode, attr);
35586+ drop_exclusive_access(uf_info);
35587+ context_set_commit_async(ctx);
35588+ reiser4_exit_context(ctx);
35589+ } else
35590+ result = reiser4_setattr_common(dentry, attr);
35591+
35592+ return result;
35593+}
35594+
35595+/* plugin->u.file.init_inode_data */
35596+void
35597+init_inode_data_unix_file(struct inode *inode,
35598+ reiser4_object_create_data * crd, int create)
35599+{
35600+ struct unix_file_info *data;
35601+
35602+ data = unix_file_inode_data(inode);
35603+ data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
35604+ init_rwsem(&data->latch);
35605+ data->tplug = inode_formatting_plugin(inode);
35606+ data->exclusive_use = 0;
35607+
35608+#if REISER4_DEBUG
35609+ data->ea_owner = NULL;
35610+ atomic_set(&data->nr_neas, 0);
35611+#endif
35612+ init_inode_ordering(inode, crd, create);
35613+}
35614+
35615+/**
35616+ * delete_unix_file - delete_object of file_plugin
35617+ * @inode: inode to be deleted
35618+ *
35619+ * Truncates file to length 0, removes stat data and safe link.
35620+ */
35621+int delete_object_unix_file(struct inode *inode)
35622+{
35623+ struct unix_file_info *uf_info;
35624+ int result;
35625+
35626+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
35627+ return 0;
35628+
35629+ /* truncate file body first */
35630+ uf_info = unix_file_inode_data(inode);
35631+ get_exclusive_access(uf_info);
35632+ result = shorten_file(inode, 0 /* size */ );
35633+ drop_exclusive_access(uf_info);
35634+
35635+ if (result)
35636+ warning("edward-1556",
35637+ "failed to truncate file (%llu) on removal: %d",
35638+ get_inode_oid(inode), result);
35639+
35640+ /* remove stat data and safe link */
35641+ return reiser4_delete_object_common(inode);
35642+}
35643+
35644+int
35645+prepare_write_unix_file(struct file *file, struct page *page,
35646+ unsigned from, unsigned to)
35647+{
35648+ reiser4_context *ctx;
35649+ struct unix_file_info *uf_info;
35650+ int ret;
35651+
35652+ ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
35653+ if (IS_ERR(ctx))
35654+ return PTR_ERR(ctx);
35655+
35656+ uf_info = unix_file_inode_data(file->f_dentry->d_inode);
35657+ get_exclusive_access(uf_info);
35658+ ret = find_file_state(file->f_dentry->d_inode, uf_info);
35659+ if (ret == 0) {
35660+ if (uf_info->container == UF_CONTAINER_TAILS)
35661+ ret = -EINVAL;
35662+ else
35663+ ret = do_prepare_write(file, page, from, to);
35664+ }
35665+ drop_exclusive_access(uf_info);
35666+
35667+ /* don't commit transaction under inode semaphore */
35668+ context_set_commit_async(ctx);
35669+ reiser4_exit_context(ctx);
35670+ return ret;
35671+}
35672+
35673+/*
35674+ * Local variables:
35675+ * c-indentation-style: "K&R"
35676+ * mode-name: "LC"
35677+ * c-basic-offset: 8
35678+ * tab-width: 8
35679+ * fill-column: 79
35680+ * scroll-step: 1
35681+ * End:
35682+ */
35683diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/file_conversion.c linux-2.6.24/fs/reiser4/plugin/file/file_conversion.c
35684--- linux-2.6.24.orig/fs/reiser4/plugin/file/file_conversion.c 1970-01-01 03:00:00.000000000 +0300
35685+++ linux-2.6.24/fs/reiser4/plugin/file/file_conversion.c 2008-01-25 11:39:06.988221084 +0300
35686@@ -0,0 +1,689 @@
35687+/* Copyright 2001, 2002, 2003 by Hans Reiser,
35688+ licensing governed by reiser4/README */
35689+
35690+/**
35691+ * This file contains plugin schedule hooks, and plugin conversion methods.
35692+ *
35693+ * Plugin schedule hook makes a decision (at plugin schedule point) about the
35694+ * most reasonable plugins for managing a regular file. Usually such a decision
35695+ * is made by some O(1) heuristic.
35696+ *
35697+ * By default we assign the unix_file plugin id when writing an incompressible
35698+ * file managed by the cryptcompress plugin id. The heuristic currently used to
35699+ * estimate compressibility is very simple: if the first complete logical
35700+ * cluster (64K by default) of a file is incompressible, then we decide that the
35701+ * whole file is incompressible (*).
35702+ *
35703+ * To enable a conversion we install a special "magic" compression mode plugin
35704+ * (CONVX_COMPRESSION_MODE_ID, see plugin/compress/compress_mode.c for details)
35705+ * at file creation time (**).
35706+ *
35707+ * Note that we don't perform the back conversion (unix_file->cryptcompress)
35708+ * for compatibility reasons (see http://dev.namesys.com/Version4.X.Y
35709+ * for details).
35710+ *
35711+ * The conversion is accompanied by rebuilding disk structures of a file, so it
35712+ * is important to protect them from being accessed by other plugins which
35713+ * don't expect them to be in such an inconsistent state. To protect against this
35714+ * we serialize readers and writers of a file's conversion set (FCS).
35715+ *
35716+ * We define FCS as a file plugin installed in inode's pset plus file's data
35717+ * and metadata that this file plugin manipulates with (items, etc).
35718+ * Note, that FCS is defined per file.
35719+ * FCS reader is defined as a set of instructions of the following type:
35720+ * {inode_file_plugin(inode)->method()} (i.e. retrieving the file plugin id
35721+ * together with all the method's instructions should be atomic).
35722+ * FCS writer is a set of instructions that perform file plugin conversion
35723+ * (convert items, update pset, etc).
35724+ * Example:
35725+ * reiser4_write_careful() supplied to VFS as a ->write() file operation is
35726+ * composed of the following (optional) instructions:
35727+ * 1 2 3
35728+ * *********************** ####### -------------------------------------------->
35729+ *
35730+ * 1) "****" are instructions performed on behalf of cryptcompress file plugin;
35731+ * 2) "####" is a FCS writer (performing a conversion cryptcompress->unix_file);
35732+ * 3) "----" are instructions performed on behalf of unix_file plugin;
35733+ * Here (1) and (3) are FCS readers.
35734+ *
35735+ * In this example FCS readers and writers are already serialized (by design),
35736+ * however there can be readers and writers executing at the same time in
35737+ * different contexts, so we need a common mechanism of serialization.
35738+ *
35739+ * Currently serialization of FCS readers and writers is performed via acquiring
35740+ * a special per-inode rw-semaphore (conv_sem). And yes, {down, up}_read is for
35741+ * FCS readers, and {down, up}_write is for FCS writers, see the macros below
35742+ * for passive/active protection.
35743+ *
35744+ * ---
35745+ * (*) This heuristic can be changed to a better one (benchmarking is needed).
35746+ * (**) Such a technique makes it possible to keep the enable/disable state on disk.
35747+ */
35748+
35749+#include "../../inode.h"
35750+#include "../cluster.h"
35751+#include "file.h"
35752+
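/*
 * An illustrative user-space sketch (not from the reiser4 sources) of the
 * FCS serialization described in the header comment above, with a pthread
 * rwlock standing in for the per-inode conv_sem: any number of FCS
 * readers (plugin method calls) may run concurrently, while an FCS
 * writer (plugin conversion) excludes them all. fcs_read_op() and
 * fcs_convert() are hypothetical names.
 */
#include <pthread.h>

static pthread_rwlock_t conv_sem_demo = PTHREAD_RWLOCK_INITIALIZER;

static void fcs_read_op(void)
{
	pthread_rwlock_rdlock(&conv_sem_demo);
	/* inode_file_plugin(inode)->method() would run here */
	pthread_rwlock_unlock(&conv_sem_demo);
}

static void fcs_convert(void)
{
	pthread_rwlock_wrlock(&conv_sem_demo);
	/* convert items and update the pset here */
	pthread_rwlock_unlock(&conv_sem_demo);
}
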
35753+#define conversion_enabled(inode) \
35754+ (inode_compression_mode_plugin(inode) == \
35755+ compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID))
35756+
35757+/**
35758+ * Protected sections (readers and writers of @pset) are not permanently
35759+ * critical: a cryptcompress file can be converted only if the conversion
35760+ * is enabled (see the macro above). Also we don't perform back
35761+ * conversion. The following helper macro is a sanity check to decide
35762+ * if we need the protection (locks are always additional overhead).
35763+ */
35764+#define should_protect(inode) \
35765+ (inode_file_plugin(inode) == \
35766+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \
35767+ conversion_enabled(inode))
35768+/**
35769+ * To avoid confusion with read/write file operations, we'll speak about
35770+ * "passive" protection for FCS readers and "active" protection for FCS
35771+ * writers. All methods with active or passive protection have suffix
35772+ * "careful".
35773+ */
35774+/**
35775+ * Macros for passive protection.
35776+ *
35777+ * Construct invariant operation to be supplied to VFS.
35778+ * The macro accepts the following lexemes:
35779+ * @type - type of the value represented by the compound statement;
35780+ * @method - name of an operation to be supplied to VFS (reiser4 file
35781+ * plugin also should contain a method with such name).
35782+ */
35783+#define PROT_PASSIVE(type, method, args) \
35784+({ \
35785+ type _result; \
35786+ struct rw_semaphore * guard = \
35787+ &reiser4_inode_data(inode)->conv_sem; \
35788+ \
35789+ if (should_protect(inode)) { \
35790+ down_read(guard); \
35791+ if (!should_protect(inode)) \
35792+ up_read(guard); \
35793+ } \
35794+ _result = inode_file_plugin(inode)->method args; \
35795+ if (should_protect(inode)) \
35796+ up_read(guard); \
35797+ _result; \
35798+})
35799+
35800+#define PROT_PASSIVE_VOID(method, args) \
35801+({ \
35802+ struct rw_semaphore * guard = \
35803+ &reiser4_inode_data(inode)->conv_sem; \
35804+ \
35805+ if (should_protect(inode)) { \
35806+ down_read(guard); \
35807+ if (!should_protect(inode)) \
35808+ up_read(guard); \
35809+ } \
35810+ inode_file_plugin(inode)->method args; \
35811+ \
35812+ if (should_protect(inode)) \
35813+ up_read(guard); \
35814+})
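
/*
 * For illustration: with the definitions above,
 * PROT_PASSIVE(int, open, (inode, file)) expands (modulo the statement
 * expression) to
 *
 *	int _result;
 *	struct rw_semaphore *guard = &reiser4_inode_data(inode)->conv_sem;
 *
 *	if (should_protect(inode)) {
 *		down_read(guard);
 *		if (!should_protect(inode))
 *			up_read(guard);
 *	}
 *	_result = inode_file_plugin(inode)->open(inode, file);
 *	if (should_protect(inode))
 *		up_read(guard);
 *
 * i.e. the plugin method runs under conv_sem taken for read whenever
 * the file is still a conversion candidate.
 */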
35815+
35816+/* Pass management to the unix-file plugin with "notail" policy */
35817+static int __cryptcompress2unixfile(struct file *file, struct inode * inode)
35818+{
35819+ int result;
35820+ reiser4_inode *info;
35821+ struct unix_file_info * uf;
35822+ info = reiser4_inode_data(inode);
35823+
35824+ result = aset_set_unsafe(&info->pset,
35825+ PSET_FILE,
35826+ (reiser4_plugin *)
35827+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
35828+ if (result)
35829+ return result;
35830+ result = aset_set_unsafe(&info->pset,
35831+ PSET_FORMATTING,
35832+ (reiser4_plugin *)
35833+ formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID));
35834+ if (result)
35835+ return result;
35836+ /* get rid of non-standard plugins */
35837+ info->plugin_mask &= ~cryptcompress_mask;
35838+ /* get rid of plugin stat-data extension */
35839+ info->extmask &= ~(1 << PLUGIN_STAT);
35840+
35841+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
35842+
35843+ /* FIXME use init_inode_data_unix_file() instead,
35844+ but avoid init_inode_ordering() */
35845+ /* Init unix-file specific part of inode */
35846+ uf = unix_file_inode_data(inode);
35847+ uf->container = UF_CONTAINER_UNKNOWN;
35848+ init_rwsem(&uf->latch);
35849+ uf->tplug = inode_formatting_plugin(inode);
35850+ uf->exclusive_use = 0;
35851+#if REISER4_DEBUG
35852+ uf->ea_owner = NULL;
35853+ atomic_set(&uf->nr_neas, 0);
35854+#endif
35855+ /**
35856+ * we were careful to keep file_ops, inode_ops and as_ops
35857+ * invariant across plugin conversion, so there is
35858+ * no need to update the ones already installed in the
35859+ * vfs's residence.
35860+ */
35861+ return 0;
35862+}
35863+
35864+#if REISER4_DEBUG
35865+static int disabled_conversion_inode_ok(struct inode * inode)
35866+{
35867+ __u64 extmask = reiser4_inode_data(inode)->extmask;
35868+ __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask;
35869+
35870+ return ((extmask & (1 << LIGHT_WEIGHT_STAT)) &&
35871+ (extmask & (1 << UNIX_STAT)) &&
35872+ (extmask & (1 << LARGE_TIMES_STAT)) &&
35873+ (extmask & (1 << PLUGIN_STAT)) &&
35874+ (plugin_mask & (1 << PSET_COMPRESSION_MODE)));
35875+}
35876+#endif
35877+
35878+/**
35879+ * Disable future attempts to schedule/convert file plugin.
35880+ * This function is called by plugin schedule hooks.
35881+ *
35882+ * To disable conversion we assign any compression mode plugin id
35883+ * different from CONVX_COMPRESSION_MODE_ID.
35884+ */
35885+static int disable_conversion(struct inode * inode)
35886+{
35887+ int result;
35888+ result =
35889+ force_plugin_pset(inode,
35890+ PSET_COMPRESSION_MODE,
35891+ (reiser4_plugin *)compression_mode_plugin_by_id
35892+ (LATTD_COMPRESSION_MODE_ID));
35893+ assert("edward-1500",
35894+ ergo(!result, disabled_conversion_inode_ok(inode)));
35895+ return result;
35896+}
35897+
35898+/**
35899+ * Check if we have really reached the plugin scheduling point
35900+ */
35901+static int check_psched_point(struct inode * inode,
35902+ loff_t pos /* position in the
35903+ file to write from */,
35904+ struct cluster_handle * clust,
35905+ struct psched_context * cont)
35906+{
35907+ assert("edward-1505", conversion_enabled(inode));
35908+ /*
35909+ * if the file size is greater than the cluster size, then the
35910+ * compressibility status must already have been figured out (i.e.
35911+ * compression was disabled, or the file plugin was converted to unix_file)
35912+ */
35913+ assert("edward-1506", inode->i_size <= inode_cluster_size(inode));
35914+
35915+ if (pos > inode->i_size)
35916+ /* first logical cluster will contain a (partial) hole */
35917+ return disable_conversion(inode);
35918+ if (pos < inode_cluster_size(inode))
35919+ /* writing to the first logical cluster */
35920+ return 0;
35921+ /*
35922+ * here we have:
35923+ * cluster_size <= pos <= i_size <= cluster_size,
35924+ * and, hence, pos == i_size == cluster_size
35925+ */
35926+ assert("edward-1498",
35927+ pos == inode->i_size &&
35928+ pos == inode_cluster_size(inode));
35929+ assert("edward-1539", cont != NULL);
35930+ assert("edward-1540", cont->state == PSCHED_INVAL_STATE);
35931+
35932+ cont->state = PSCHED_SCHED_POINT;
35933+ return 0;
35934+}
35935+
35936+static void start_check_compressibility(struct inode * inode,
35937+ struct cluster_handle * clust,
35938+ hint_t * hint)
35939+{
35940+ assert("edward-1507", clust->index == 1);
35941+ assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc));
35942+ assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ);
35943+
35944+ hint_init_zero(hint);
35945+ clust->hint = hint;
35946+ clust->index --;
35947+ clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
35948+
35949+ /* first logical cluster (of index #0) must be complete */
35950+ assert("edward-1510", lbytes(clust->index, inode) ==
35951+ inode_cluster_size(inode));
35952+}
35953+
35954+static void finish_check_compressibility(struct inode * inode,
35955+ struct cluster_handle * clust,
35956+ hint_t * hint)
35957+{
35958+ reiser4_unset_hint(clust->hint);
35959+ clust->hint = hint;
35960+ clust->index ++;
35961+}
35962+
35963+#if REISER4_DEBUG
35964+static int prepped_dclust_ok(hint_t * hint)
35965+{
35966+ reiser4_key key;
35967+ coord_t * coord = &hint->ext_coord.coord;
35968+
35969+ item_key_by_coord(coord, &key);
35970+ return (item_id_by_coord(coord) == CTAIL_ID &&
35971+ !coord_is_unprepped_ctail(coord) &&
35972+ (get_key_offset(&key) + nr_units_ctail(coord) ==
35973+ dclust_get_extension_dsize(hint)));
35974+}
35975+#endif
35976+
35977+#define fifty_persent(size) (size >> 1)
35978+/* evaluation of data compressibility */
35979+#define data_is_compressible(osize, isize) \
35980+ (osize < fifty_persent(isize))
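
/*
 * For illustration: with the default 64K logical cluster,
 * fifty_persent(65536) == 32768, so a first cluster that compresses to
 * 30000 bytes is considered compressible (the file stays under
 * cryptcompress), while one that compresses to 40000 bytes is not and
 * schedules conversion to unix_file.
 */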
35981+
35982+/**
35983+ * A simple O(1)-heuristic for compressibility.
35984+ * This is called no more than once during a file's lifetime.
35985+ * Read first logical cluster (of index #0) and estimate its compressibility.
35986+ * Save estimation result in @cont.
35987+ */
35988+static int read_check_compressibility(struct inode * inode,
35989+ struct cluster_handle * clust,
35990+ struct psched_context * cont)
35991+{
35992+ int i;
35993+ int result;
35994+ __u32 dst_len;
35995+ hint_t tmp_hint;
35996+ hint_t * cur_hint = clust->hint;
35997+ assert("edward-1541", cont->state == PSCHED_SCHED_POINT);
35998+
35999+ start_check_compressibility(inode, clust, &tmp_hint);
36000+
36001+ reset_cluster_pgset(clust, cluster_nrpages(inode));
36002+ result = grab_page_cluster(inode, clust, READ_OP);
36003+ if (result)
36004+ return result;
36005+ /* Read page cluster here */
36006+ for (i = 0; i < clust->nr_pages; i++) {
36007+ struct page *page = clust->pages[i];
36008+ lock_page(page);
36009+ result = do_readpage_ctail(inode, clust, page,
36010+ ZNODE_READ_LOCK);
36011+ unlock_page(page);
36012+ if (result)
36013+ goto error;
36014+ }
36015+ tfm_cluster_clr_uptodate(&clust->tc);
36016+
36017+ cluster_set_tfm_act(&clust->tc, TFMA_WRITE);
36018+
36019+ if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) {
36020+ /* length of compressed data is known, no need to compress */
36021+ assert("edward-1511",
36022+ znode_is_any_locked(tmp_hint.lh.node));
36023+ assert("edward-1512",
36024+ WITH_DATA(tmp_hint.ext_coord.coord.node,
36025+ prepped_dclust_ok(&tmp_hint)));
36026+ dst_len = dclust_get_extension_dsize(&tmp_hint);
36027+ }
36028+ else {
36029+ struct tfm_cluster * tc = &clust->tc;
36030+ compression_plugin * cplug = inode_compression_plugin(inode);
36031+ result = grab_tfm_stream(inode, tc, INPUT_STREAM);
36032+ if (result)
36033+ goto error;
36034+ for (i = 0; i < clust->nr_pages; i++) {
36035+ char *data;
36036+ lock_page(clust->pages[i]);
36037+ BUG_ON(!PageUptodate(clust->pages[i]));
36038+ data = kmap(clust->pages[i]);
36039+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
36040+ data, PAGE_CACHE_SIZE);
36041+ kunmap(clust->pages[i]);
36042+ unlock_page(clust->pages[i]);
36043+ }
36044+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
36045+ if (result)
36046+ goto error;
36047+ result = grab_coa(tc, cplug);
36048+ if (result)
36049+ goto error;
36050+ tc->len = tc->lsize = lbytes(clust->index, inode);
36051+ assert("edward-1513", tc->len == inode_cluster_size(inode));
36052+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
36053+ cplug->compress(get_coa(tc, cplug->h.id, tc->act),
36054+ tfm_input_data(clust), tc->len,
36055+ tfm_output_data(clust), &dst_len);
36056+ assert("edward-1514",
36057+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
36058+ }
36059+ finish_check_compressibility(inode, clust, cur_hint);
36060+ cont->state =
36061+ (data_is_compressible(dst_len, inode_cluster_size(inode)) ?
36062+ PSCHED_REMAINS_OLD :
36063+ PSCHED_ASSIGNED_NEW);
36064+ return 0;
36065+ error:
36066+ put_page_cluster(clust, inode, READ_OP);
36067+ return result;
36068+}
36069+
36070+/* Cut disk cluster of index @idx */
36071+static int cut_disk_cluster(struct inode * inode, cloff_t idx)
36072+{
36073+ reiser4_key from, to;
36074+ assert("edward-1515", inode_file_plugin(inode) ==
36075+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
36076+ key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from);
36077+ to = from;
36078+ set_key_offset(&to,
36079+ get_key_offset(&from) + inode_cluster_size(inode) - 1);
36080+ return reiser4_cut_tree(reiser4_tree_by_inode(inode),
36081+ &from, &to, inode, 0);
36082+}
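
/*
 * For illustration: with a 64K cluster size, cut_disk_cluster(inode, 0)
 * cuts the key range covering byte offsets 0 .. 65535, i.e. exactly the
 * disk cluster of index 0; for index idx the range starts at
 * idx * cluster_size.
 */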
36083+
36084+static int reserve_cryptcompress2unixfile(struct inode *inode)
36085+{
36086+ reiser4_block_nr unformatted_nodes;
36087+ reiser4_tree *tree;
36088+
36089+ tree = reiser4_tree_by_inode(inode);
36090+
36091+ /* number of unformatted nodes which will be created */
36092+ unformatted_nodes = cluster_nrpages(inode); /* N */
36093+
36094+ /*
36095+ * space required for one iteration of ctail->extent conversion:
36096+ *
36097+ * 1. kill ctail items
36098+ *
36099+ * 2. insert N unformatted nodes
36100+ *
36101+ * 3. insert N (worst-case single-block
36102+ * extents) extent units.
36103+ *
36104+ * 4. drilling to the leaf level by coord_by_key()
36105+ *
36106+ * 5. possible update of stat-data
36107+ *
36108+ */
36109+ grab_space_enable();
36110+ return reiser4_grab_space
36111+ (2 * tree->height +
36112+ unformatted_nodes +
36113+ unformatted_nodes * estimate_one_insert_into_item(tree) +
36114+ 1 + estimate_one_insert_item(tree) +
36115+ inode_file_plugin(inode)->estimate.update(inode),
36116+ BA_CAN_COMMIT);
36117+}
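
/*
 * For illustration (hypothetical numbers): with 4K pages, a 64K cluster
 * and a tree of height 4, N = 16, and the grab above amounts to 2*4
 * blocks for drilling to the leaf level, plus 16 unformatted nodes,
 * plus 16 worst-case extent-unit insertions, plus 1 block and one item
 * insertion, plus the stat-data update estimate.
 */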
36118+
36119+/**
36120+ * Convert cryptcompress file plugin to unix_file plugin.
36121+ */
36122+static int cryptcompress2unixfile(struct file * file, struct inode * inode,
36123+ struct psched_context * cont)
36124+{
36125+ int i;
36126+ int result = 0;
36127+ struct cryptcompress_info *cr_info;
36128+ struct unix_file_info *uf_info;
36129+ assert("edward-1516", cont->pages[0]->index == 0);
36130+
36131+ /* release all cryptcompress-specific resources */
36132+ cr_info = cryptcompress_inode_data(inode);
36133+ result = reserve_cryptcompress2unixfile(inode);
36134+ if (result)
36135+ goto out;
36136+ /* tell kill_hook to not truncate pages */
36137+ reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
36138+ result = cut_disk_cluster(inode, 0);
36139+ if (result)
36140+ goto out;
36141+ /* captured jnode of cluster and associated resources (pages,
36142+ reserved disk space) were released by ->kill_hook() method
36143+ of the item plugin */
36144+
36145+ result = __cryptcompress2unixfile(file, inode);
36146+ if (result)
36147+ goto out;
36148+ /* At this point file is managed by unix file plugin */
36149+
36150+ uf_info = unix_file_inode_data(inode);
36151+
36152+ assert("edward-1518",
36153+ ergo(jprivate(cont->pages[0]),
36154+ !jnode_is_cluster_page(jprivate(cont->pages[0]))));
36155+ for(i = 0; i < cont->nr_pages; i++) {
36156+ assert("edward-1519", cont->pages[i]);
36157+ assert("edward-1520", PageUptodate(cont->pages[i]));
36158+
36159+ result = find_or_create_extent(cont->pages[i]);
36160+ if (result)
36161+ break;
36162+ }
36163+ if (unlikely(result))
36164+ goto out;
36165+ uf_info->container = UF_CONTAINER_EXTENTS;
36166+ result = reiser4_update_sd(inode);
36167+ out:
36168+ all_grabbed2free();
36169+ return result;
36170+}
36171+
36172+#define convert_file_plugin cryptcompress2unixfile
36173+
36174+/**
36175+ * This is called by ->write() method of a cryptcompress file plugin.
36176+ * Make a decision about the most reasonable file plugin id to manage
36177+ * the file.
36178+ */
36179+int write_pschedule_hook(struct file * file, struct inode * inode,
36180+ loff_t pos, struct cluster_handle * clust,
36181+ struct psched_context * cont)
36182+{
36183+ int result;
36184+ if (!conversion_enabled(inode))
36185+ return 0;
36186+ result = check_psched_point(inode, pos, clust, cont);
36187+ if (result || cont->state != PSCHED_SCHED_POINT)
36188+ return result;
36189+ result = read_check_compressibility(inode, clust, cont);
36190+ if (result)
36191+ return result;
36192+ if (cont->state == PSCHED_REMAINS_OLD) {
36193+ put_page_cluster(clust, inode, READ_OP);
36194+ return disable_conversion(inode);
36195+ }
36196+ assert("edward-1543", cont->state == PSCHED_ASSIGNED_NEW);
36197+ /*
36198+ * page cluster is grabbed and uptodate. It will be
36199+ * released with a pgset after plugin conversion is
36200+ * finished, see put_psched_context().
36201+ */
36202+ reiser4_unset_hint(clust->hint);
36203+ move_cluster_pgset(clust, &cont->pages, &cont->nr_pages);
36204+ return 0;
36205+}
36206+
36207+/**
36208+ * This is called by ->setattr() method of cryptcompress file plugin.
36209+ */
36210+int setattr_pschedule_hook(struct inode * inode)
36211+{
36212+ if (conversion_enabled(inode))
36213+ return disable_conversion(inode);
36214+ return 0;
36215+}
36216+
36217+static inline void init_psched_context(struct psched_context * cont)
36218+{
36219+ memset(cont, 0, sizeof(*cont));
36220+}
36221+
36222+static inline void done_psched_context(struct psched_context * cont,
36223+ struct inode * inode)
36224+{
36225+ if (cont->pages) {
36226+ __put_page_cluster(0, cont->nr_pages, cont->pages, inode);
36227+ kfree(cont->pages);
36228+ }
36229+}
36230+/**
36231+ * Here are wrappers with "protection", aka Reiser4 "careful" methods.
36232+ * They are used by vfs (as methods of file_ops, inode_ops or as_ops),
36233+ * which is not aware of plugin conversion performed by Reiser4.
36234+ */
36235+
36236+/*
36237+ * Wrappers with active protection for:
36238+ *
36239+ * ->write();
36240+ */
36241+
36242+/*
36243+ * ->write() file operation supplied to VFS.
36244+ * Write a file in 3 steps (some of them can be optional).
36245+ */
36246+ssize_t reiser4_write_careful(struct file *file, const char __user *buf,
36247+ size_t count, loff_t *off)
36248+{
36249+ int result;
36250+ reiser4_context *ctx;
36251+ ssize_t written_old = 0; /* bytes written with initial plugin */
36252+ ssize_t written_new = 0; /* bytes written with new plugin */
36253+ struct psched_context cont;
36254+ struct inode * inode = file->f_dentry->d_inode;
36255+
36256+ ctx = reiser4_init_context(inode->i_sb);
36257+ if (IS_ERR(ctx))
36258+ return PTR_ERR(ctx);
36259+ init_psched_context(&cont);
36260+ mutex_lock(&inode->i_mutex);
36261+ /**
36262+ * First step.
36263+ * Start write with initial file plugin.
36264+ * Keep a plugin schedule status at @cont (if any).
36265+ */
36266+ written_old = inode_file_plugin(inode)->write(file,
36267+ buf,
36268+ count,
36269+ off,
36270+ &cont);
36271+ if (cont.state != PSCHED_ASSIGNED_NEW || written_old < 0)
36272+ goto exit;
36273+ /**
36274+ * Second step.
36275+ * New file plugin has been scheduled.
36276+ * Perform conversion to the new plugin.
36277+ */
36278+ down_read(&reiser4_inode_data(inode)->conv_sem);
36279+ result = convert_file_plugin(file, inode, &cont);
36280+ up_read(&reiser4_inode_data(inode)->conv_sem);
36281+ if (result) {
36282+ warning("edward-1544",
36283+ "Inode %llu: file plugin conversion failed (%d)",
36284+ (unsigned long long)get_inode_oid(inode),
36285+ result);
36286+ context_set_commit_async(ctx);
36287+ goto exit;
36288+ }
36289+ reiser4_txn_restart(ctx);
36290+ /**
36291+ * Third step:
36292+ * Finish write with the new file plugin.
36293+ */
36294+ assert("edward-1536",
36295+ inode_file_plugin(inode) ==
36296+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
36297+
36298+ written_new = inode_file_plugin(inode)->write(file,
36299+ buf + written_old,
36300+ count - written_old,
36301+ off,
36302+ NULL);
36303+ exit:
36304+ mutex_unlock(&inode->i_mutex);
36305+ done_psched_context(&cont, inode);
36306+ reiser4_exit_context(ctx);
36307+
36308+ return written_old + (written_new < 0 ? 0 : written_new);
36309+}
36310+
36311+/* Wrappers with passive protection for:
36312+ *
36313+ * ->open();
36314+ * ->read();
36315+ * ->ioctl();
36316+ * ->mmap();
36317+ * ->release();
36318+ * ->bmap().
36319+ */
36320+
36321+int reiser4_open_careful(struct inode *inode, struct file *file)
36322+{
36323+ return PROT_PASSIVE(int, open, (inode, file));
36324+}
36325+
36326+ssize_t reiser4_read_careful(struct file * file, char __user * buf,
36327+ size_t size, loff_t * off)
36328+{
36329+ struct inode * inode = file->f_dentry->d_inode;
36330+ return PROT_PASSIVE(ssize_t, read, (file, buf, size, off));
36331+}
36332+
36333+int reiser4_ioctl_careful(struct inode *inode, struct file *filp,
36334+ unsigned int cmd, unsigned long arg)
36335+{
36336+ return PROT_PASSIVE(int, ioctl, (inode, filp, cmd, arg));
36337+}
36338+
36339+int reiser4_mmap_careful(struct file *file, struct vm_area_struct *vma)
36340+{
36341+ struct inode *inode = file->f_dentry->d_inode;
36342+ return PROT_PASSIVE(int, mmap, (file, vma));
36343+}
36344+
36345+int reiser4_release_careful(struct inode *inode, struct file *file)
36346+{
36347+ return PROT_PASSIVE(int, release, (inode, file));
36348+}
36349+
36350+sector_t reiser4_bmap_careful(struct address_space * mapping, sector_t lblock)
36351+{
36352+ struct inode *inode = mapping->host;
36353+ return PROT_PASSIVE(sector_t, bmap, (mapping, lblock));
36354+}
36355+
36356+/*
36357+ * Wrappers without protection for:
36358+ *
36359+ * ->setattr()
36360+ */
36361+int reiser4_setattr(struct dentry *dentry, struct iattr *attr)
36362+{
36363+ return inode_file_plugin(dentry->d_inode)->setattr(dentry, attr);
36364+}
36365+
36366+/*
36367+ Local variables:
36368+ c-indentation-style: "K&R"
36369+ mode-name: "LC"
36370+ c-basic-offset: 8
36371+ tab-width: 8
36372+ fill-column: 80
36373+ scroll-step: 1
36374+ End:
36375+*/
36376diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/file.h linux-2.6.24/fs/reiser4/plugin/file/file.h
36377--- linux-2.6.24.orig/fs/reiser4/plugin/file/file.h 1970-01-01 03:00:00.000000000 +0300
36378+++ linux-2.6.24/fs/reiser4/plugin/file/file.h 2008-01-25 11:40:16.694168755 +0300
36379@@ -0,0 +1,331 @@
36380+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
36381+ * reiser4/README */
36382+
36383+/* this file contains declarations of methods implementing
36384+ file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID
36385+ and SYMLINK_FILE_PLUGIN_ID) */
36386+
36387+#if !defined( __REISER4_FILE_H__ )
36388+#define __REISER4_FILE_H__
36389+
36390+/* possible states when scheduling a new file plugin */
36391+typedef enum {
36392+ PSCHED_INVAL_STATE, /* invalid state */
36393+ PSCHED_SCHED_POINT, /* scheduling point has been achieved */
36394+ PSCHED_REMAINS_OLD, /* made a decision to be managed by old plugin */
36395+ PSCHED_ASSIGNED_NEW /* new plugin has been scheduled */
36396+} psched_state;
36397+
36398+struct psched_context {
36399+ int nr_pages;
36400+ struct page **pages;
36401+ psched_state state;
36402+};
36403+
36404+/**
36405+ * Declarations of common/careful/generic methods.
36407+ * Suppose ->foo() is a vfs method (of f_ops, i_ops, or a_ops);
36407+ * Then common reiser4 method for foo looks like reiser4_foo_common;
36408+ * careful method looks like reiser4_foo_careful;
36409+ * generic method looks like reiser4_foo.
36410+ *
36411+ * A common method is a simple instruction set eligible for more
36412+ * than one plugin id.
36413+ *
36414+ * Generic method looks at the plugin installed in inode's
36415+ * plugin set and calls its appropriate method.
36416+ *
36417+ * Careful method looks like generic method with protected pset
36418+ * (see plugin/file/file_conversion.c for details).
36419+ */
36420+
36421+/* inode operations */
36422+int reiser4_setattr(struct dentry *, struct iattr *);
36423+
36424+/* file operations */
36425+ssize_t reiser4_read_careful(struct file *, char __user *buf,
36426+ size_t count, loff_t *off);
36427+ssize_t reiser4_write_careful(struct file *, const char __user *buf,
36428+ size_t count, loff_t * off);
36429+int reiser4_ioctl_careful(struct inode *inode, struct file *filp,
36430+ unsigned int cmd, unsigned long arg);
36431+int reiser4_mmap_careful(struct file *, struct vm_area_struct *);
36432+int reiser4_open_careful(struct inode *inode, struct file *file);
36433+int reiser4_release_careful(struct inode *, struct file *);
36434+int reiser4_sync_file_common(struct file *, struct dentry *, int datasync);
36435+
36436+/* address space operations */
36437+int reiser4_readpage(struct file *, struct page *);
36438+int reiser4_readpages(struct file*, struct address_space*, struct list_head*,
36439+ unsigned);
36440+int reiser4_writepages(struct address_space *, struct writeback_control *);
36441+int reiser4_prepare_write(struct file *, struct page *, unsigned from,
36442+ unsigned to);
36443+int reiser4_commit_write(struct file *, struct page *, unsigned from,
36444+ unsigned to);
36445+sector_t reiser4_bmap_careful(struct address_space *, sector_t lblock);
36446+
36447+/*
36448+ * Private methods of unix-file plugin
36449+ * (UNIX_FILE_PLUGIN_ID)
36450+ */
36451+
36452+/* private inode operations */
36453+int setattr_unix_file(struct dentry *, struct iattr *);
36454+
36455+/* private file operations */
36456+
36457+ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
36458+ loff_t *off);
36459+ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
36460+ loff_t * off, struct psched_context * cont);
36461+int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
36462+ unsigned long arg);
36463+int mmap_unix_file(struct file *, struct vm_area_struct *);
36464+int open_unix_file(struct inode *, struct file *);
36465+int release_unix_file(struct inode *, struct file *);
36466+
36467+/* private address space operations */
36468+int readpage_unix_file(struct file *, struct page *);
36469+int readpages_unix_file(struct file*, struct address_space*, struct list_head*, unsigned);
36470+int writepages_unix_file(struct address_space *, struct writeback_control *);
36471+int prepare_write_unix_file(struct file *, struct page *, unsigned from,
36472+ unsigned to);
36473+int commit_write_unix_file(struct file *, struct page *, unsigned from,
36474+ unsigned to);
36475+sector_t bmap_unix_file(struct address_space *, sector_t lblock);
36476+
36477+/* other private methods */
36478+int delete_object_unix_file(struct inode *);
36479+int flow_by_inode_unix_file(struct inode *, const char __user *buf,
36480+ int user, loff_t, loff_t, rw_op, flow_t *);
36481+int owns_item_unix_file(const struct inode *, const coord_t *);
36482+void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
36483+ int create);
36484+
36485+/*
36486+ * Private methods of cryptcompress file plugin
36487+ * (CRYPTCOMPRESS_FILE_PLUGIN_ID)
36488+ */
36489+
36490+/* private inode operations */
36491+int setattr_cryptcompress(struct dentry *, struct iattr *);
36492+
36493+/* private file operations */
36494+ssize_t read_cryptcompress(struct file *, char __user *buf,
36495+ size_t count, loff_t *off);
36496+ssize_t write_cryptcompress(struct file *, const char __user *buf,
36497+ size_t count, loff_t * off,
36498+ struct psched_context *cont);
36499+int ioctl_cryptcompress(struct inode *, struct file *, unsigned int cmd,
36500+ unsigned long arg);
36501+int mmap_cryptcompress(struct file *, struct vm_area_struct *);
36502+int open_cryptcompress(struct inode *, struct file *);
36503+int release_cryptcompress(struct inode *, struct file *);
36504+
36505+/* private address space operations */
36506+int readpage_cryptcompress(struct file *, struct page *);
36507+int readpages_cryptcompress(struct file*, struct address_space*,
36508+ struct list_head*, unsigned);
36509+int writepages_cryptcompress(struct address_space *,
36510+ struct writeback_control *);
36511+int prepare_write_cryptcompress(struct file *, struct page *, unsigned from,
36512+ unsigned to);
36513+int commit_write_cryptcompress(struct file *, struct page *, unsigned from,
36514+ unsigned to);
36515+sector_t bmap_cryptcompress(struct address_space *, sector_t lblock);
36516+
36517+/* other private methods */
36518+int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
36519+ int user, loff_t, loff_t, rw_op, flow_t *);
36520+int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
36521+int create_object_cryptcompress(struct inode *, struct inode *,
36522+ reiser4_object_create_data *);
36523+int delete_object_cryptcompress(struct inode *);
36524+void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
36525+ int create);
36526+int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
36527+ const reiser4_key * to_key,
36528+ reiser4_key * smallest_removed,
36529+ struct inode *object, int truncate,
36530+ int *progress);
36531+void destroy_inode_cryptcompress(struct inode *);
36532+
36533+/*
36534+ * Private methods of symlink file plugin
36535+ * (SYMLINK_FILE_PLUGIN_ID)
36536+ */
36537+int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
36538+ reiser4_object_create_data *);
36539+void destroy_inode_symlink(struct inode *);
36540+
36541+/*
36542+ * all writes to a unix file are performed by the item write method. The write
36543+ * method of the unix file plugin only decides which item plugin (extent or
36544+ * tail) to call, and in which mode (one from the enum below)
36545+ */
36546+typedef enum {
36547+ FIRST_ITEM = 1,
36548+ APPEND_ITEM = 2,
36549+ OVERWRITE_ITEM = 3
36550+} write_mode_t;
36551+
36552+/* a unix file may be in one of the following states */
36553+typedef enum {
36554+ UF_CONTAINER_UNKNOWN = 0,
36555+ UF_CONTAINER_TAILS = 1,
36556+ UF_CONTAINER_EXTENTS = 2,
36557+ UF_CONTAINER_EMPTY = 3
36558+} file_container_t;
36559+
36560+struct formatting_plugin;
36561+struct inode;
36562+
36563+/* unix file plugin specific part of reiser4 inode */
36564+struct unix_file_info {
36565+ /*
36566+ * this read-write lock protects file containerization change. Accesses
36567+ * which do not change file containerization (see file_container_t)
36568+ * (read, readpage, writepage, write (until tail conversion is
36569+ * involved)) take read-lock. Accesses which modify file
36570+ * containerization (truncate, conversion from tail to extent and back)
36571+ * take write-lock.
36572+ */
36573+ struct rw_semaphore latch;
36574+ /* this enum specifies which items are used to build the file */
36575+ file_container_t container;
36576+ /*
36577+ * plugin which controls when file is to be converted to extents and
36578+ * back to tail
36579+ */
36580+ struct formatting_plugin *tplug;
36581+ /* if this is set, file is in exclusive use */
36582+ int exclusive_use;
36583+#if REISER4_DEBUG
36584+ /* pointer to task struct of thread owning exclusive access to file */
36585+ void *ea_owner;
36586+ atomic_t nr_neas;
36587+ void *last_reader;
36588+#endif
36589+};
36590+
36591+struct unix_file_info *unix_file_inode_data(const struct inode *inode);
36592+void get_exclusive_access(struct unix_file_info *);
36593+void drop_exclusive_access(struct unix_file_info *);
36594+void get_nonexclusive_access(struct unix_file_info *);
36595+void drop_nonexclusive_access(struct unix_file_info *);
36596+int try_to_get_nonexclusive_access(struct unix_file_info *);
36597+int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
36598+ struct inode *);
36599+int find_file_item_nohint(coord_t *, lock_handle *,
36600+ const reiser4_key *, znode_lock_mode,
36601+ struct inode *);
36602+
36603+int load_file_hint(struct file *, hint_t *);
36604+void save_file_hint(struct file *, const hint_t *);
36605+
36606+#include "../item/extent.h"
36607+#include "../item/tail.h"
36608+#include "../item/ctail.h"
36609+
36610+struct uf_coord {
36611+ coord_t coord;
36612+ lock_handle *lh;
36613+ int valid;
36614+ union {
36615+ struct extent_coord_extension extent;
36616+ struct tail_coord_extension tail;
36617+ struct ctail_coord_extension ctail;
36618+ } extension;
36619+};
36620+
36621+#include "../../forward.h"
36622+#include "../../seal.h"
36623+#include "../../lock.h"
36624+
36625+/*
36626+ * This structure is used to speed up file operations (reads and writes). A
36627+ * hint is a suggestion about where a key resolved to last time. A seal
36628+ * indicates whether a node has been modified since a hint was last recorded.
36629+ * You check the seal, and if the seal is still valid, you can use the hint
36630+ * without traversing the tree again.
36631+ */
36632+struct hint {
36633+ seal_t seal; /* a seal over last file item accessed */
36634+ uf_coord_t ext_coord;
36635+ loff_t offset;
36636+ znode_lock_mode mode;
36637+ lock_handle lh;
36638+};
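
/*
 * For illustration, the typical use of the declarations below (error
 * handling omitted): a file method calls load_file_hint() to fetch the
 * hint saved by the previous operation; if the seal validates, the
 * cached ext_coord is reused without a new tree traversal, otherwise
 * the tree is searched again and reiser4_set_hint() records the fresh
 * position; save_file_hint() then stores the hint for the next call.
 */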
36639+
36640+static inline int hint_is_valid(hint_t * hint)
36641+{
36642+ return hint->ext_coord.valid;
36643+}
36644+
36645+static inline void hint_set_valid(hint_t * hint)
36646+{
36647+ hint->ext_coord.valid = 1;
36648+}
36649+
36650+static inline void hint_clr_valid(hint_t * hint)
36651+{
36652+ hint->ext_coord.valid = 0;
36653+}
36654+
36655+int load_file_hint(struct file *, hint_t *);
36656+void save_file_hint(struct file *, const hint_t *);
36657+void hint_init_zero(hint_t *);
36658+void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
36659+int hint_is_set(const hint_t *);
36660+void reiser4_unset_hint(hint_t *);
36661+
36662+int reiser4_update_file_size(struct inode *, loff_t, int update_sd);
36663+int cut_file_items(struct inode *, loff_t new_size,
36664+ int update_sd, loff_t cur_size,
36665+ int (*update_actor) (struct inode *, loff_t, int));
36666+#if REISER4_DEBUG
36667+
36668+/* return 1 if exclusive access is obtained, 0 otherwise */
36669+static inline int ea_obtained(struct unix_file_info * uf_info)
36670+{
36671+ int ret;
36672+
36673+ ret = down_read_trylock(&uf_info->latch);
36674+ if (ret)
36675+ up_read(&uf_info->latch);
36676+ return !ret;
36677+}
36678+
36679+#endif
36680+
36681+#define WRITE_GRANULARITY 32
36682+
36683+int tail2extent(struct unix_file_info *);
36684+int extent2tail(struct file *, struct unix_file_info *);
36685+
36686+int goto_right_neighbor(coord_t *, lock_handle *);
36687+int find_or_create_extent(struct page *);
36688+int equal_to_ldk(znode *, const reiser4_key *);
36689+
36690+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
36691+
36692+static inline int cbk_errored(int cbk_result)
36693+{
36694+ return (cbk_result != CBK_COORD_NOTFOUND
36695+ && cbk_result != CBK_COORD_FOUND);
36696+}
36697+
36698+/* __REISER4_FILE_H__ */
36699+#endif
36700+
36701+/*
36702+ * Local variables:
36703+ * c-indentation-style: "K&R"
36704+ * mode-name: "LC"
36705+ * c-basic-offset: 8
36706+ * tab-width: 8
36707+ * fill-column: 79
36708+ * scroll-step: 1
36709+ * End:
36710+*/
36711diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/Makefile linux-2.6.24/fs/reiser4/plugin/file/Makefile
36712--- linux-2.6.24.orig/fs/reiser4/plugin/file/Makefile 1970-01-01 03:00:00.000000000 +0300
36713+++ linux-2.6.24/fs/reiser4/plugin/file/Makefile 2008-01-25 11:39:06.988221084 +0300
36714@@ -0,0 +1,7 @@
36715+obj-$(CONFIG_REISER4_FS) += file_plugins.o
36716+
36717+file_plugins-objs := \
36718+ file.o \
36719+ tail_conversion.o \
36720+ symlink.o \
36721+ cryptcompress.o
36722diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/symfile.c linux-2.6.24/fs/reiser4/plugin/file/symfile.c
36723--- linux-2.6.24.orig/fs/reiser4/plugin/file/symfile.c 1970-01-01 03:00:00.000000000 +0300
36724+++ linux-2.6.24/fs/reiser4/plugin/file/symfile.c 2008-01-25 11:39:06.992222114 +0300
36725@@ -0,0 +1,87 @@
36726+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
36727+
36728+/* Symfiles are a generalization of Unix symlinks.
36729+
36730+ A symfile when read behaves as though you took its contents and
36731+ substituted them into the reiser4 naming system as the right hand side
36732+ of an assignment, and then read that which you had assigned to it.
36733+
36734+ A key issue for symfiles is how to implement writes through to
36735+ subfiles. In general, one must have some method of determining what
36736+ of that which is written to the symfile is written to what subfile.
36737+ This can be done by use of custom plugin methods written by users, or
36738+ by using a few general methods we provide for those willing to endure
36739+ the insertion of delimiters into what is read.
36740+
36741+ Writing to symfiles without delimiters to denote what is written to
36742+ what subfile is not supported by any plugins we provide in this
36743+ release. Our most sophisticated support for writes is that embodied
36744+ by the invert plugin (see invert.c).
36745+
36746+ A read only version of the /etc/passwd file might be
36747+ constructed as a symfile whose contents are as follows:
36748+
36749+ /etc/passwd/userlines/*
36750+
36751+ or
36752+
36753+ /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
36754+
36755+ or
36756+
36757+ /etc/passwd/userlines/(demidov+edward+reiser+root)
36758+
36759+ A symfile with contents
36760+
36761+ /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
36762+
36763+ will return when read
36764+
36765+ The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
36766+
36767+ and write of what has been read will not be possible to implement as
36768+ an identity operation because there are no delimiters denoting the
36769+ boundaries of what is to be written to what subfile.
36770+
36771+ Note that one could make this a read/write symfile if one specified
36772+   delimiters, and the write method understood that those delimiters mark
36773+   what is written to which subfile.
36774+
36775+ So, specifying the symfile in a manner that allows writes:
36776+
36777+ /etc/passwd/userlines/demidov+"(
36778+ )+/etc/passwd/userlines/edward+"(
36779+ )+/etc/passwd/userlines/reiser+"(
36780+ )+/etc/passwd/userlines/root+"(
36781+ )
36782+
36783+ or
36784+
36785+ /etc/passwd/userlines/(demidov+"(
36786+ )+edward+"(
36787+ )+reiser+"(
36788+ )+root+"(
36789+ ))
36790+
36791+ and the file demidov might be specified as:
36792+
36793+ /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
36794+
36795+ or
36796+
36797+ /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
36798+
36799+ Notice that if the file demidov has a carriage return in it, the
36800+ parsing fails, but then if you put carriage returns in the wrong place
36801+ in a normal /etc/passwd file it breaks things also.
36802+
36803+ Note that it is forbidden to have no text between two interpolations
36804+ if one wants to be able to define what parts of a write go to what
36805+ subfiles referenced in an interpolation.
36806+
36807+ If one wants to be able to add new lines by writing to the file, one
36808+ must either write a custom plugin for /etc/passwd that knows how to
36809+ name an added line, or one must use an invert, or one must use a more
36810+ sophisticated symfile syntax that we are not planning to write for
36811+ version 4.0.
36812+*/
36813diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/symlink.c linux-2.6.24/fs/reiser4/plugin/file/symlink.c
36814--- linux-2.6.24.orig/fs/reiser4/plugin/file/symlink.c 1970-01-01 03:00:00.000000000 +0300
36815+++ linux-2.6.24/fs/reiser4/plugin/file/symlink.c 2008-01-25 11:39:06.992222114 +0300
36816@@ -0,0 +1,95 @@
36817+/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
36818+
36819+#include "../../inode.h"
36820+
36821+#include <linux/types.h>
36822+#include <linux/fs.h>
36823+
36824+/* file plugin methods specific for symlink files
36825+ (SYMLINK_FILE_PLUGIN_ID) */
36826+
36827+/* this is implementation of create_object method of file plugin for
36828+ SYMLINK_FILE_PLUGIN_ID
36829+ */
36830+
36831+/**
36832+ * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
36833+ * @symlink: inode of symlink object
36834+ * @dir: inode of parent directory
36835+ * @data: parameters of new object
36836+ *
36837+ * Inserts stat data with a symlink extension into the tree.
36838+ */
36839+int reiser4_create_symlink(struct inode *symlink,
36840+ struct inode *dir UNUSED_ARG,
36841+			   reiser4_object_create_data *data	/* info passed to us;
36842+								 * filled in, in
36843+								 * particular, by the
36844+								 * reiser4() syscall */)
36845+{
36846+ int result;
36847+
36848+ assert("nikita-680", symlink != NULL);
36849+ assert("nikita-681", S_ISLNK(symlink->i_mode));
36850+ assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
36851+ assert("nikita-682", dir != NULL);
36852+ assert("nikita-684", data != NULL);
36853+ assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
36854+
36855+ /*
36856+ * stat data of symlink has symlink extension in which we store
36857+ * symlink content, that is, path symlink is pointing to.
36858+ */
36859+ reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
36860+
36861+ assert("vs-838", symlink->i_private == NULL);
36862+ symlink->i_private = (void *)data->name;
36863+
36864+ assert("vs-843", symlink->i_size == 0);
36865+ INODE_SET_FIELD(symlink, i_size, strlen(data->name));
36866+
36867+ /* insert stat data appended with data->name */
36868+ result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
36869+ if (result) {
36870+ /* FIXME-VS: Make sure that symlink->i_private is not attached
36871+ to kmalloced data */
36872+ INODE_SET_FIELD(symlink, i_size, 0);
36873+ } else {
36874+ assert("vs-849", symlink->i_private
36875+ && reiser4_inode_get_flag(symlink,
36876+ REISER4_GENERIC_PTR_USED));
36877+ assert("vs-850",
36878+ !memcmp((char *)symlink->i_private, data->name,
36879+ (size_t) symlink->i_size + 1));
36880+ }
36881+ return result;
36882+}
36883+
36884+/* this is implementation of destroy_inode method of file plugin for
36885+ SYMLINK_FILE_PLUGIN_ID
36886+ */
36887+void destroy_inode_symlink(struct inode *inode)
36888+{
36889+ assert("edward-799",
36890+ inode_file_plugin(inode) ==
36891+ file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
36892+ assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
36893+ assert("edward-801", reiser4_inode_get_flag(inode,
36894+ REISER4_GENERIC_PTR_USED));
36895+ assert("vs-839", S_ISLNK(inode->i_mode));
36896+
36897+ kfree(inode->i_private);
36898+ inode->i_private = NULL;
36899+ reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
36900+}
36901+
36902+/*
36903+ Local variables:
36904+ c-indentation-style: "K&R"
36905+ mode-name: "LC"
36906+ c-basic-offset: 8
36907+ tab-width: 8
36908+ fill-column: 80
36909+ scroll-step: 1
36910+ End:
36911+*/
36912diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.24/fs/reiser4/plugin/file/tail_conversion.c
36913--- linux-2.6.24.orig/fs/reiser4/plugin/file/tail_conversion.c 1970-01-01 03:00:00.000000000 +0300
36914+++ linux-2.6.24/fs/reiser4/plugin/file/tail_conversion.c 2008-01-25 11:40:16.694168755 +0300
36915@@ -0,0 +1,726 @@
36916+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
36917+
36918+#include "../../inode.h"
36919+#include "../../super.h"
36920+#include "../../page_cache.h"
36921+#include "../../carry.h"
36922+#include "../../safe_link.h"
36923+#include "../../vfs_ops.h"
36924+
36925+#include <linux/writeback.h>
36926+
36927+/* this file contains:
36928+ tail2extent and extent2tail */
36929+
36930+/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
36931+void get_exclusive_access(struct unix_file_info * uf_info)
36932+{
36933+ assert("nikita-3028", reiser4_schedulable());
36934+ assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
36935+ assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
36936+ /*
36937+ * "deadlock avoidance": sometimes we commit a transaction under
36938+ * rw-semaphore on a file. Such commit can deadlock with another
36939+ * thread that captured some block (hence preventing atom from being
36940+ * committed) and waits on rw-semaphore.
36941+ */
36942+ reiser4_txn_restart_current();
36943+ LOCK_CNT_INC(inode_sem_w);
36944+ down_write(&uf_info->latch);
36945+ uf_info->exclusive_use = 1;
36946+ assert("vs-1713", uf_info->ea_owner == NULL);
36947+ assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
36948+ ON_DEBUG(uf_info->ea_owner = current);
36949+}
36950+
36951+void drop_exclusive_access(struct unix_file_info * uf_info)
36952+{
36953+ assert("vs-1714", uf_info->ea_owner == current);
36954+ assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
36955+ ON_DEBUG(uf_info->ea_owner = NULL);
36956+ uf_info->exclusive_use = 0;
36957+ up_write(&uf_info->latch);
36958+ assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
36959+ assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
36960+ LOCK_CNT_DEC(inode_sem_w);
36961+ reiser4_txn_restart_current();
36962+}
36963+
36964+/**
36965+ * nea_grabbed - bookkeeping when the file semaphore is taken for read
36966+ * @uf_info: unix file specific part of inode
36967+ *
36968+ * This is called when nonexclusive access is obtained on a file. Everything
36969+ * it does is for debugging purposes.
36970+ */
36971+static void nea_grabbed(struct unix_file_info *uf_info)
36972+{
36973+#if REISER4_DEBUG
36974+ LOCK_CNT_INC(inode_sem_r);
36975+ assert("vs-1716", uf_info->ea_owner == NULL);
36976+ atomic_inc(&uf_info->nr_neas);
36977+ uf_info->last_reader = current;
36978+#endif
36979+}
36980+
36981+/**
36982+ * get_nonexclusive_access - get nonexclusive access to a file
36983+ * @uf_info: unix file specific part of inode to obtain access to
36984+ *
36985+ * Nonexclusive access is obtained on a file before read, write, readpage.
36986+ */
36987+void get_nonexclusive_access(struct unix_file_info *uf_info)
36988+{
36989+ assert("nikita-3029", reiser4_schedulable());
36990+ assert("nikita-3361", get_current_context()->trans->atom == NULL);
36991+
36992+ down_read(&uf_info->latch);
36993+ nea_grabbed(uf_info);
36994+}
36995+
36996+/**
36997+ * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
36998+ * @uf_info: unix file specific part of inode to obtain access to
36999+ *
37000+ * Non-blocking version of get_nonexclusive_access().
37001+ */
37002+int try_to_get_nonexclusive_access(struct unix_file_info *uf_info)
37003+{
37004+ int result;
37005+
37006+ result = down_read_trylock(&uf_info->latch);
37007+ if (result)
37008+ nea_grabbed(uf_info);
37009+ return result;
37010+}
37011+
37012+void drop_nonexclusive_access(struct unix_file_info * uf_info)
37013+{
37014+ assert("vs-1718", uf_info->ea_owner == NULL);
37015+ assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
37016+ ON_DEBUG(atomic_dec(&uf_info->nr_neas));
37017+
37018+ up_read(&uf_info->latch);
37019+
37020+ LOCK_CNT_DEC(inode_sem_r);
37021+ reiser4_txn_restart_current();
37022+}
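A minimal usage sketch of the access protocol above (editor's illustration; both caller functions below are hypothetical): state-changing operations take the write side of the latch, ordinary reads take the read side.

static int convert_under_ea(struct unix_file_info *uf_info)
{
	int result;

	/* writers of file state (tail2extent etc.) need EA */
	get_exclusive_access(uf_info);
	result = tail2extent(uf_info);
	drop_exclusive_access(uf_info);
	return result;
}

static void read_under_nea(struct unix_file_info *uf_info)
{
	/* read, write, readpage run under NEA */
	get_nonexclusive_access(uf_info);
	/* ... access file items ... */
	drop_nonexclusive_access(uf_info);
}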
37023+
37024+/* part of tail2extent. Cut all items covering @count bytes starting from
37025+ @offset */
37026+/* Audited by: green(2002.06.15) */
37027+static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
37028+{
37029+ reiser4_key from, to;
37030+
37031+ /* AUDIT: How about putting an assertion here, what would check
37032+ all provided range is covered by tail items only? */
37033+ /* key of first byte in the range to be cut */
37034+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
37035+
37036+ /* key of last byte in that range */
37037+ to = from;
37038+ set_key_offset(&to, (__u64) (offset + count - 1));
37039+
37040+ /* cut everything between those keys */
37041+ return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
37042+ inode, 0);
37043+}
37044+
37045+static void release_all_pages(struct page **pages, unsigned nr_pages)
37046+{
37047+ unsigned i;
37048+
37049+ for (i = 0; i < nr_pages; i++) {
37050+ if (pages[i] == NULL) {
37051+ unsigned j;
37052+ for (j = i + 1; j < nr_pages; j++)
37053+ assert("vs-1620", pages[j] == NULL);
37054+ break;
37055+ }
37056+ page_cache_release(pages[i]);
37057+ pages[i] = NULL;
37058+ }
37059+}
37060+
37061+/* part of tail2extent. Replace tail items with an extent item. The content
37062+   of the tail items being cut (@count bytes) has already been copied into
37063+   pages. find_or_create_extent() is called to create extents corresponding
37064+   to those pages */
37065+static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
37066+{
37067+ int result;
37068+ unsigned i;
37069+ STORE_COUNTERS;
37070+
37071+ if (nr_pages == 0)
37072+ return 0;
37073+
37074+ assert("vs-596", pages[0]);
37075+
37076+ /* cut copied items */
37077+ result = cut_formatting_items(inode, page_offset(pages[0]), count);
37078+ if (result)
37079+ return result;
37080+
37081+ CHECK_COUNTERS;
37082+
37083+ /* put into tree replacement for just removed items: extent item, namely */
37084+ for (i = 0; i < nr_pages; i++) {
37085+ result = add_to_page_cache_lru(pages[i], inode->i_mapping,
37086+ pages[i]->index,
37087+ mapping_gfp_mask(inode->
37088+ i_mapping));
37089+ if (result)
37090+ break;
37091+ unlock_page(pages[i]);
37092+ result = find_or_create_extent(pages[i]);
37093+ if (result)
37094+ break;
37095+ SetPageUptodate(pages[i]);
37096+ }
37097+ return result;
37098+}
37099+
37100+#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
37101+ * items */
37102+
37103+static int reserve_tail2extent_iteration(struct inode *inode)
37104+{
37105+ reiser4_block_nr unformatted_nodes;
37106+ reiser4_tree *tree;
37107+
37108+ tree = reiser4_tree_by_inode(inode);
37109+
37110+ /* number of unformatted nodes which will be created */
37111+ unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
37112+
37113+ /*
37114+	 * space required for one iteration of tail2extent conversion:
37115+ *
37116+ * 1. kill N tail items
37117+ *
37118+ * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
37119+ *
37120+ * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
37121+ * extents) extent units.
37122+ *
37123+ * 4. drilling to the leaf level by coord_by_key()
37124+ *
37125+ * 5. possible update of stat-data
37126+ *
37127+ */
37128+ grab_space_enable();
37129+ return reiser4_grab_space
37130+ (2 * tree->height +
37131+ TAIL2EXTENT_PAGE_NUM +
37132+ TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
37133+ 1 + estimate_one_insert_item(tree) +
37134+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
37135+}
37136+
37137+/* clear the stat-data flag indicating that the file is being converted */
37138+static int complete_conversion(struct inode *inode)
37139+{
37140+ int result;
37141+
37142+ grab_space_enable();
37143+ result =
37144+ reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
37145+ BA_CAN_COMMIT);
37146+ if (result == 0) {
37147+ reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
37148+ result = reiser4_update_sd(inode);
37149+ }
37150+ if (result)
37151+ warning("vs-1696", "Failed to clear converting bit of %llu: %i",
37152+ (unsigned long long)get_inode_oid(inode), result);
37153+ return 0;
37154+}
37155+
37156+/**
37157+ * find_start
37158+ * @inode: inode of the file being converted
37159+ * @id: item plugin id to look for (FORMATTING_ID or EXTENT_POINTER_ID)
37160+ * @offset: in/out: offset at which the previous conversion stopped
37161+ *
37162+ * This is used by tail2extent and extent2tail to detect where a previous
37163+ * incomplete conversion stopped.
37164+ */
37165+static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
37166+{
37167+ int result;
37168+ lock_handle lh;
37169+ coord_t coord;
37170+ struct unix_file_info *ufo;
37171+ int found;
37172+ reiser4_key key;
37173+
37174+ ufo = unix_file_inode_data(inode);
37175+ init_lh(&lh);
37176+ result = 0;
37177+ found = 0;
37178+ inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
37179+ do {
37180+ init_lh(&lh);
37181+ result = find_file_item_nohint(&coord, &lh, &key,
37182+ ZNODE_READ_LOCK, inode);
37183+
37184+ if (result == CBK_COORD_FOUND) {
37185+ if (coord.between == AT_UNIT) {
37186+ /*coord_clear_iplug(&coord); */
37187+ result = zload(coord.node);
37188+ if (result == 0) {
37189+ if (item_id_by_coord(&coord) == id)
37190+ found = 1;
37191+ else
37192+ item_plugin_by_coord(&coord)->s.
37193+ file.append_key(&coord,
37194+ &key);
37195+ zrelse(coord.node);
37196+ }
37197+ } else
37198+ result = RETERR(-ENOENT);
37199+ }
37200+ done_lh(&lh);
37201+ } while (result == 0 && !found);
37202+ *offset = get_key_offset(&key);
37203+ return result;
37204+}
37205+
37206+/**
37207+ * tail2extent
37208+ * @uf_info: unix file specific part of inode of the file to convert
37209+ *
37210+ * Converts a file built of tail items into one built of extent items.
37211+ */
37212+int tail2extent(struct unix_file_info *uf_info)
37213+{
37214+ int result;
37215+ reiser4_key key; /* key of next byte to be moved to page */
37216+ char *p_data; /* data of page */
37217+ unsigned page_off = 0, /* offset within the page where to copy data */
37218+ count; /* number of bytes of item which can be
37219+ * copied to page */
37220+ struct page *pages[TAIL2EXTENT_PAGE_NUM];
37221+ struct page *page;
37222+ int done; /* set to 1 when all file is read */
37223+ char *item;
37224+ int i;
37225+ struct inode *inode;
37226+ int first_iteration;
37227+ int bytes;
37228+ __u64 offset;
37229+
37230+ assert("nikita-3362", ea_obtained(uf_info));
37231+ inode = unix_file_info_to_inode(uf_info);
37232+ assert("nikita-3412", !IS_RDONLY(inode));
37233+ assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
37234+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
37235+
37236+ offset = 0;
37237+ first_iteration = 1;
37238+ result = 0;
37239+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37240+ /*
37241+ * file is marked on disk as there was a conversion which did
37242+ * not complete due to either crash or some error. Find which
37243+ * offset tail conversion stopped at
37244+ */
37245+ result = find_start(inode, FORMATTING_ID, &offset);
37246+ if (result == -ENOENT) {
37247+ /* no tail items found, everything is converted */
37248+ uf_info->container = UF_CONTAINER_EXTENTS;
37249+ complete_conversion(inode);
37250+ return 0;
37251+ } else if (result != 0)
37252+ /* some other error */
37253+ return result;
37254+ first_iteration = 0;
37255+ }
37256+
37257+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
37258+
37259+ /* get key of first byte of a file */
37260+ inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
37261+
37262+ done = 0;
37263+ while (done == 0) {
37264+ memset(pages, 0, sizeof(pages));
37265+ result = reserve_tail2extent_iteration(inode);
37266+ if (result != 0)
37267+ goto out;
37268+ if (first_iteration) {
37269+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
37270+ reiser4_update_sd(inode);
37271+ first_iteration = 0;
37272+ }
37273+ bytes = 0;
37274+ for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
37275+ assert("vs-598",
37276+ (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
37277+ page = alloc_page(reiser4_ctx_gfp_mask_get());
37278+ if (!page) {
37279+ result = RETERR(-ENOMEM);
37280+ goto error;
37281+ }
37282+
37283+ page->index =
37284+ (unsigned long)(get_key_offset(&key) >>
37285+ PAGE_CACHE_SHIFT);
37286+ /*
37287+			 * usually, a thread that is going to longterm lock a
37288+			 * znode (as find_file_item does, for instance) must not
37289+			 * hold locked pages. However, tail2extent is an
37290+			 * exception: pages appearing here are not reachable by
37291+			 * anyone else, they are clean, and they have no jnodes
37292+			 * attached, so keeping them locked does not risk a
37293+			 * deadlock
37294+ */
37295+ assert("vs-983", !PagePrivate(page));
37296+ reiser4_invalidate_pages(inode->i_mapping, page->index,
37297+ 1, 0);
37298+
37299+ for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
37300+ coord_t coord;
37301+ lock_handle lh;
37302+
37303+ /* get next item */
37304+ /* FIXME: we might want to readahead here */
37305+ init_lh(&lh);
37306+ result =
37307+ find_file_item_nohint(&coord, &lh, &key,
37308+ ZNODE_READ_LOCK,
37309+ inode);
37310+ if (result != CBK_COORD_FOUND) {
37311+ /*
37312+					 * either an error happened or no
37313+					 * items of the file were found
37314+ */
37315+ done_lh(&lh);
37316+ page_cache_release(page);
37317+ goto error;
37318+ }
37319+
37320+ if (coord.between == AFTER_UNIT) {
37321+ /*
37322+					 * end of file is reached. Pad page
37323+ * with zeros
37324+ */
37325+ done_lh(&lh);
37326+ done = 1;
37327+ p_data = kmap_atomic(page, KM_USER0);
37328+ memset(p_data + page_off, 0,
37329+ PAGE_CACHE_SIZE - page_off);
37330+ kunmap_atomic(p_data, KM_USER0);
37331+ break;
37332+ }
37333+
37334+ result = zload(coord.node);
37335+ if (result) {
37336+ page_cache_release(page);
37337+ done_lh(&lh);
37338+ goto error;
37339+ }
37340+ assert("vs-856", coord.between == AT_UNIT);
37341+ item = ((char *)item_body_by_coord(&coord)) +
37342+ coord.unit_pos;
37343+
37344+ /* how many bytes to copy */
37345+ count =
37346+ item_length_by_coord(&coord) -
37347+ coord.unit_pos;
37348+ /* limit length of copy to end of page */
37349+ if (count > PAGE_CACHE_SIZE - page_off)
37350+ count = PAGE_CACHE_SIZE - page_off;
37351+
37352+ /*
37353+ * copy item (as much as will fit starting from
37354+ * the beginning of the item) into the page
37355+ */
37356+ p_data = kmap_atomic(page, KM_USER0);
37357+ memcpy(p_data + page_off, item, count);
37358+ kunmap_atomic(p_data, KM_USER0);
37359+
37360+ page_off += count;
37361+ bytes += count;
37362+ set_key_offset(&key,
37363+ get_key_offset(&key) + count);
37364+
37365+ zrelse(coord.node);
37366+ done_lh(&lh);
37367+ } /* end of loop which fills one page by content of
37368+ * formatting items */
37369+
37370+ if (page_off) {
37371+ /* something was copied into page */
37372+ pages[i] = page;
37373+ } else {
37374+ page_cache_release(page);
37375+ assert("vs-1648", done == 1);
37376+ break;
37377+ }
37378+ } /* end of loop through pages of one conversion iteration */
37379+
37380+ if (i > 0) {
37381+ result = replace(inode, pages, i, bytes);
37382+ release_all_pages(pages, sizeof_array(pages));
37383+ if (result)
37384+ goto error;
37385+ /*
37386+			 * We have to drop exclusive access to avoid a deadlock
37387+			 * which may happen because capture_unix_file, called by
37388+			 * reiser4_writepages, needs to get non-exclusive
37389+			 * access to the file. It is safe to drop EA in the
37390+			 * middle of tail2extent conversion because
37391+			 * write_unix_file, setattr_unix_file(truncate),
37392+			 * mmap_unix_file and release_unix_file(extent2tail)
37393+			 * check whether conversion is in progress (see comments
37394+			 * before get_exclusive_access_careful()).
37395+ * Other processes that acquire non-exclusive access
37396+ * (read_unix_file, reiser4_writepages, etc) should work
37397+ * on partially converted files.
37398+ */
37399+ drop_exclusive_access(uf_info);
37400+ /* throttle the conversion */
37401+ reiser4_throttle_write(inode);
37402+ get_exclusive_access(uf_info);
37403+
37404+ /*
37405+ * nobody is allowed to complete conversion but a
37406+ * process which started it
37407+ */
37408+ assert("", reiser4_inode_get_flag(inode,
37409+ REISER4_PART_MIXED));
37410+ }
37411+ }
37412+
37413+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37414+
37415+ if (result == 0) {
37416+ /* file is converted to extent items */
37417+ assert("vs-1697", reiser4_inode_get_flag(inode,
37418+ REISER4_PART_MIXED));
37419+
37420+ uf_info->container = UF_CONTAINER_EXTENTS;
37421+ complete_conversion(inode);
37422+ } else {
37423+ /*
37424+ * conversion is not complete. Inode was already marked as
37425+		 * REISER4_PART_MIXED and stat-data were updated at the first
37426+ * iteration of the loop above.
37427+ */
37428+ error:
37429+ release_all_pages(pages, sizeof_array(pages));
37430+ warning("nikita-2282", "Partial conversion of %llu: %i",
37431+ (unsigned long long)get_inode_oid(inode), result);
37432+ }
37433+
37434+ out:
37435+ return result;
37436+}
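A condensed restatement of one iteration of the conversion loop above (editor's sketch; error paths and locking details elided):

/*
 *	reserve_tail2extent_iteration(inode);
 *	for (i = 0; i < TAIL2EXTENT_PAGE_NUM && !done; i++)
 *		fill pages[i] from consecutive tail items
 *		(find_file_item_nohint() + memcpy under kmap_atomic());
 *	replace(inode, pages, i, bytes);	// cut tails, insert extents
 *	drop_exclusive_access(uf_info);		// let other threads progress
 *	reiser4_throttle_write(inode);
 *	get_exclusive_access(uf_info);
 */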
37437+
37438+static int reserve_extent2tail_iteration(struct inode *inode)
37439+{
37440+ reiser4_tree *tree;
37441+
37442+ tree = reiser4_tree_by_inode(inode);
37443+ /*
37444+ * reserve blocks for (in this order):
37445+ *
37446+ * 1. removal of extent item
37447+ *
37448+ * 2. insertion of tail by insert_flow()
37449+ *
37450+ * 3. drilling to the leaf level by coord_by_key()
37451+ *
37452+ * 4. possible update of stat-data
37453+ */
37454+ grab_space_enable();
37455+ return reiser4_grab_space
37456+ (estimate_one_item_removal(tree) +
37457+ estimate_insert_flow(tree->height) +
37458+ 1 + estimate_one_insert_item(tree) +
37459+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
37460+}
37461+
37462+/* for every page of the file: read the page, cut the part of the extent
37463+   pointing to this page, and put the page's data into the tree as a tail item */
37464+int extent2tail(struct file * file, struct unix_file_info *uf_info)
37465+{
37466+ int result;
37467+ struct inode *inode;
37468+ struct page *page;
37469+ unsigned long num_pages, i;
37470+ unsigned long start_page;
37471+ reiser4_key from;
37472+ reiser4_key to;
37473+ unsigned count;
37474+ __u64 offset;
37475+
37476+ assert("nikita-3362", ea_obtained(uf_info));
37477+ inode = unix_file_info_to_inode(uf_info);
37478+ assert("nikita-3412", !IS_RDONLY(inode));
37479+ assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
37480+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
37481+
37482+ offset = 0;
37483+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37484+ /*
37485+ * file is marked on disk as there was a conversion which did
37486+ * not complete due to either crash or some error. Find which
37487+ * offset tail conversion stopped at
37488+ */
37489+ result = find_start(inode, EXTENT_POINTER_ID, &offset);
37490+ if (result == -ENOENT) {
37491+ /* no extent found, everything is converted */
37492+ uf_info->container = UF_CONTAINER_TAILS;
37493+ complete_conversion(inode);
37494+ return 0;
37495+ } else if (result != 0)
37496+ /* some other error */
37497+ return result;
37498+ }
37499+
37500+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
37501+
37502+ /* number of pages in the file */
37503+ num_pages =
37504+	    (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
37505+ start_page = offset >> PAGE_CACHE_SHIFT;
37506+
37507+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
37508+ to = from;
37509+
37510+ result = 0;
37511+ for (i = 0; i < num_pages; i++) {
37512+ __u64 start_byte;
37513+
37514+ result = reserve_extent2tail_iteration(inode);
37515+ if (result != 0)
37516+ break;
37517+ if (i == 0 && offset == 0) {
37518+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
37519+ reiser4_update_sd(inode);
37520+ }
37521+
37522+ page = read_mapping_page(inode->i_mapping,
37523+ (unsigned)(i + start_page), NULL);
37524+ if (IS_ERR(page)) {
37525+ result = PTR_ERR(page);
37526+ break;
37527+ }
37528+
37529+ wait_on_page_locked(page);
37530+
37531+ if (!PageUptodate(page)) {
37532+ page_cache_release(page);
37533+ result = RETERR(-EIO);
37534+ break;
37535+ }
37536+
37537+ /* cut part of file we have read */
37538+		start_byte = ((__u64) (i + start_page)) << PAGE_CACHE_SHIFT;
37539+ set_key_offset(&from, start_byte);
37540+ set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
37541+ /*
37542+ * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
37543+ * commits during over-long truncates. But
37544+ * extent->tail conversion should be performed in one
37545+ * transaction.
37546+ */
37547+ result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
37548+ &to, inode, 0);
37549+
37550+ if (result) {
37551+ page_cache_release(page);
37552+ break;
37553+ }
37554+
37555+ /* put page data into tree via tail_write */
37556+ count = PAGE_CACHE_SIZE;
37557+ if ((i == (num_pages - 1)) &&
37558+ (inode->i_size & ~PAGE_CACHE_MASK))
37559+			/* last page can be incomplete */
37560+ count = (inode->i_size & ~PAGE_CACHE_MASK);
37561+ while (count) {
37562+ loff_t pos = start_byte;
37563+
37564+ assert("edward-1537",
37565+ file != NULL && file->f_dentry != NULL);
37566+ assert("edward-1538",
37567+ file->f_dentry->d_inode == inode);
37568+
37569+ result = reiser4_write_tail(file, inode,
37570+ (char __user *)kmap(page),
37571+ count, &pos);
37572+ reiser4_free_file_fsdata(file);
37573+ if (result <= 0) {
37574+ warning("", "reiser4_write_tail failed");
37575+ page_cache_release(page);
37576+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37577+ return result;
37578+ }
37579+ count -= result;
37580+ }
37581+
37582+ /* release page */
37583+ lock_page(page);
37584+ /* page is already detached from jnode and mapping. */
37585+ assert("vs-1086", page->mapping == NULL);
37586+ assert("nikita-2690",
37587+ (!PagePrivate(page) && jprivate(page) == 0));
37588+ /* waiting for writeback completion with page lock held is
37589+ * perfectly valid. */
37590+ wait_on_page_writeback(page);
37591+ reiser4_drop_page(page);
37592+ /* release reference taken by read_cache_page() above */
37593+ page_cache_release(page);
37594+
37595+ drop_exclusive_access(uf_info);
37596+ /* throttle the conversion */
37597+ reiser4_throttle_write(inode);
37598+ get_exclusive_access(uf_info);
37599+ /*
37600+ * nobody is allowed to complete conversion but a process which
37601+ * started it
37602+ */
37603+ assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
37604+ }
37605+
37606+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37607+
37608+ if (i == num_pages) {
37609+ /* file is converted to formatted items */
37610+ assert("vs-1698", reiser4_inode_get_flag(inode,
37611+ REISER4_PART_MIXED));
37612+ assert("vs-1260",
37613+ inode_has_no_jnodes(reiser4_inode_data(inode)));
37614+
37615+ uf_info->container = UF_CONTAINER_TAILS;
37616+ complete_conversion(inode);
37617+ return 0;
37618+ }
37619+ /*
37620+ * conversion is not complete. Inode was already marked as
37621+	 * REISER4_PART_MIXED and stat-data were updated at the first
37622+ * iteration of the loop above.
37623+ */
37624+ warning("nikita-2282",
37625+ "Partial conversion of %llu: %lu of %lu: %i",
37626+ (unsigned long long)get_inode_oid(inode), i,
37627+ num_pages, result);
37628+
37629+ return result;
37630+}
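The symmetric per-page step of extent2tail, condensed (editor's sketch of the loop above):

/*
 *	page = read_mapping_page(mapping, i + start_page, NULL);
 *	reiser4_cut_tree(tree, &from, &to, inode, 0);	// drop the extent
 *	reiser4_write_tail(file, inode, kmap(page), count, &pos);
 *	reiser4_drop_page(page);	// page data now lives in tail items
 *	drop_exclusive_access(uf_info);	// throttle point, as in tail2extent
 *	reiser4_throttle_write(inode);
 *	get_exclusive_access(uf_info);
 */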
37631+
37632+/*
37633+ * Local variables:
37634+ * c-indentation-style: "K&R"
37635+ * mode-name: "LC"
37636+ * c-basic-offset: 8
37637+ * tab-width: 8
37638+ * fill-column: 79
37639+ * scroll-step: 1
37640+ * End:
37641+ */
37642diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file_ops.c linux-2.6.24/fs/reiser4/plugin/file_ops.c
37643--- linux-2.6.24.orig/fs/reiser4/plugin/file_ops.c 1970-01-01 03:00:00.000000000 +0300
37644+++ linux-2.6.24/fs/reiser4/plugin/file_ops.c 2008-01-25 11:39:06.992222114 +0300
37645@@ -0,0 +1,205 @@
37646+/* Copyright 2005 by Hans Reiser, licensing governed by
37647+ reiser4/README */
37648+
37649+/* this file contains typical implementations for some of methods of
37650+ struct file_operations and of struct address_space_operations
37651+*/
37652+
37653+#include "../inode.h"
37654+#include "object.h"
37655+
37656+/* file operations */
37657+
37658+/* implementation of vfs's llseek method of struct file_operations for
37659+   typical directory can be found in file_ops_readdir.c
37660+*/
37661+loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin);
37662+
37663+/* implementation of vfs's readdir method of struct file_operations for
37664+   typical directory can be found in file_ops_readdir.c
37665+*/
37666+int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
37667+
37668+/**
37669+ * reiser4_release_dir_common - release method of struct file_operations
37670+ * @inode: inode of released file
37671+ * @file: file to release
37672+ *
37673+ * Implementation of release method of struct file_operations for typical
37674+ * directory. All it does is freeing of reiser4 specific file data.
37675+*/
37676+int reiser4_release_dir_common(struct inode *inode, struct file *file)
37677+{
37678+ reiser4_context *ctx;
37679+
37680+ ctx = reiser4_init_context(inode->i_sb);
37681+ if (IS_ERR(ctx))
37682+ return PTR_ERR(ctx);
37683+ reiser4_free_file_fsdata(file);
37684+ reiser4_exit_context(ctx);
37685+ return 0;
37686+}
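The init/exit bracket above is the shape shared by every reiser4 entry point in this file. A minimal template (the function name is an illustrative placeholder, not part of the patch):

static int reiser4_entry_template(struct super_block *sb)
{
	reiser4_context *ctx;

	ctx = reiser4_init_context(sb);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);
	/* ... work that may capture blocks and join an atom ... */
	reiser4_exit_context(ctx);	/* commits/releases as needed */
	return 0;
}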
37687+
37688+/* this is common implementation of vfs's fsync method of struct
37689+ file_operations
37690+*/
37691+int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync)
37692+{
37693+ reiser4_context *ctx;
37694+ int result;
37695+
37696+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
37697+ if (IS_ERR(ctx))
37698+ return PTR_ERR(ctx);
37699+ result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
37700+
37701+ context_set_commit_async(ctx);
37702+ reiser4_exit_context(ctx);
37703+ return result;
37704+}
37705+
37706+/*
37707+ * common sync method for regular files.
37708+ *
37709+ * We are trying to be smart here. Instead of committing all atoms (original
37710+ * solution), we scan dirty pages of this file and commit all atoms they are
37711+ * part of.
37712+ *
37713+ * Situation is complicated by anonymous pages: i.e., extent-less pages
37714+ * dirtied through mmap. Fortunately sys_fsync() first calls
37715+ * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
37716+ * all missing extents and capture anonymous pages.
37717+ */
37718+int reiser4_sync_file_common(struct file *file,
37719+ struct dentry *dentry, int datasync)
37720+{
37721+ reiser4_context *ctx;
37722+ txn_atom *atom;
37723+ reiser4_block_nr reserve;
37724+
37725+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
37726+ if (IS_ERR(ctx))
37727+ return PTR_ERR(ctx);
37728+
37729+ reserve = estimate_update_common(dentry->d_inode);
37730+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
37731+ reiser4_exit_context(ctx);
37732+ return RETERR(-ENOSPC);
37733+ }
37734+ write_sd_by_inode_common(dentry->d_inode);
37735+
37736+ atom = get_current_atom_locked();
37737+ spin_lock_txnh(ctx->trans);
37738+ force_commit_atom(ctx->trans);
37739+ reiser4_exit_context(ctx);
37740+ return 0;
37741+}
37742+
37743+/* this is common implementation of vfs's sendfile method of struct
37744+ file_operations
37745+
37746+ Reads @count bytes from @file and calls @actor for every page read. This is
37747+   needed for loopback device support.
37748+*/
37749+#if 0
37750+ssize_t
37751+sendfile_common(struct file *file, loff_t *ppos, size_t count,
37752+ read_actor_t actor, void *target)
37753+{
37754+ reiser4_context *ctx;
37755+ ssize_t result;
37756+
37757+ ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
37758+ if (IS_ERR(ctx))
37759+ return PTR_ERR(ctx);
37760+ result = generic_file_sendfile(file, ppos, count, actor, target);
37761+ reiser4_exit_context(ctx);
37762+ return result;
37763+}
37764+#endif /* 0 */
37765+
37766+/* address space operations */
37767+
37768+/* this is common implementation of vfs's prepare_write method of struct
37769+ address_space_operations
37770+*/
37771+int
37772+prepare_write_common(struct file *file, struct page *page, unsigned from,
37773+ unsigned to)
37774+{
37775+ reiser4_context *ctx;
37776+ int result;
37777+
37778+ ctx = reiser4_init_context(page->mapping->host->i_sb);
37779+ result = do_prepare_write(file, page, from, to);
37780+
37781+ /* don't commit transaction under inode semaphore */
37782+ context_set_commit_async(ctx);
37783+ reiser4_exit_context(ctx);
37784+
37785+ return result;
37786+}
37787+
37788+/* this is helper for prepare_write_common and prepare_write_unix_file
37789+ */
37790+int
37791+do_prepare_write(struct file *file, struct page *page, unsigned from,
37792+ unsigned to)
37793+{
37794+ int result;
37795+ file_plugin *fplug;
37796+ struct inode *inode;
37797+
37798+ assert("umka-3099", file != NULL);
37799+ assert("umka-3100", page != NULL);
37800+ assert("umka-3095", PageLocked(page));
37801+
37802+ if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
37803+ return 0;
37804+
37805+ inode = page->mapping->host;
37806+ fplug = inode_file_plugin(inode);
37807+
37808+ if (page->mapping->a_ops->readpage == NULL)
37809+ return RETERR(-EINVAL);
37810+
37811+ result = page->mapping->a_ops->readpage(file, page);
37812+ if (result != 0) {
37813+ SetPageError(page);
37814+ ClearPageUptodate(page);
37815+ /* All reiser4 readpage() implementations should return the
37816+ * page locked in case of error. */
37817+ assert("nikita-3472", PageLocked(page));
37818+ } else {
37819+ /*
37820+ * ->readpage() either:
37821+ *
37822+ * 1. starts IO against @page. @page is locked for IO in
37823+ * this case.
37824+ *
37825+ * 2. doesn't start IO. @page is unlocked.
37826+ *
37827+		 * In either case, we retake the page lock below.
37828+ */
37829+ lock_page(page);
37830+ /*
37831+ * IO (if any) is completed at this point. Check for IO
37832+ * errors.
37833+ */
37834+ if (!PageUptodate(page))
37835+ result = RETERR(-EIO);
37836+ }
37837+ assert("umka-3098", PageLocked(page));
37838+ return result;
37839+}
37840+
37841+/*
37842+ * Local variables:
37843+ * c-indentation-style: "K&R"
37844+ * mode-name: "LC"
37845+ * c-basic-offset: 8
37846+ * tab-width: 8
37847+ * fill-column: 79
37848+ * scroll-step: 1
37849+ * End:
37850+ */
37851diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file_ops_readdir.c linux-2.6.24/fs/reiser4/plugin/file_ops_readdir.c
37852--- linux-2.6.24.orig/fs/reiser4/plugin/file_ops_readdir.c 1970-01-01 03:00:00.000000000 +0300
37853+++ linux-2.6.24/fs/reiser4/plugin/file_ops_readdir.c 2008-01-25 11:39:06.996223145 +0300
37854@@ -0,0 +1,658 @@
37855+/* Copyright 2005 by Hans Reiser, licensing governed by
37856+ * reiser4/README */
37857+
37858+#include "../inode.h"
37859+
37860+/* return true iff @coord points to a valid directory item that is part of
37861+ * @inode directory. */
37862+static int is_valid_dir_coord(struct inode *inode, coord_t * coord)
37863+{
37864+ return plugin_of_group(item_plugin_by_coord(coord),
37865+ DIR_ENTRY_ITEM_TYPE) &&
37866+ inode_file_plugin(inode)->owns_item(inode, coord);
37867+}
37868+
37869+/* compare two logical positions within the same directory */
37870+static cmp_t dir_pos_cmp(const struct dir_pos * p1, const struct dir_pos * p2)
37871+{
37872+ cmp_t result;
37873+
37874+ assert("nikita-2534", p1 != NULL);
37875+ assert("nikita-2535", p2 != NULL);
37876+
37877+ result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
37878+ if (result == EQUAL_TO) {
37879+ int diff;
37880+
37881+ diff = p1->pos - p2->pos;
37882+ result =
37883+ (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
37884+ }
37885+ return result;
37886+}
37887+
37888+/* see comment before reiser4_readdir_common() for overview of why "adjustment" is
37889+ * necessary. */
37890+static void
37891+adjust_dir_pos(struct file *dir, struct readdir_pos * readdir_spot,
37892+ const struct dir_pos * mod_point, int adj)
37893+{
37894+ struct dir_pos *pos;
37895+
37896+ /*
37897+ * new directory entry was added (adj == +1) or removed (adj == -1) at
37898+ * the @mod_point. Directory file descriptor @dir is doing readdir and
37899+ * is currently positioned at @readdir_spot. Latter has to be updated
37900+ * to maintain stable readdir.
37901+ */
37902+ /* directory is positioned to the beginning. */
37903+ if (readdir_spot->entry_no == 0)
37904+ return;
37905+
37906+ pos = &readdir_spot->position;
37907+ switch (dir_pos_cmp(mod_point, pos)) {
37908+ case LESS_THAN:
37909+ /* @mod_pos is _before_ @readdir_spot, that is, entry was
37910+ * added/removed on the left (in key order) of current
37911+ * position. */
37912+ /* logical number of directory entry readdir is "looking" at
37913+ * changes */
37914+ readdir_spot->entry_no += adj;
37915+ assert("nikita-2577",
37916+ ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
37917+ if (de_id_cmp(&pos->dir_entry_key,
37918+ &mod_point->dir_entry_key) == EQUAL_TO) {
37919+ assert("nikita-2575", mod_point->pos < pos->pos);
37920+ /*
37921+ * if entry added/removed has the same key as current
37922+ * for readdir, update counter of duplicate keys in
37923+ * @readdir_spot.
37924+ */
37925+ pos->pos += adj;
37926+ }
37927+ break;
37928+ case GREATER_THAN:
37929+ /* directory is modified after @pos: nothing to do. */
37930+ break;
37931+ case EQUAL_TO:
37932+ /* cannot insert an entry readdir is looking at, because it
37933+ already exists. */
37934+ assert("nikita-2576", adj < 0);
37935+ /* directory entry to which @pos points to is being
37936+ removed.
37937+
37938+ NOTE-NIKITA: Right thing to do is to update @pos to point
37939+ to the next entry. This is complex (we are under spin-lock
37940+ for one thing). Just rewind it to the beginning. Next
37941+ readdir will have to scan the beginning of
37942+ directory. Proper solution is to use semaphore in
37943+ spin lock's stead and use rewind_right() here.
37944+
37945+ NOTE-NIKITA: now, semaphore is used, so...
37946+ */
37947+ memset(readdir_spot, 0, sizeof *readdir_spot);
37948+ }
37949+}
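A worked example of the LESS_THAN branch (editor's illustration):

/*
 * Example: a reader sits at entry_no == 2 of directory (a, b, c). If
 * "a" -- whose position compares LESS_THAN the reader's -- is removed
 * (adj == -1), entry_no drops to 1 and the next readdir still returns
 * "c". If the removed entry also shares the reader's key, pos->pos is
 * adjusted the same way.
 */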
37950+
37951+/* scan all file-descriptors for this directory and adjust their
37952+ positions respectively. Should be used by implementations of
37953+ add_entry and rem_entry of dir plugin */
37954+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
37955+ int offset, int adj)
37956+{
37957+ reiser4_file_fsdata *scan;
37958+ struct dir_pos mod_point;
37959+
37960+ assert("nikita-2536", dir != NULL);
37961+ assert("nikita-2538", de != NULL);
37962+ assert("nikita-2539", adj != 0);
37963+
37964+ build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
37965+ mod_point.pos = offset;
37966+
37967+ spin_lock_inode(dir);
37968+
37969+ /*
37970+ * new entry was added/removed in directory @dir. Scan all file
37971+	 * descriptors for @dir that are currently involved in readdir and
37972+ * update them.
37973+ */
37974+
37975+ list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
37976+ adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
37977+
37978+ spin_unlock_inode(dir);
37979+}
37980+
37981+/*
37982+ * traverse tree to start/continue readdir from the readdir position @pos.
37983+ */
37984+static int dir_go_to(struct file *dir, struct readdir_pos * pos, tap_t * tap)
37985+{
37986+ reiser4_key key;
37987+ int result;
37988+ struct inode *inode;
37989+
37990+ assert("nikita-2554", pos != NULL);
37991+
37992+ inode = dir->f_dentry->d_inode;
37993+ result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
37994+ if (result != 0)
37995+ return result;
37996+ result = reiser4_object_lookup(inode,
37997+ &key,
37998+ tap->coord,
37999+ tap->lh,
38000+ tap->mode,
38001+ FIND_EXACT,
38002+ LEAF_LEVEL, LEAF_LEVEL,
38003+ 0, &tap->ra_info);
38004+ if (result == CBK_COORD_FOUND)
38005+ result = rewind_right(tap, (int)pos->position.pos);
38006+ else {
38007+ tap->coord->node = NULL;
38008+ done_lh(tap->lh);
38009+ result = RETERR(-EIO);
38010+ }
38011+ return result;
38012+}
38013+
38014+/*
38015+ * handling of non-unique keys: calculate at what ordinal position within
38016+ * sequence of directory items with identical keys @pos is.
38017+ */
38018+static int set_pos(struct inode *inode, struct readdir_pos * pos, tap_t * tap)
38019+{
38020+ int result;
38021+ coord_t coord;
38022+ lock_handle lh;
38023+ tap_t scan;
38024+ de_id *did;
38025+ reiser4_key de_key;
38026+
38027+ coord_init_zero(&coord);
38028+ init_lh(&lh);
38029+ reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
38030+ reiser4_tap_copy(&scan, tap);
38031+ reiser4_tap_load(&scan);
38032+ pos->position.pos = 0;
38033+
38034+ did = &pos->position.dir_entry_key;
38035+
38036+ if (is_valid_dir_coord(inode, scan.coord)) {
38037+
38038+ build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
38039+
38040+ while (1) {
38041+
38042+ result = go_prev_unit(&scan);
38043+ if (result != 0)
38044+ break;
38045+
38046+ if (!is_valid_dir_coord(inode, scan.coord)) {
38047+ result = -EINVAL;
38048+ break;
38049+ }
38050+
38051+ /* get key of directory entry */
38052+ unit_key_by_coord(scan.coord, &de_key);
38053+ if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
38054+ /* duplicate-sequence is over */
38055+ break;
38056+ }
38057+ pos->position.pos++;
38058+ }
38059+ } else
38060+ result = RETERR(-ENOENT);
38061+ reiser4_tap_relse(&scan);
38062+ reiser4_tap_done(&scan);
38063+ return result;
38064+}
38065+
38066+/*
38067+ * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
38068+ */
38069+static int dir_rewind(struct file *dir, struct readdir_pos * pos, tap_t * tap)
38070+{
38071+ __u64 destination;
38072+ __s64 shift;
38073+ int result;
38074+ struct inode *inode;
38075+ loff_t dirpos;
38076+
38077+ assert("nikita-2553", dir != NULL);
38078+ assert("nikita-2548", pos != NULL);
38079+ assert("nikita-2551", tap->coord != NULL);
38080+ assert("nikita-2552", tap->lh != NULL);
38081+
38082+ dirpos = reiser4_get_dir_fpos(dir);
38083+ shift = dirpos - pos->fpos;
38084+ /* this is logical directory entry within @dir which we are rewinding
38085+ * to */
38086+ destination = pos->entry_no + shift;
38087+
38088+ inode = dir->f_dentry->d_inode;
38089+ if (dirpos < 0)
38090+ return RETERR(-EINVAL);
38091+ else if (destination == 0ll || dirpos == 0) {
38092+ /* rewind to the beginning of directory */
38093+ memset(pos, 0, sizeof *pos);
38094+ return dir_go_to(dir, pos, tap);
38095+ } else if (destination >= inode->i_size)
38096+ return RETERR(-ENOENT);
38097+
38098+ if (shift < 0) {
38099+ /* I am afraid of negative numbers */
38100+ shift = -shift;
38101+ /* rewinding to the left */
38102+ if (shift <= (int)pos->position.pos) {
38103+ /* destination is within sequence of entries with
38104+ duplicate keys. */
38105+ result = dir_go_to(dir, pos, tap);
38106+ } else {
38107+ shift -= pos->position.pos;
38108+ while (1) {
38109+ /* repetitions: deadlock is possible when
38110+ going to the left. */
38111+ result = dir_go_to(dir, pos, tap);
38112+ if (result == 0) {
38113+ result = rewind_left(tap, shift);
38114+ if (result == -E_DEADLOCK) {
38115+ reiser4_tap_done(tap);
38116+ continue;
38117+ }
38118+ }
38119+ break;
38120+ }
38121+ }
38122+ } else {
38123+ /* rewinding to the right */
38124+ result = dir_go_to(dir, pos, tap);
38125+ if (result == 0)
38126+ result = rewind_right(tap, shift);
38127+ }
38128+ if (result == 0) {
38129+ result = set_pos(inode, pos, tap);
38130+ if (result == 0) {
38131+ /* update pos->position.pos */
38132+ pos->entry_no = destination;
38133+ pos->fpos = dirpos;
38134+ }
38135+ }
38136+ return result;
38137+}
38138+
38139+/*
38140+ * Function that is called by common_readdir() on each directory entry while
38141+ * doing readdir. The ->filldir callback may block, so we have to release the
38142+ * long term lock while calling it. To avoid repeating the tree traversal, a
38143+ * seal is used. If the seal is broken, we return -E_REPEAT (node unlocked).
38144+ *
38145+ * Whether node is unlocked in case of any other error is undefined. It is
38146+ * guaranteed to be still locked if success (0) is returned.
38147+ *
38148+ * When ->filldir() wants no more, feed_entry() returns 1, and node is
38149+ * unlocked.
38150+ */
38151+static int
38152+feed_entry(struct file *f, struct readdir_pos * pos, tap_t * tap,
38153+ filldir_t filldir, void *dirent)
38154+{
38155+ item_plugin *iplug;
38156+ char *name;
38157+ reiser4_key sd_key;
38158+ int result;
38159+ char buf[DE_NAME_BUF_LEN];
38160+ char name_buf[32];
38161+ char *local_name;
38162+ unsigned file_type;
38163+ seal_t seal;
38164+ coord_t *coord;
38165+ reiser4_key entry_key;
38166+
38167+ coord = tap->coord;
38168+ iplug = item_plugin_by_coord(coord);
38169+
38170+ /* pointer to name within the node */
38171+ name = iplug->s.dir.extract_name(coord, buf);
38172+ assert("nikita-1371", name != NULL);
38173+
38174+ /* key of object the entry points to */
38175+ if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
38176+ return RETERR(-EIO);
38177+
38178+ /* we must release longterm znode lock before calling filldir to avoid
38179+	   a deadlock which may happen if filldir causes a page fault. So, copy
38180+	   the name to an intermediate buffer */
38181+ if (strlen(name) + 1 > sizeof(name_buf)) {
38182+ local_name = kmalloc(strlen(name) + 1,
38183+ reiser4_ctx_gfp_mask_get());
38184+ if (local_name == NULL)
38185+ return RETERR(-ENOMEM);
38186+ } else
38187+ local_name = name_buf;
38188+
38189+ strcpy(local_name, name);
38190+ file_type = iplug->s.dir.extract_file_type(coord);
38191+
38192+ unit_key_by_coord(coord, &entry_key);
38193+ reiser4_seal_init(&seal, coord, &entry_key);
38194+
38195+ longterm_unlock_znode(tap->lh);
38196+
38197+ /*
38198+ * send information about directory entry to the ->filldir() filler
38199+ * supplied to us by caller (VFS).
38200+ *
38201+ * ->filldir is entitled to do weird things. For example, ->filldir
38202+ * supplied by knfsd re-enters file system. Make sure no locks are
38203+ * held.
38204+ */
38205+ assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
38206+
38207+ reiser4_txn_restart_current();
38208+ result = filldir(dirent, name, (int)strlen(name),
38209+ /* offset of this entry */
38210+ f->f_pos,
38211+			 /* inode number of object bound by this entry */
38212+ oid_to_uino(get_key_objectid(&sd_key)), file_type);
38213+ if (local_name != name_buf)
38214+ kfree(local_name);
38215+ if (result < 0)
38216+ /* ->filldir() is satisfied. (no space in buffer, IOW) */
38217+ result = 1;
38218+ else
38219+ result = reiser4_seal_validate(&seal, coord, &entry_key,
38220+ tap->lh, tap->mode,
38221+ ZNODE_LOCK_HIPRI);
38222+ return result;
38223+}
38224+
38225+static void move_entry(struct readdir_pos * pos, coord_t * coord)
38226+{
38227+ reiser4_key de_key;
38228+ de_id *did;
38229+
38230+ /* update @pos */
38231+ ++pos->entry_no;
38232+ did = &pos->position.dir_entry_key;
38233+
38234+ /* get key of directory entry */
38235+ unit_key_by_coord(coord, &de_key);
38236+
38237+ if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
38238+ /* we are within sequence of directory entries
38239+ with duplicate keys. */
38240+ ++pos->position.pos;
38241+ else {
38242+ pos->position.pos = 0;
38243+ build_de_id_by_key(&de_key, did);
38244+ }
38245+ ++pos->fpos;
38246+}
38247+
38248+/*
38249+ * STATELESS READDIR
38250+ *
38251+ * readdir support in reiser4 relies on ability to update readdir_pos embedded
38252+ * into reiser4_file_fsdata on each directory modification (name insertion and
38253+ * removal), see reiser4_readdir_common() function below. This obviously doesn't
38254+ * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
38255+ * across client READDIR requests for the same directory.
38256+ *
38257+ * To address this we maintain a "pool" of detached reiser4_file_fsdata
38258+ * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
38259+ * find detached reiser4_file_fsdata corresponding to previous readdir
38260+ * request. In other words, additional state is maintained on the
38261+ * server. (This is somewhat contrary to the design goals of NFS protocol.)
38262+ *
38263+ * To efficiently detect when our ->readdir() method is called by NFS server,
38264+ * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
38265+ * file_is_stateless() function).
38266+ *
38267+ * To find out d_cursor in the pool, we encode client id (cid) in the highest
38268+ * bits of NFS readdir cookie: when first readdir request comes to the given
38269+ * directory from the given client, cookie is set to 0. This situation is
38270+ * detected, global cid_counter is incremented, and stored in highest bits of
38271+ * all direntry offsets returned to the client, including last one. As the
38272+ * only valid readdir cookie is one obtained as direntry->offset, we are
38273+ * guaranteed that next readdir request (continuing current one) will have
38274+ * current cid in the highest bits of starting readdir cookie. All d_cursors
38275+ * are hashed into per-super-block hash table by (oid, cid) key.
38276+ *
38277+ * In addition d_cursors are placed into per-super-block radix tree where they
38278+ * are keyed by oid alone. This is necessary to efficiently remove them during
38279+ * rmdir.
38280+ *
38281+ * Finally, currently unused d_cursors are linked into a special list. This
38282+ * list is used by d_cursor_shrink to reclaim d_cursors under memory pressure.
38283+ *
38284+ */
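A minimal sketch of the cookie encoding described above (editor's illustration; the 16/48 bit split is an assumption, the patch does not fix concrete widths in this comment):

#define CID_BITS	16
#define ENTRY_BITS	(64 - CID_BITS)

static inline __u64 make_readdir_cookie(__u64 cid, __u64 entry_no)
{
	/* client id in the highest bits, directory position below */
	return (cid << ENTRY_BITS) | (entry_no & ((1ull << ENTRY_BITS) - 1));
}

static inline __u64 cookie_to_cid(__u64 cookie)
{
	return cookie >> ENTRY_BITS;
}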
38285+
38286+/*
38287+ * prepare for readdir.
38288+ */
38289+static int dir_readdir_init(struct file *f, tap_t * tap,
38290+ struct readdir_pos ** pos)
38291+{
38292+ struct inode *inode;
38293+ reiser4_file_fsdata *fsdata;
38294+ int result;
38295+
38296+ assert("nikita-1359", f != NULL);
38297+ inode = f->f_dentry->d_inode;
38298+ assert("nikita-1360", inode != NULL);
38299+
38300+ if (!S_ISDIR(inode->i_mode))
38301+ return RETERR(-ENOTDIR);
38302+
38303+ /* try to find detached readdir state */
38304+ result = reiser4_attach_fsdata(f, inode);
38305+ if (result != 0)
38306+ return result;
38307+
38308+ fsdata = reiser4_get_file_fsdata(f);
38309+ assert("nikita-2571", fsdata != NULL);
38310+ if (IS_ERR(fsdata))
38311+ return PTR_ERR(fsdata);
38312+
38313+ /* add file descriptor to the readdir list hanging of directory
38314+ * inode. This list is used to scan "readdirs-in-progress" while
38315+ * inserting or removing names in the directory. */
38316+ spin_lock_inode(inode);
38317+ if (list_empty_careful(&fsdata->dir.linkage))
38318+ list_add(&fsdata->dir.linkage, get_readdir_list(inode));
38319+ *pos = &fsdata->dir.readdir;
38320+ spin_unlock_inode(inode);
38321+
38322+ /* move @tap to the current position */
38323+ return dir_rewind(f, *pos, tap);
38324+}
38325+
38326+/* this is implementation of vfs's llseek method of struct file_operations for
38327+ typical directory
38328+ See comment before reiser4_readdir_common() for explanation.
38329+*/
38330+loff_t reiser4_llseek_dir_common(struct file * file, loff_t off, int origin)
38331+{
38332+ reiser4_context *ctx;
38333+ loff_t result;
38334+ struct inode *inode;
38335+
38336+ inode = file->f_dentry->d_inode;
38337+
38338+ ctx = reiser4_init_context(inode->i_sb);
38339+ if (IS_ERR(ctx))
38340+ return PTR_ERR(ctx);
38341+
38342+ mutex_lock(&inode->i_mutex);
38343+
38344+ /* update ->f_pos */
38345+ result = default_llseek(file, off, origin);
38346+ if (result >= 0) {
38347+ int ff;
38348+ coord_t coord;
38349+ lock_handle lh;
38350+ tap_t tap;
38351+ struct readdir_pos *pos;
38352+
38353+ coord_init_zero(&coord);
38354+ init_lh(&lh);
38355+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
38356+
38357+ ff = dir_readdir_init(file, &tap, &pos);
38358+ reiser4_detach_fsdata(file);
38359+ if (ff != 0)
38360+ result = (loff_t) ff;
38361+ reiser4_tap_done(&tap);
38362+ }
38363+ reiser4_detach_fsdata(file);
38364+ mutex_unlock(&inode->i_mutex);
38365+
38366+ reiser4_exit_context(ctx);
38367+ return result;
38368+}
38369+
38370+/* this is common implementation of vfs's readdir method of struct
38371+ file_operations
38372+
38373+ readdir problems:
38374+
38375+ readdir(2)/getdents(2) interface is based on implicit assumption that
38376+ readdir can be restarted from any particular point by supplying file system
38377+ with off_t-full of data. That is, file system fills ->d_off field in struct
38378+ dirent and later user passes ->d_off to the seekdir(3), which is, actually,
38379+ implemented by glibc as lseek(2) on directory.
38380+
38381+   Reiser4 cannot restart readdir from 64 bits of data, because the two last
38382+   components of a directory entry's key are unknown, and the key is 128 bits:
38383+   the locality and type fields of the key are always known, but to start
38384+   readdir() from a given point the objectid and offset fields have to be
38385+   filled in.
38386+
38387+   The traditional UNIX API for scanning through a directory
38388+   (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based on
38389+   the assumption that a directory is structured very much like a regular
38390+   file; in particular, it is implied that each name within a given directory
38391+   (directory entry) can be uniquely identified by a scalar offset and that
38392+   such an offset is stable across the life-time of the name it identifies.
38393+
38394+   This is manifestly not so for reiser4. In reiser4 the only stable unique
38395+   identifier for a directory entry is its key, which doesn't fit into the
38396+   seekdir/telldir API.
38397+
38398+ solution:
38399+
38400+   For each file descriptor participating in readdir-ing of a directory, a
38401+   plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
38402+   the "current" directory entry that the file descriptor looks at. It
38403+   contains the key of the directory entry (plus some additional info to deal
38404+   with non-unique keys that we won't dwell on here) and the logical position
38405+   of this directory entry counted from the beginning of the directory, that
38406+   is, the ordinal number of this entry in the readdir order.
38407+
38408+   Obviously this logical position is not stable in the face of directory
38409+   modifications. To work around this, on each addition or removal of a
38410+   directory entry all file descriptors for the directory inode are scanned
38411+   and their readdir_pos is updated accordingly (adjust_dir_pos()).
38412+*/
38413+int reiser4_readdir_common(struct file *f /* directory file being read */,
38414+ void *dirent /* opaque data passed to us by VFS */,
38415+ filldir_t filld /* filler function passed to us
38416+ * by VFS */)
38417+{
38418+ reiser4_context *ctx;
38419+ int result;
38420+ struct inode *inode;
38421+ coord_t coord;
38422+ lock_handle lh;
38423+ tap_t tap;
38424+ struct readdir_pos *pos;
38425+
38426+ assert("nikita-1359", f != NULL);
38427+ inode = f->f_dentry->d_inode;
38428+ assert("nikita-1360", inode != NULL);
38429+
38430+ if (!S_ISDIR(inode->i_mode))
38431+ return RETERR(-ENOTDIR);
38432+
38433+ ctx = reiser4_init_context(inode->i_sb);
38434+ if (IS_ERR(ctx))
38435+ return PTR_ERR(ctx);
38436+
38437+ coord_init_zero(&coord);
38438+ init_lh(&lh);
38439+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
38440+
38441+ reiser4_readdir_readahead_init(inode, &tap);
38442+
38443+ repeat:
38444+ result = dir_readdir_init(f, &tap, &pos);
38445+ if (result == 0) {
38446+ result = reiser4_tap_load(&tap);
38447+ /* scan entries one by one feeding them to @filld */
38448+ while (result == 0) {
38449+ coord_t *coord;
38450+
38451+ coord = tap.coord;
38452+ assert("nikita-2572", coord_is_existing_unit(coord));
38453+ assert("nikita-3227", is_valid_dir_coord(inode, coord));
38454+
38455+ result = feed_entry(f, pos, &tap, filld, dirent);
38456+ if (result > 0) {
38457+ break;
38458+ } else if (result == 0) {
38459+ ++f->f_pos;
38460+ result = go_next_unit(&tap);
38461+ if (result == -E_NO_NEIGHBOR ||
38462+ result == -ENOENT) {
38463+ result = 0;
38464+ break;
38465+ } else if (result == 0) {
38466+ if (is_valid_dir_coord(inode, coord))
38467+ move_entry(pos, coord);
38468+ else
38469+ break;
38470+ }
38471+ } else if (result == -E_REPEAT) {
38472+ /* feed_entry() had to restart. */
38473+ ++f->f_pos;
38474+ reiser4_tap_relse(&tap);
38475+ goto repeat;
38476+ } else
38477+ warning("vs-1617",
38478+ "reiser4_readdir_common: unexpected error %d",
38479+ result);
38480+ }
38481+ reiser4_tap_relse(&tap);
38482+
38483+ if (result >= 0)
38484+ f->f_version = inode->i_version;
38485+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
38486+ result = 0;
38487+ reiser4_tap_done(&tap);
38488+ reiser4_detach_fsdata(f);
38489+
38490+ /* try to update directory's atime */
38491+ if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
38492+ BA_CAN_COMMIT) != 0)
38493+ warning("", "failed to update atime on readdir: %llu",
38494+ get_inode_oid(inode));
38495+ else
38496+ file_accessed(f);
38497+
38498+ context_set_commit_async(ctx);
38499+ reiser4_exit_context(ctx);
38500+
38501+ return (result <= 0) ? result : 0;
38502+}
38503+
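The position-adjustment scheme described in the big comment above reduces to shifting the recorded ordinal positions of all open directory streams whenever a name is inserted or removed. A minimal user-space sketch of the adjust_dir_pos() idea; the struct and function names below are illustrative, not the reiser4 identifiers:

#include <stdio.h>

struct stream_pos {
	unsigned long entry_no;	/* ordinal number in readdir order */
};

/* shift every stream positioned at or after 'where' by 'delta', so it
 * keeps pointing at the same name after an insertion or removal */
static void adjust_positions(struct stream_pos *streams, int nr,
			     unsigned long where, int delta)
{
	int i;

	for (i = 0; i < nr; i++)
		if (streams[i].entry_no >= where)
			streams[i].entry_no += delta;
}

int main(void)
{
	struct stream_pos streams[2] = { { 3 }, { 7 } };

	/* a name was inserted at ordinal position 5: the stream at 3 is
	 * unaffected, the stream at 7 moves to 8 */
	adjust_positions(streams, 2, 5, +1);
	printf("%lu %lu\n", streams[0].entry_no, streams[1].entry_no);
	return 0;
}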
38504+/*
38505+ * Local variables:
38506+ * c-indentation-style: "K&R"
38507+ * mode-name: "LC"
38508+ * c-basic-offset: 8
38509+ * tab-width: 8
38510+ * fill-column: 79
38511+ * End:
38512+ */
38513diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file_plugin_common.c linux-2.6.24/fs/reiser4/plugin/file_plugin_common.c
38514--- linux-2.6.24.orig/fs/reiser4/plugin/file_plugin_common.c 1970-01-01 03:00:00.000000000 +0300
38515+++ linux-2.6.24/fs/reiser4/plugin/file_plugin_common.c 2008-01-25 11:55:43.900543447 +0300
38516@@ -0,0 +1,1009 @@
38517+/* Copyright 2005 by Hans Reiser, licensing governed by
38518+ reiser4/README */
38519+
38520+/* this file contains typical implementations for most of the methods of
38521+   the file plugin
38522+*/
38523+
38524+#include "../inode.h"
38525+#include "object.h"
38526+#include "../safe_link.h"
38527+
38528+#include <linux/quotaops.h>
38529+
38530+static int insert_new_sd(struct inode *inode);
38531+static int update_sd(struct inode *inode);
38532+
38533+/* this is the common implementation of the write_sd_by_inode method of the
38534+   file plugin: either insert stat data or update it
38535+ */
38536+int write_sd_by_inode_common(struct inode *inode /* object to save */ )
38537+{
38538+ int result;
38539+
38540+ assert("nikita-730", inode != NULL);
38541+
38542+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
38543+ /* object doesn't have stat-data yet */
38544+ result = insert_new_sd(inode);
38545+ else
38546+ result = update_sd(inode);
38547+ if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
38548+ /* Don't issue warnings about "name is too long" */
38549+ warning("nikita-2221", "Failed to save sd for %llu: %i",
38550+ (unsigned long long)get_inode_oid(inode), result);
38551+ return result;
38552+}
38553+
38554+/* this is the common implementation of the key_by_inode method of the file
38555+   plugin */
38556+int
38557+key_by_inode_and_offset_common(struct inode *inode, loff_t off,
38558+ reiser4_key * key)
38559+{
38560+ reiser4_key_init(key);
38561+ set_key_locality(key, reiser4_inode_data(inode)->locality_id);
38562+ set_key_ordering(key, get_inode_ordering(inode));
38563+ set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */
38564+ set_key_type(key, KEY_BODY_MINOR);
38565+ set_key_offset(key, (__u64) off);
38566+ return 0;
38567+}
38568+
38569+/* this is the common implementation of the set_plug_in_inode method of the
38570+   file plugin */
38571+int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
38572+ struct inode *parent /* parent object */ ,
38573+ reiser4_object_create_data * data /* creational
38574+ * data */ )
38575+{
38576+ __u64 mask;
38577+
38578+ object->i_mode = data->mode;
38579+	/* this should be a plugin decision */
38580+ object->i_uid = current->fsuid;
38581+ object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
38582+
38583+	/* support for BSD style group-id assignment. See the description of
38584+	   the bsdgroups ext2 mount option in mount(8) for more details */
38585+ if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
38586+ object->i_gid = parent->i_gid;
38587+ else if (parent->i_mode & S_ISGID) {
38588+		/* parent directory has the sgid bit set */
38589+ object->i_gid = parent->i_gid;
38590+ if (S_ISDIR(object->i_mode))
38591+			/* sgid is inherited by sub-directories */
38592+ object->i_mode |= S_ISGID;
38593+ } else
38594+ object->i_gid = current->fsgid;
38595+
38596+ /* this object doesn't have stat-data yet */
38597+ reiser4_inode_set_flag(object, REISER4_NO_SD);
38598+#if 0
38599+ /* this is now called after all inode plugins are initialized:
38600+ do_create_vfs_child after adjust_to_parent */
38601+ /* setup inode and file-operations for this inode */
38602+ setup_inode_ops(object, data);
38603+#endif
38604+ object->i_nlink = 0;
38605+ reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
38606+ mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
38607+ if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
38608+ mask |= (1 << LARGE_TIMES_STAT);
38609+
38610+ reiser4_inode_data(object)->extmask = mask;
38611+ return 0;
38612+}
38613+
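The group-id logic in set_plug_in_inode_common() above boils down to a three-way decision. A small stand-alone sketch, assuming a plain boolean for the REISER4_BSD_GID mount flag; the helper name and signature are illustrative only:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

/* Sketch of the gid policy above: bsdgroups mount option -> inherit the
 * parent's gid; setgid parent directory -> inherit the gid and propagate
 * S_ISGID to sub-directories; otherwise -> creator's fsgid. */
static gid_t choose_gid(int bsdgroups, mode_t *obj_mode,
			mode_t parent_mode, gid_t parent_gid,
			gid_t creator_fsgid)
{
	if (bsdgroups)
		return parent_gid;
	if (parent_mode & S_ISGID) {
		if (S_ISDIR(*obj_mode))
			*obj_mode |= S_ISGID;	/* inherited by subdirs */
		return parent_gid;
	}
	return creator_fsgid;
}

int main(void)
{
	mode_t mode = S_IFDIR | 0755;

	/* setgid parent: child directory inherits gid 100 and S_ISGID */
	printf("gid=%d setgid=%d\n",
	       (int)choose_gid(0, &mode, S_IFDIR | S_ISGID | 0755, 100, 200),
	       !!(mode & S_ISGID));
	return 0;
}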
38614+/* this is the common implementation of the adjust_to_parent method of the
38615+   file plugin for regular files
38616+ */
38617+int adjust_to_parent_common(struct inode *object /* new object */ ,
38618+ struct inode *parent /* parent directory */ ,
38619+ struct inode *root /* root directory */ )
38620+{
38621+ assert("nikita-2165", object != NULL);
38622+ if (parent == NULL)
38623+ parent = root;
38624+ assert("nikita-2069", parent != NULL);
38625+
38626+ /*
38627+ * inherit missing plugins from parent
38628+ */
38629+
38630+ grab_plugin_pset(object, parent, PSET_FILE);
38631+ grab_plugin_pset(object, parent, PSET_SD);
38632+ grab_plugin_pset(object, parent, PSET_FORMATTING);
38633+ grab_plugin_pset(object, parent, PSET_PERM);
38634+ return 0;
38635+}
38636+
38637+/* this is the common implementation of the adjust_to_parent method of the
38638+   file plugin for typical directories
38639+ */
38640+int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
38641+ struct inode *parent /* parent directory */ ,
38642+ struct inode *root /* root directory */ )
38643+{
38644+ int result = 0;
38645+ pset_member memb;
38646+
38647+ assert("nikita-2166", object != NULL);
38648+ if (parent == NULL)
38649+ parent = root;
38650+ assert("nikita-2167", parent != NULL);
38651+
38652+ /*
38653+ * inherit missing plugins from parent
38654+ */
38655+ for (memb = 0; memb < PSET_LAST; ++memb) {
38656+ result = grab_plugin_pset(object, parent, memb);
38657+ if (result != 0)
38658+ break;
38659+ }
38660+ return result;
38661+}
38662+
38663+int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
38664+ struct inode *parent /* parent directory */,
38665+ struct inode *root /* root directory */)
38666+{
38667+ int result;
38668+ result = adjust_to_parent_common(object, parent, root);
38669+ if (result)
38670+ return result;
38671+ assert("edward-1416", parent != NULL);
38672+
38673+ grab_plugin_pset(object, parent, PSET_CLUSTER);
38674+ grab_plugin_pset(object, parent, PSET_CIPHER);
38675+ grab_plugin_pset(object, parent, PSET_DIGEST);
38676+ grab_plugin_pset(object, parent, PSET_COMPRESSION);
38677+ grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE);
38678+
38679+ return 0;
38680+}
38681+
38682+/* this is the common implementation of the create_object method of the file
38683+   plugin */
38684+int reiser4_create_object_common(struct inode *object, struct inode *parent,
38685+ reiser4_object_create_data * data)
38686+{
38687+ reiser4_block_nr reserve;
38688+ assert("nikita-744", object != NULL);
38689+ assert("nikita-745", parent != NULL);
38690+ assert("nikita-747", data != NULL);
38691+ assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
38692+
38693+ reserve = estimate_create_common(object);
38694+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
38695+ return RETERR(-ENOSPC);
38696+ return write_sd_by_inode_common(object);
38697+}
38698+
38699+static int common_object_delete_no_reserve(struct inode *inode);
38700+
38701+/**
38702+ * reiser4_delete_object_common - delete_object of file_plugin
38703+ * @inode: inode to be deleted
38704+ *
38705+ * This is the common implementation of the delete_object method of
38706+ * file_plugin. It applies to objects whose deletion consists of removing two
38707+ * items: stat data and safe-link.
38708+ */
38709+int reiser4_delete_object_common(struct inode *inode)
38710+{
38711+ int result;
38712+
38713+ assert("nikita-1477", inode != NULL);
38714+ /* FIXME: if file body deletion failed (i/o error, for instance),
38715+ inode->i_size can be != 0 here */
38716+ assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
38717+ assert("nikita-3421", inode->i_nlink == 0);
38718+
38719+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
38720+ reiser4_block_nr reserve;
38721+
38722+ /* grab space which is needed to remove 2 items from the tree:
38723+ stat data and safe-link */
38724+ reserve = 2 *
38725+ estimate_one_item_removal(reiser4_tree_by_inode(inode));
38726+ if (reiser4_grab_space_force(reserve,
38727+ BA_RESERVED | BA_CAN_COMMIT))
38728+ return RETERR(-ENOSPC);
38729+ result = common_object_delete_no_reserve(inode);
38730+ } else
38731+ result = 0;
38732+ return result;
38733+}
38734+
38735+/**
38736+ * reiser4_delete_dir_common - delete_object of file_plugin
38737+ * @inode: inode to be deleted
38738+ *
38739+ * This is the common implementation of the delete_object method of
38740+ * file_plugin for a typical directory. It calls the done method of dir_plugin
38741+ * to remove "." and removes stat data and safe-link.
38742+ */
38743+int reiser4_delete_dir_common(struct inode *inode)
38744+{
38745+ int result;
38746+ dir_plugin *dplug;
38747+
38748+ assert("", (get_current_context() &&
38749+ get_current_context()->trans->atom == NULL));
38750+
38751+ dplug = inode_dir_plugin(inode);
38752+ assert("vs-1101", dplug && dplug->done);
38753+
38754+ /* kill cursors which might be attached to inode */
38755+ reiser4_kill_cursors(inode);
38756+
38757+ /* grab space enough for removing two items */
38758+ if (reiser4_grab_space
38759+ (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)),
38760+ BA_RESERVED | BA_CAN_COMMIT))
38761+ return RETERR(-ENOSPC);
38762+
38763+ result = dplug->done(inode);
38764+ if (!result)
38765+ result = common_object_delete_no_reserve(inode);
38766+ return result;
38767+}
38768+
38769+/* this is the common implementation of the add_link method of the file
38770+   plugin */
38771+int reiser4_add_link_common(struct inode *object, struct inode *parent)
38772+{
38773+ /*
38774+ * increment ->i_nlink and update ->i_ctime
38775+ */
38776+
38777+ INODE_INC_FIELD(object, i_nlink);
38778+ object->i_ctime = CURRENT_TIME;
38779+ return 0;
38780+}
38781+
38782+/* this is the common implementation of the rem_link method of the file
38783+   plugin */
38784+int reiser4_rem_link_common(struct inode *object, struct inode *parent)
38785+{
38786+ assert("nikita-2021", object != NULL);
38787+ assert("nikita-2163", object->i_nlink > 0);
38788+
38789+ /*
38790+ * decrement ->i_nlink and update ->i_ctime
38791+ */
38792+
38793+ INODE_DEC_FIELD(object, i_nlink);
38794+ object->i_ctime = CURRENT_TIME;
38795+ return 0;
38796+}
38797+
38798+/* this is the common implementation of the rem_link method of the file
38799+   plugin for a typical directory
38800+*/
38801+int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
38802+{
38803+ assert("nikita-20211", object != NULL);
38804+ assert("nikita-21631", object->i_nlink > 0);
38805+
38806+ /*
38807+ * decrement ->i_nlink and update ->i_ctime
38808+ */
38809+ INODE_DEC_FIELD(object, i_nlink);
38810+ if (object->i_nlink == 1)
38811+ INODE_DEC_FIELD(object, i_nlink);
38812+ object->i_ctime = CURRENT_TIME;
38813+ return 0;
38814+}
38815+
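The double decrement in rem_link_common_dir() above reflects standard directory link accounting: an empty directory has i_nlink == 2 (the name in its parent plus its own "." entry), so removing the last name must take the count straight to zero. A tiny arithmetic check of that rule:

#include <assert.h>

/* Sketch of the nlink arithmetic assumed by rem_link_common_dir() */
static unsigned dir_rem_link(unsigned nlink)
{
	--nlink;		/* the removed name */
	if (nlink == 1)
		--nlink;	/* the directory's own "." entry */
	return nlink;
}

int main(void)
{
	assert(dir_rem_link(2) == 0);	/* rmdir of an empty directory */
	assert(dir_rem_link(3) == 2);	/* subdirs' ".." links remain */
	return 0;
}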
38816+/* this is the common implementation of the owns_item method of the file
38817+   plugin: compare the objectids of the keys in the inode and the coord */
38818+int owns_item_common(const struct inode *inode, /* object to check
38819+ * against */
38820+ const coord_t * coord /* coord to check */ )
38821+{
38822+ reiser4_key item_key;
38823+ reiser4_key file_key;
38824+
38825+ assert("nikita-760", inode != NULL);
38826+ assert("nikita-761", coord != NULL);
38827+
38828+ return coord_is_existing_item(coord) &&
38829+ (get_key_objectid(build_sd_key(inode, &file_key)) ==
38830+ get_key_objectid(item_key_by_coord(coord, &item_key)));
38831+}
38832+
38833+/* this is the common implementation of the owns_item method of the file
38834+   plugin for a typical directory
38835+*/
38836+int owns_item_common_dir(const struct inode *inode, /* object to check against */
38837+ const coord_t * coord /* coord of item to check */ )
38838+{
38839+ reiser4_key item_key;
38840+
38841+ assert("nikita-1335", inode != NULL);
38842+ assert("nikita-1334", coord != NULL);
38843+
38844+ if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
38845+ return get_key_locality(item_key_by_coord(coord, &item_key)) ==
38846+ get_inode_oid(inode);
38847+ else
38848+ return owns_item_common(inode, coord);
38849+}
38850+
38851+/* this is the common implementation of the can_add_link method of the file
38852+   plugin: checks whether yet another hard link to this object can be added
38853+*/
38854+int can_add_link_common(const struct inode *object /* object to check */ )
38855+{
38856+ assert("nikita-732", object != NULL);
38857+
38858+ /* inode->i_nlink is unsigned int, so just check for integer
38859+ overflow */
38860+ return object->i_nlink + 1 != 0;
38861+}
38862+
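The `i_nlink + 1 != 0` test in can_add_link_common() above is the usual idiom for detecting that an unsigned counter is saturated; a one-file demonstration:

#include <assert.h>
#include <limits.h>

int main(void)
{
	unsigned int n = UINT_MAX;

	/* an unsigned int at UINT_MAX wraps to 0 when incremented, so
	 * the test fails exactly when one more link is unrepresentable */
	assert(n + 1 == 0);		/* wrap-around */
	assert(!(n + 1 != 0));		/* can_add_link would refuse */
	n = 5;
	assert(n + 1 != 0);		/* plenty of room */
	return 0;
}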
38863+/* this is the common implementation of the can_rem_link method of the file
38864+   plugin for a typical directory
38865+*/
38866+int can_rem_link_common_dir(const struct inode *inode)
38867+{
38868+	/* is_dir_empty() returns 0 if the dir is empty */
38869+ return !is_dir_empty(inode);
38870+}
38871+
38872+/* this is the common implementation of the detach method of the file plugin
38873+   for a typical directory
38874+*/
38875+int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
38876+{
38877+ dir_plugin *dplug;
38878+
38879+ dplug = inode_dir_plugin(child);
38880+ assert("nikita-2883", dplug != NULL);
38881+ assert("nikita-2884", dplug->detach != NULL);
38882+ return dplug->detach(child, parent);
38883+}
38884+
38885+/* this is the common implementation of the bind method of the file plugin
38886+   for a typical directory
38887+*/
38888+int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
38889+{
38890+ dir_plugin *dplug;
38891+
38892+ dplug = inode_dir_plugin(child);
38893+ assert("nikita-2646", dplug != NULL);
38894+ return dplug->attach(child, parent);
38895+}
38896+
38897+static int process_truncate(struct inode *, __u64 size);
38898+
38899+/* this is the common implementation of the safelink method of the file
38900+   plugin */
38901+int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
38902+{
38903+ int result;
38904+
38905+ assert("vs-1705", get_current_context()->trans->atom == NULL);
38906+ if (link == SAFE_UNLINK)
38907+		/* nothing to do. iput() in the caller (process_safelink)
38908+		 * will finish with the file */
38909+ result = 0;
38910+ else if (link == SAFE_TRUNCATE)
38911+ result = process_truncate(object, value);
38912+ else {
38913+ warning("nikita-3438", "Unrecognized safe-link type: %i", link);
38914+ result = RETERR(-EIO);
38915+ }
38916+ return result;
38917+}
38918+
38919+/* this is the common implementation of the estimate.create method of the
38920+   file plugin; can be used when object creation involves insertion of one
38921+   item (usually stat data) into the tree
38922+*/
38923+reiser4_block_nr estimate_create_common(const struct inode * object)
38924+{
38925+ return estimate_one_insert_item(reiser4_tree_by_inode(object));
38926+}
38927+
38928+/* this is the common implementation of the estimate.create method of the
38929+   file plugin for a typical directory;
38930+   can be used when directory creation involves insertion of two items
38931+   (usually stat data and the item containing "." and "..") into the tree
38932+*/
38933+reiser4_block_nr estimate_create_common_dir(const struct inode * object)
38934+{
38935+ return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object));
38936+}
38937+
38938+/* this is the common implementation of the estimate.update method of the
38939+   file plugin; can be used when a stat data update does no more than insert
38940+   a unit into a stat data item, which is probably true in most cases
38941+*/
38942+reiser4_block_nr estimate_update_common(const struct inode * inode)
38943+{
38944+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
38945+}
38946+
38947+/* this is the common implementation of the estimate.unlink method of the
38948+   file plugin */
38949+reiser4_block_nr
38950+estimate_unlink_common(const struct inode * object UNUSED_ARG,
38951+ const struct inode * parent UNUSED_ARG)
38952+{
38953+ return 0;
38954+}
38955+
38956+/* this is the common implementation of the estimate.unlink method of the
38957+   file plugin for a typical directory
38958+*/
38959+reiser4_block_nr
38960+estimate_unlink_common_dir(const struct inode * object,
38961+ const struct inode * parent)
38962+{
38963+ dir_plugin *dplug;
38964+
38965+ dplug = inode_dir_plugin(object);
38966+ assert("nikita-2888", dplug != NULL);
38967+ assert("nikita-2887", dplug->estimate.unlink != NULL);
38968+ return dplug->estimate.unlink(object, parent);
38969+}
38970+
38971+char *wire_write_common(struct inode *inode, char *start)
38972+{
38973+ return build_inode_onwire(inode, start);
38974+}
38975+
38976+char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
38977+{
38978+ if (!obj)
38979+ return locate_obj_key_id_onwire(addr);
38980+ return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
38981+}
38982+
38983+struct dentry *wire_get_common(struct super_block *sb,
38984+ reiser4_object_on_wire * obj)
38985+{
38986+ struct inode *inode;
38987+ struct dentry *dentry;
38988+ reiser4_key key;
38989+
38990+ extract_key_from_id(&obj->u.std.key_id, &key);
38991+ inode = reiser4_iget(sb, &key, 1);
38992+ if (!IS_ERR(inode)) {
38993+ reiser4_iget_complete(inode);
38994+ dentry = d_alloc_anon(inode);
38995+ if (dentry == NULL) {
38996+ iput(inode);
38997+ dentry = ERR_PTR(-ENOMEM);
38998+ } else
38999+ dentry->d_op = &get_super_private(sb)->ops.dentry;
39000+ } else if (PTR_ERR(inode) == -ENOENT)
39001+ /*
39002+ * inode wasn't found at the key encoded in the file
39003+ * handle. Hence, file handle is stale.
39004+ */
39005+ dentry = ERR_PTR(RETERR(-ESTALE));
39006+ else
39007+ dentry = (void *)inode;
39008+ return dentry;
39009+}
39010+
39011+int wire_size_common(struct inode *inode)
39012+{
39013+ return inode_onwire_size(inode);
39014+}
39015+
39016+void wire_done_common(reiser4_object_on_wire * obj)
39017+{
39018+ /* nothing to do */
39019+}
39020+
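The wire_* methods above implement the two halves of a file-handle round trip: serialize the object's key identity into a caller-supplied buffer, and later re-derive a dentry from it (returning -ESTALE if the object is gone). A user-space sketch of just the encode/decode shape, using a bare 64-bit objectid instead of a full reiser4 key; all names below are hypothetical:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* encode: serialize the object's identity into the handle buffer and
 * return the next free byte, mirroring the char * return convention
 * of wire_write_common() above */
static char *wire_write(uint64_t objectid, char *buf)
{
	memcpy(buf, &objectid, sizeof(objectid));
	return buf + sizeof(objectid);
}

/* decode: parse the identity back out, mirroring wire_read_common() */
static char *wire_read(char *buf, uint64_t *objectid)
{
	memcpy(objectid, buf, sizeof(*objectid));
	return buf + sizeof(*objectid);
}

int main(void)
{
	char handle[sizeof(uint64_t)];
	uint64_t oid;

	wire_write(42, handle);
	wire_read(handle, &oid);
	/* a real decoder would now look the object up by key and
	 * return -ESTALE if it no longer exists */
	printf("decoded oid: %llu\n", (unsigned long long)oid);
	return 0;
}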
39021+/* helper function to print errors */
39022+static void key_warning(const reiser4_key * key /* key to print */ ,
39023+ const struct inode *inode,
39024+ int code /* error code to print */ )
39025+{
39026+ assert("nikita-716", key != NULL);
39027+
39028+ if (code != -ENOMEM) {
39029+ warning("nikita-717", "Error for inode %llu (%i)",
39030+ (unsigned long long)get_key_objectid(key), code);
39031+ reiser4_print_key("for key", key);
39032+ }
39033+}
39034+
39035+/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
39036+#if REISER4_DEBUG
39037+static void
39038+check_inode_seal(const struct inode *inode,
39039+ const coord_t * coord, const reiser4_key * key)
39040+{
39041+ reiser4_key unit_key;
39042+
39043+ unit_key_by_coord(coord, &unit_key);
39044+ assert("nikita-2752",
39045+ WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
39046+ assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
39047+}
39048+
39049+static void check_sd_coord(coord_t * coord, const reiser4_key * key)
39050+{
39051+ reiser4_key ukey;
39052+
39053+ coord_clear_iplug(coord);
39054+ if (zload(coord->node))
39055+ return;
39056+
39057+ if (!coord_is_existing_unit(coord) ||
39058+ !item_plugin_by_coord(coord) ||
39059+ !keyeq(unit_key_by_coord(coord, &ukey), key) ||
39060+ (znode_get_level(coord->node) != LEAF_LEVEL) ||
39061+ !item_is_statdata(coord)) {
39062+ warning("nikita-1901", "Conspicuous seal");
39063+ reiser4_print_key("key", key);
39064+ print_coord("coord", coord, 1);
39065+ impossible("nikita-2877", "no way");
39066+ }
39067+ zrelse(coord->node);
39068+}
39069+
39070+#else
39071+#define check_inode_seal(inode, coord, key) noop
39072+#define check_sd_coord(coord, key) noop
39073+#endif
39074+
39075+/* insert new stat-data into the tree. Called with inode state
39076+   locked. Returns with inode state locked. */
39077+static int insert_new_sd(struct inode *inode /* inode to create sd for */ )
39078+{
39079+ int result;
39080+ reiser4_key key;
39081+ coord_t coord;
39082+ reiser4_item_data data;
39083+ char *area;
39084+ reiser4_inode *ref;
39085+ lock_handle lh;
39086+ oid_t oid;
39087+
39088+ assert("nikita-723", inode != NULL);
39089+ assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD));
39090+
39091+ ref = reiser4_inode_data(inode);
39092+ spin_lock_inode(inode);
39093+
39094+ if (ref->plugin_mask != 0)
39095+ /* inode has non-standard plugins */
39096+ inode_set_extension(inode, PLUGIN_STAT);
39097+ /*
39098+	 * prepare the specification of the new item to be inserted
39099+ */
39100+
39101+ data.iplug = inode_sd_plugin(inode);
39102+ data.length = data.iplug->s.sd.save_len(inode);
39103+ spin_unlock_inode(inode);
39104+
39105+ data.data = NULL;
39106+ data.user = 0;
39107+/* could be optimized for case where there is only one node format in
39108+ * use in the filesystem, probably there are lots of such
39109+ * places we could optimize for only one node layout.... -Hans */
39110+ if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()){
39111+		/* This is a crude check, since we don't know the actual node
39112+		   the insertion will go into. */
39113+ return RETERR(-ENAMETOOLONG);
39114+ }
39115+ oid = oid_allocate(inode->i_sb);
39116+/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */
39117+ if (oid == ABSOLUTE_MAX_OID)
39118+ return RETERR(-EOVERFLOW);
39119+
39120+ set_inode_oid(inode, oid);
39121+
39122+ coord_init_zero(&coord);
39123+ init_lh(&lh);
39124+
39125+ result = insert_by_key(reiser4_tree_by_inode(inode),
39126+ build_sd_key(inode, &key), &data, &coord, &lh,
39127+ /* stat data lives on a leaf level */
39128+ LEAF_LEVEL, CBK_UNIQUE);
39129+
39130+	/* we don't want to re-check that somebody didn't insert
39131+	   stat-data while we were doing io, because if somebody did,
39132+	   insert_by_key() returned an error. */
39133+	/* but what _is_ possible is that the plugin for the inode's
39134+	   stat-data, the list of non-standard plugins, or their state
39135+	   could change during io, so that the stat-data wouldn't fit into
39136+	   the sd. To avoid this race we keep the inode_state lock. This
39137+	   lock has to be taken each time you access the inode in a way
39138+	   that could change the sd size: changing plugins etc.
39139+	 */
39140+
39141+ if (result == IBK_INSERT_OK) {
39142+ coord_clear_iplug(&coord);
39143+ result = zload(coord.node);
39144+ if (result == 0) {
39145+ /* have we really inserted stat data? */
39146+ assert("nikita-725", item_is_statdata(&coord));
39147+
39148+ /* inode was just created. It is inserted into hash
39149+ table, but no directory entry was yet inserted into
39150+ parent. So, inode is inaccessible through
39151+ ->lookup(). All places that directly grab inode
39152+ from hash-table (like old knfsd), should check
39153+ IMMUTABLE flag that is set by common_create_child.
39154+ */
39155+ assert("nikita-3240", data.iplug != NULL);
39156+ assert("nikita-3241", data.iplug->s.sd.save != NULL);
39157+ area = item_body_by_coord(&coord);
39158+ result = data.iplug->s.sd.save(inode, &area);
39159+ znode_make_dirty(coord.node);
39160+ if (result == 0) {
39161+ /* object has stat-data now */
39162+ reiser4_inode_clr_flag(inode, REISER4_NO_SD);
39163+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
39164+ /* initialise stat-data seal */
39165+ reiser4_seal_init(&ref->sd_seal, &coord, &key);
39166+ ref->sd_coord = coord;
39167+ check_inode_seal(inode, &coord, &key);
39168+ } else if (result != -ENOMEM)
39169+ /*
39170+ * convert any other error code to -EIO to
39171+ * avoid confusing user level with unexpected
39172+ * errors.
39173+ */
39174+ result = RETERR(-EIO);
39175+ zrelse(coord.node);
39176+ }
39177+ }
39178+ done_lh(&lh);
39179+
39180+ if (result != 0)
39181+ key_warning(&key, inode, result);
39182+ else
39183+ oid_count_allocated();
39184+
39185+ return result;
39186+}
39187+
39188+/* find sd of inode in a tree, deal with errors */
39189+int lookup_sd(struct inode *inode /* inode to look sd for */ ,
39190+ znode_lock_mode lock_mode /* lock mode */ ,
39191+ coord_t * coord /* resulting coord */ ,
39192+ lock_handle * lh /* resulting lock handle */ ,
39193+ const reiser4_key * key /* resulting key */ ,
39194+ int silent)
39195+{
39196+ int result;
39197+ __u32 flags;
39198+
39199+ assert("nikita-1692", inode != NULL);
39200+ assert("nikita-1693", coord != NULL);
39201+ assert("nikita-1694", key != NULL);
39202+
39203+	/* look for the object's stat data in the tree.
39204+	   This returns, in "node", a pointer to a locked znode and, in
39205+	   "pos", the position of the item found in the node. Both are only
39206+	   valid if coord_found is returned. */
39207+ flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
39208+ flags |= CBK_UNIQUE;
39209+ /*
39210+	 * traverse the tree to find stat data. We cannot use vroot here,
39211+	 * because it only covers the _body_ of the file, and stat data don't
39212+	 * belong there.
39213+ */
39214+ result = coord_by_key(reiser4_tree_by_inode(inode),
39215+ key,
39216+ coord,
39217+ lh,
39218+ lock_mode,
39219+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
39220+ if (REISER4_DEBUG && result == 0)
39221+ check_sd_coord(coord, key);
39222+
39223+ if (result != 0 && !silent)
39224+ key_warning(key, inode, result);
39225+ return result;
39226+}
39227+
39228+static int
39229+locate_inode_sd(struct inode *inode,
39230+ reiser4_key * key, coord_t * coord, lock_handle * lh)
39231+{
39232+ reiser4_inode *state;
39233+ seal_t seal;
39234+ int result;
39235+
39236+ assert("nikita-3483", inode != NULL);
39237+
39238+ state = reiser4_inode_data(inode);
39239+ spin_lock_inode(inode);
39240+ *coord = state->sd_coord;
39241+ coord_clear_iplug(coord);
39242+ seal = state->sd_seal;
39243+ spin_unlock_inode(inode);
39244+
39245+ build_sd_key(inode, key);
39246+ if (reiser4_seal_is_set(&seal)) {
39247+ /* first, try to use seal */
39248+ result = reiser4_seal_validate(&seal,
39249+ coord,
39250+ key,
39251+ lh, ZNODE_WRITE_LOCK,
39252+ ZNODE_LOCK_LOPRI);
39253+ if (result == 0)
39254+ check_sd_coord(coord, key);
39255+ } else
39256+ result = -E_REPEAT;
39257+
39258+ if (result != 0) {
39259+ coord_init_zero(coord);
39260+ result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
39261+ }
39262+ return result;
39263+}
39264+
39265+#if REISER4_DEBUG
39266+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
39267+{
39268+ return (get_key_locality(k1) == get_key_locality(k2) &&
39269+ get_key_type(k1) == get_key_type(k2) &&
39270+ get_key_band(k1) == get_key_band(k2) &&
39271+ get_key_ordering(k1) == get_key_ordering(k2) &&
39272+ get_key_objectid(k1) == get_key_objectid(k2));
39273+}
39274+
39275+#include "../tree_walk.h"
39276+
39277+/* make some checks before and after stat-data resize operation */
39278+static int check_sd_resize(struct inode * inode, coord_t * coord,
39279+ int length, int progress /* 1 means after resize */)
39280+{
39281+ int ret = 0;
39282+ lock_handle left_lock;
39283+ coord_t left_coord;
39284+ reiser4_key left_key;
39285+ reiser4_key key;
39286+
39287+ if (inode_file_plugin(inode) !=
39288+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
39289+ return 0;
39290+ if (!length)
39291+ return 0;
39292+ if (coord->item_pos != 0)
39293+ return 0;
39294+
39295+ init_lh(&left_lock);
39296+ ret = reiser4_get_left_neighbor(&left_lock,
39297+ coord->node,
39298+ ZNODE_WRITE_LOCK,
39299+ GN_CAN_USE_UPPER_LEVELS);
39300+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
39301+ ret == -ENOENT || ret == -EINVAL
39302+ || ret == -E_DEADLOCK) {
39303+ ret = 0;
39304+ goto exit;
39305+ }
39306+ ret = zload(left_lock.node);
39307+ if (ret)
39308+ goto exit;
39309+ coord_init_last_unit(&left_coord, left_lock.node);
39310+ item_key_by_coord(&left_coord, &left_key);
39311+ item_key_by_coord(coord, &key);
39312+
39313+ if (all_but_offset_key_eq(&key, &left_key))
39314+		/* corruption occurred */
39315+ ret = 1;
39316+ zrelse(left_lock.node);
39317+ exit:
39318+ done_lh(&left_lock);
39319+ return ret;
39320+}
39321+#endif
39322+
39323+/* update stat-data at @coord */
39324+static int
39325+update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key,
39326+ lock_handle * lh)
39327+{
39328+ int result;
39329+ reiser4_item_data data;
39330+ char *area;
39331+ reiser4_inode *state;
39332+ znode *loaded;
39333+
39334+ state = reiser4_inode_data(inode);
39335+
39336+ coord_clear_iplug(coord);
39337+ result = zload(coord->node);
39338+ if (result != 0)
39339+ return result;
39340+ loaded = coord->node;
39341+
39342+ spin_lock_inode(inode);
39343+ assert("nikita-728", inode_sd_plugin(inode) != NULL);
39344+ data.iplug = inode_sd_plugin(inode);
39345+
39346+ /* if inode has non-standard plugins, add appropriate stat data
39347+ * extension */
39348+ if (state->extmask & (1 << PLUGIN_STAT)) {
39349+ if (state->plugin_mask == 0)
39350+ inode_clr_extension(inode, PLUGIN_STAT);
39351+ } else if (state->plugin_mask != 0)
39352+ inode_set_extension(inode, PLUGIN_STAT);
39353+
39354+ if (state->extmask & (1 << HEIR_STAT)) {
39355+ if (state->heir_mask == 0)
39356+ inode_clr_extension(inode, HEIR_STAT);
39357+ } else if (state->heir_mask != 0)
39358+ inode_set_extension(inode, HEIR_STAT);
39359+
39360+	/* data.length is how much space to add to (or, if negative,
39361+	   remove from) the sd */
39362+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
39363+ /* recalculate stat-data length */
39364+ data.length =
39365+ data.iplug->s.sd.save_len(inode) -
39366+ item_length_by_coord(coord);
39367+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
39368+ } else
39369+ data.length = 0;
39370+ spin_unlock_inode(inode);
39371+
39372+	/* if the on-disk stat data has a different length than required
39373+	   for this inode, resize it */
39374+
39375+ if (data.length != 0) {
39376+ data.data = NULL;
39377+ data.user = 0;
39378+
39379+ assert("edward-1441",
39380+ !check_sd_resize(inode, coord,
39381+ data.length, 0/* before resize */));
39382+
39383+		/* the insertion code requires that the insertion point
39384+		 * (coord) be between units. */
39385+ coord->between = AFTER_UNIT;
39386+ result = reiser4_resize_item(coord, &data, key, lh,
39387+ COPI_DONT_SHIFT_LEFT);
39388+ if (result != 0) {
39389+ key_warning(key, inode, result);
39390+ zrelse(loaded);
39391+ return result;
39392+ }
39393+ if (loaded != coord->node) {
39394+ /* reiser4_resize_item moved coord to another node.
39395+ Zload it */
39396+ zrelse(loaded);
39397+ coord_clear_iplug(coord);
39398+ result = zload(coord->node);
39399+ if (result != 0)
39400+ return result;
39401+ loaded = coord->node;
39402+ }
39403+ assert("edward-1442",
39404+ !check_sd_resize(inode, coord,
39405+ data.length, 1/* after resize */));
39406+ }
39407+ area = item_body_by_coord(coord);
39408+ spin_lock_inode(inode);
39409+ result = data.iplug->s.sd.save(inode, &area);
39410+ znode_make_dirty(coord->node);
39411+
39412+ /* re-initialise stat-data seal */
39413+
39414+ /*
39415+ * coord.between was possibly skewed from AT_UNIT when stat-data size
39416+	 * was changed and new extensions were pasted into the item.
39417+ */
39418+ coord->between = AT_UNIT;
39419+ reiser4_seal_init(&state->sd_seal, coord, key);
39420+ state->sd_coord = *coord;
39421+ spin_unlock_inode(inode);
39422+ check_inode_seal(inode, coord, key);
39423+ zrelse(loaded);
39424+ return result;
39425+}
39426+
39427+/* Update existing stat-data in the tree. Called with inode state locked.
39428+   Returns with inode state locked. */
39429+static int update_sd(struct inode *inode /* inode to update sd for */ )
39430+{
39431+ int result;
39432+ reiser4_key key;
39433+ coord_t coord;
39434+ lock_handle lh;
39435+
39436+ assert("nikita-726", inode != NULL);
39437+
39438+ /* no stat-data, nothing to update?! */
39439+ assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
39440+
39441+ init_lh(&lh);
39442+
39443+ result = locate_inode_sd(inode, &key, &coord, &lh);
39444+ if (result == 0)
39445+ result = update_sd_at(inode, &coord, &key, &lh);
39446+ done_lh(&lh);
39447+
39448+ return result;
39449+}
39450+
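locate_inode_sd() above tries a cached seal first and falls back to the full-tree lookup_sd() only when the seal fails to validate. A user-space sketch of that validate-or-lookup pattern, with an illustrative version stamp standing in for the real znode bookkeeping:

#include <stdio.h>

struct node { unsigned long version; };
struct seal { struct node *node; unsigned long version; };

/* the seal is valid while the remembered node is unchanged */
static int seal_validate(const struct seal *s)
{
	return s->node != NULL && s->node->version == s->version;
}

int main(void)
{
	struct node n = { .version = 7 };
	struct seal s = { .node = &n, .version = n.version };

	printf("valid: %d\n", seal_validate(&s));	/* 1: fast path */
	n.version++;			/* node changed: seal broken */
	printf("valid: %d\n", seal_validate(&s));	/* 0: full lookup */
	return 0;
}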
39451+/* helper for reiser4_delete_object_common and reiser4_delete_dir_common.
39452+   Remove object stat data. Space for that must be reserved by the caller
39453+   beforehand. */
39454+static int
39455+common_object_delete_no_reserve(struct inode *inode /* object to remove */ )
39456+{
39457+ int result;
39458+
39459+ assert("nikita-1477", inode != NULL);
39460+
39461+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
39462+ reiser4_key sd_key;
39463+
39464+ DQUOT_FREE_INODE(inode);
39465+ DQUOT_DROP(inode);
39466+
39467+ build_sd_key(inode, &sd_key);
39468+ result =
39469+ reiser4_cut_tree(reiser4_tree_by_inode(inode),
39470+ &sd_key, &sd_key, NULL, 0);
39471+ if (result == 0) {
39472+ reiser4_inode_set_flag(inode, REISER4_NO_SD);
39473+ result = oid_release(inode->i_sb, get_inode_oid(inode));
39474+ if (result == 0) {
39475+ oid_count_released();
39476+
39477+ result = safe_link_del(reiser4_tree_by_inode(inode),
39478+ get_inode_oid(inode),
39479+ SAFE_UNLINK);
39480+ }
39481+ }
39482+ } else
39483+ result = 0;
39484+ return result;
39485+}
39486+
39487+/* helper for safelink_common */
39488+static int process_truncate(struct inode *inode, __u64 size)
39489+{
39490+ int result;
39491+ struct iattr attr;
39492+ file_plugin *fplug;
39493+ reiser4_context *ctx;
39494+ struct dentry dentry;
39495+
39496+ assert("vs-21", is_in_reiser4_context());
39497+ ctx = reiser4_init_context(inode->i_sb);
39498+ assert("vs-22", !IS_ERR(ctx));
39499+
39500+ attr.ia_size = size;
39501+ attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
39502+ fplug = inode_file_plugin(inode);
39503+
39504+ mutex_lock(&inode->i_mutex);
39505+ assert("vs-1704", get_current_context()->trans->atom == NULL);
39506+ dentry.d_inode = inode;
39507+ result = inode->i_op->setattr(&dentry, &attr);
39508+ mutex_unlock(&inode->i_mutex);
39509+
39510+ context_set_commit_async(ctx);
39511+ reiser4_exit_context(ctx);
39512+
39513+ return result;
39514+}
39515+
39516+/*
39517+ Local variables:
39518+ c-indentation-style: "K&R"
39519+ mode-name: "LC"
39520+ c-basic-offset: 8
39521+ tab-width: 8
39522+ fill-column: 80
39523+ scroll-step: 1
39524+ End:
39525+*/
39526diff -urN linux-2.6.24.orig/fs/reiser4/plugin/hash.c linux-2.6.24/fs/reiser4/plugin/hash.c
39527--- linux-2.6.24.orig/fs/reiser4/plugin/hash.c 1970-01-01 03:00:00.000000000 +0300
39528+++ linux-2.6.24/fs/reiser4/plugin/hash.c 2008-01-25 11:39:06.996223145 +0300
39529@@ -0,0 +1,353 @@
39530+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
39531+ * reiser4/README */
39532+
39533+/* Hash functions */
39534+
39535+#include "../debug.h"
39536+#include "plugin_header.h"
39537+#include "plugin.h"
39538+#include "../super.h"
39539+#include "../inode.h"
39540+
39541+#include <linux/types.h>
39542+
39543+/* old rupasov (yura) hash */
39544+static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
39545+ int len /* @name's length */ )
39546+{
39547+ int i;
39548+ int j;
39549+ int pow;
39550+ __u64 a;
39551+ __u64 c;
39552+
39553+ assert("nikita-672", name != NULL);
39554+ assert("nikita-673", len >= 0);
39555+
39556+ for (pow = 1, i = 1; i < len; ++i)
39557+ pow = pow * 10;
39558+
39559+ if (len == 1)
39560+ a = name[0] - 48;
39561+ else
39562+ a = (name[0] - 48) * pow;
39563+
39564+ for (i = 1; i < len; ++i) {
39565+ c = name[i] - 48;
39566+ for (pow = 1, j = i; j < len - 1; ++j)
39567+ pow = pow * 10;
39568+ a = a + c * pow;
39569+ }
39570+ for (; i < 40; ++i) {
39571+ c = '0' - 48;
39572+ for (pow = 1, j = i; j < len - 1; ++j)
39573+ pow = pow * 10;
39574+ a = a + c * pow;
39575+ }
39576+
39577+ for (; i < 256; ++i) {
39578+ c = i;
39579+ for (pow = 1, j = i; j < len - 1; ++j)
39580+ pow = pow * 10;
39581+ a = a + c * pow;
39582+ }
39583+
39584+ a = a << 7;
39585+ return a;
39586+}
39587+
39588+/* r5 hash */
39589+static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
39590+ int len UNUSED_ARG /* @name's length */ )
39591+{
39592+ __u64 a = 0;
39593+
39594+ assert("nikita-674", name != NULL);
39595+ assert("nikita-675", len >= 0);
39596+
39597+ while (*name) {
39598+ a += *name << 4;
39599+ a += *name >> 4;
39600+ a *= 11;
39601+ name++;
39602+ }
39603+ return a;
39604+}
39605+
39606+/* Keyed 32-bit hash function using TEA in a Davies-Meyer construction
39607+ H0 = Key
39608+ Hi = E Mi(Hi-1) + Hi-1
39609+
39610+ (see Applied Cryptography, 2nd edition, p448).
39611+
39612+ Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
39613+
39614+ Jeremy has agreed to the contents of reiserfs/README. -Hans
39615+
39616+ This code was blindly upgraded to __u64 by s/__u32/__u64/g.
39617+*/
39618+static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
39619+ int len /* @name's length */ )
39620+{
39621+ __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
39622+
39623+ __u64 h0 = k[0], h1 = k[1];
39624+ __u64 a, b, c, d;
39625+ __u64 pad;
39626+ int i;
39627+
39628+ assert("nikita-676", name != NULL);
39629+ assert("nikita-677", len >= 0);
39630+
39631+#define DELTA 0x9E3779B9u
39632+#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
39633+#define PARTROUNDS 6 /* 6 gets complete mixing */
39634+
39635+/* a, b, c, d - data; h0, h1 - accumulated hash */
39636+#define TEACORE(rounds) \
39637+ do { \
39638+ __u64 sum = 0; \
39639+ int n = rounds; \
39640+ __u64 b0, b1; \
39641+ \
39642+ b0 = h0; \
39643+ b1 = h1; \
39644+ \
39645+ do \
39646+ { \
39647+ sum += DELTA; \
39648+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
39649+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
39650+ } while(--n); \
39651+ \
39652+ h0 += b0; \
39653+ h1 += b1; \
39654+ } while(0)
39655+
39656+ pad = (__u64) len | ((__u64) len << 8);
39657+ pad |= pad << 16;
39658+
39659+ while (len >= 16) {
39660+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39661+ 16 | (__u64) name[3] << 24;
39662+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39663+ 16 | (__u64) name[7] << 24;
39664+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
39665+ 16 | (__u64) name[11] << 24;
39666+ d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
39667+ << 16 | (__u64) name[15] << 24;
39668+
39669+ TEACORE(PARTROUNDS);
39670+
39671+ len -= 16;
39672+ name += 16;
39673+ }
39674+
39675+ if (len >= 12) {
39676+ //assert(len < 16);
39677+ if (len >= 16)
39678+ *(int *)0 = 0;
39679+
39680+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39681+ 16 | (__u64) name[3] << 24;
39682+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39683+ 16 | (__u64) name[7] << 24;
39684+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
39685+ 16 | (__u64) name[11] << 24;
39686+
39687+ d = pad;
39688+ for (i = 12; i < len; i++) {
39689+ d <<= 8;
39690+ d |= name[i];
39691+ }
39692+ } else if (len >= 8) {
39693+ //assert(len < 12);
39694+ if (len >= 12)
39695+ *(int *)0 = 0;
39696+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39697+ 16 | (__u64) name[3] << 24;
39698+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39699+ 16 | (__u64) name[7] << 24;
39700+
39701+ c = d = pad;
39702+ for (i = 8; i < len; i++) {
39703+ c <<= 8;
39704+ c |= name[i];
39705+ }
39706+ } else if (len >= 4) {
39707+ //assert(len < 8);
39708+ if (len >= 8)
39709+ *(int *)0 = 0;
39710+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39711+ 16 | (__u64) name[3] << 24;
39712+
39713+ b = c = d = pad;
39714+ for (i = 4; i < len; i++) {
39715+ b <<= 8;
39716+ b |= name[i];
39717+ }
39718+ } else {
39719+ //assert(len < 4);
39720+ if (len >= 4)
39721+ *(int *)0 = 0;
39722+ a = b = c = d = pad;
39723+ for (i = 0; i < len; i++) {
39724+ a <<= 8;
39725+ a |= name[i];
39726+ }
39727+ }
39728+
39729+ TEACORE(FULLROUNDS);
39730+
39731+/* return 0;*/
39732+ return h0 ^ h1;
39733+
39734+}
39735+
39736+/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
39737+
39738+ See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
39739+
39740+ Excerpts:
39741+
39742+ FNV hashes are designed to be fast while maintaining a low collision
39743+ rate.
39744+
39745+ [This version also seems to preserve lexicographical order locally.]
39746+
39747+ FNV hash algorithms and source code have been released into the public
39748+ domain.
39749+
39750+*/
39751+static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
39752+ int len UNUSED_ARG /* @name's length */ )
39753+{
39754+ unsigned long long a = 0xcbf29ce484222325ull;
39755+ const unsigned long long fnv_64_prime = 0x100000001b3ull;
39756+
39757+ assert("nikita-678", name != NULL);
39758+ assert("nikita-679", len >= 0);
39759+
39760+ /* FNV-1 hash each octet in the buffer */
39761+ for (; *name; ++name) {
39762+		/* multiply by the 64 bit FNV magic prime mod 2^64 */
39763+ a *= fnv_64_prime;
39764+ /* xor the bottom with the current octet */
39765+ a ^= (unsigned long long)(*name);
39766+ }
39767+ /* return our new hash value */
39768+ return a;
39769+}
39770+
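Likewise, hash_fnv1() above uses the published 64-bit FNV-1 parameters (offset basis 0xcbf29ce484222325, prime 0x100000001b3); a stand-alone version for quick testing:

#include <stdint.h>
#include <stdio.h>

/* stand-alone FNV-1: start from the offset basis, then for each octet
 * multiply by the FNV prime and xor the octet in */
static uint64_t fnv1(const unsigned char *name)
{
	uint64_t a = 0xcbf29ce484222325ull;	/* offset basis */
	const uint64_t prime = 0x100000001b3ull;	/* FNV prime */

	for (; *name; ++name) {
		a *= prime;
		a ^= *name;
	}
	return a;
}

int main(void)
{
	printf("%016llx\n",
	       (unsigned long long)fnv1((const unsigned char *)"reiser4"));
	return 0;
}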
39771+/* degenerate hash function used to simplify testing of non-unique key
39772+ handling */
39773+static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
39774+ int len UNUSED_ARG /* @name's length */ )
39775+{
39776+ return 0xc0c0c0c010101010ull;
39777+}
39778+
39779+static int change_hash(struct inode *inode,
39780+ reiser4_plugin * plugin,
39781+ pset_member memb)
39782+{
39783+ int result;
39784+
39785+ assert("nikita-3503", inode != NULL);
39786+ assert("nikita-3504", plugin != NULL);
39787+
39788+ assert("nikita-3505", is_reiser4_inode(inode));
39789+ assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
39790+
39791+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
39792+ return RETERR(-EINVAL);
39793+
39794+ result = 0;
39795+ if (inode_hash_plugin(inode) == NULL ||
39796+ inode_hash_plugin(inode)->h.id != plugin->h.id) {
39797+ if (is_dir_empty(inode) == 0)
39798+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
39799+ PSET_HASH, plugin);
39800+ else
39801+ result = RETERR(-ENOTEMPTY);
39802+
39803+ }
39804+ return result;
39805+}
39806+
39807+static reiser4_plugin_ops hash_plugin_ops = {
39808+ .init = NULL,
39809+ .load = NULL,
39810+ .save_len = NULL,
39811+ .save = NULL,
39812+ .change = change_hash
39813+};
39814+
39815+/* hash plugins */
39816+hash_plugin hash_plugins[LAST_HASH_ID] = {
39817+ [RUPASOV_HASH_ID] = {
39818+ .h = {
39819+ .type_id = REISER4_HASH_PLUGIN_TYPE,
39820+ .id = RUPASOV_HASH_ID,
39821+ .pops = &hash_plugin_ops,
39822+ .label = "rupasov",
39823+ .desc = "Original Yura's hash",
39824+ .linkage = {NULL, NULL}
39825+ },
39826+ .hash = hash_rupasov
39827+ },
39828+ [R5_HASH_ID] = {
39829+ .h = {
39830+ .type_id = REISER4_HASH_PLUGIN_TYPE,
39831+ .id = R5_HASH_ID,
39832+ .pops = &hash_plugin_ops,
39833+ .label = "r5",
39834+ .desc = "r5 hash",
39835+ .linkage = {NULL, NULL}
39836+ },
39837+ .hash = hash_r5
39838+ },
39839+ [TEA_HASH_ID] = {
39840+ .h = {
39841+ .type_id = REISER4_HASH_PLUGIN_TYPE,
39842+ .id = TEA_HASH_ID,
39843+ .pops = &hash_plugin_ops,
39844+ .label = "tea",
39845+ .desc = "tea hash",
39846+ .linkage = {NULL, NULL}
39847+ },
39848+ .hash = hash_tea
39849+ },
39850+ [FNV1_HASH_ID] = {
39851+ .h = {
39852+ .type_id = REISER4_HASH_PLUGIN_TYPE,
39853+ .id = FNV1_HASH_ID,
39854+ .pops = &hash_plugin_ops,
39855+ .label = "fnv1",
39856+ .desc = "fnv1 hash",
39857+ .linkage = {NULL, NULL}
39858+ },
39859+ .hash = hash_fnv1
39860+ },
39861+ [DEGENERATE_HASH_ID] = {
39862+ .h = {
39863+ .type_id = REISER4_HASH_PLUGIN_TYPE,
39864+ .id = DEGENERATE_HASH_ID,
39865+ .pops = &hash_plugin_ops,
39866+ .label = "degenerate hash",
39867+ .desc = "Degenerate hash: only for testing",
39868+ .linkage = {NULL, NULL}
39869+ },
39870+ .hash = hash_deg
39871+ }
39872+};
39873+
39874+/* Make Linus happy.
39875+ Local variables:
39876+ c-indentation-style: "K&R"
39877+ mode-name: "LC"
39878+ c-basic-offset: 8
39879+ tab-width: 8
39880+ fill-column: 120
39881+ End:
39882+*/
39883diff -urN linux-2.6.24.orig/fs/reiser4/plugin/inode_ops.c linux-2.6.24/fs/reiser4/plugin/inode_ops.c
39884--- linux-2.6.24.orig/fs/reiser4/plugin/inode_ops.c 1970-01-01 03:00:00.000000000 +0300
39885+++ linux-2.6.24/fs/reiser4/plugin/inode_ops.c 2008-01-25 11:39:07.000224175 +0300
39886@@ -0,0 +1,897 @@
39887+/*
39888+ * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
39889+ */
39890+
39891+/*
39892+ * this file contains typical implementations for most of the methods of
39893+ * struct inode_operations
39894+ */
39895+
39896+#include "../inode.h"
39897+#include "../safe_link.h"
39898+
39899+#include <linux/quotaops.h>
39900+#include <linux/namei.h>
39901+
39902+static int create_vfs_object(struct inode *parent, struct dentry *dentry,
39903+ reiser4_object_create_data *data);
39904+
39905+/**
39906+ * reiser4_create_common - create of inode operations
39907+ * @parent: inode of parent directory
39908+ * @dentry: dentry of new object to create
39909+ * @mode: the permissions to use
39910+ * @nameidata:
39911+ *
39912+ * This is the common implementation of vfs's create method of struct
39913+ * inode_operations. Creates a regular file using the file plugin from the
39914+ * parent directory's plugin set.
39915+ */
39916+int reiser4_create_common(struct inode *parent, struct dentry *dentry,
39917+ int mode, struct nameidata *nameidata)
39918+{
39919+ reiser4_object_create_data data;
39920+ file_plugin *fplug;
39921+
39922+ memset(&data, 0, sizeof data);
39923+ data.mode = S_IFREG | mode;
39924+ fplug = child_create_plugin(parent) ? : inode_create_plugin(parent);
39925+ if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) {
39926+ warning("vpf-1900", "'%s' is not a regular file plugin.",
39927+ fplug->h.label);
39928+ return RETERR(-EIO);
39929+ }
39930+ data.id = fplug->h.id;
39931+ return create_vfs_object(parent, dentry, &data);
39932+}
39933+
39934+int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
39935+void check_light_weight(struct inode *inode, struct inode *parent);
39936+
39937+/**
39938+ * reiser4_lookup_common - lookup of inode operations
39939+ * @parent: inode of directory to lookup into
39940+ * @dentry: name to look for
39941+ * @nameidata:
39942+ *
39943+ * This is the common implementation of vfs's lookup method of struct
39944+ * inode_operations.
39945+ */
39946+struct dentry *reiser4_lookup_common(struct inode *parent,
39947+ struct dentry *dentry,
39948+ struct nameidata *nameidata)
39949+{
39950+ reiser4_context *ctx;
39951+ int result;
39952+ struct dentry *new;
39953+ struct inode *inode;
39954+ reiser4_dir_entry_desc entry;
39955+
39956+ ctx = reiser4_init_context(parent->i_sb);
39957+ if (IS_ERR(ctx))
39958+ return (struct dentry *)ctx;
39959+
39960+ /* set up operations on dentry. */
39961+ dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
39962+
39963+ result = reiser4_lookup_name(parent, dentry, &entry.key);
39964+ if (result) {
39965+ context_set_commit_async(ctx);
39966+ reiser4_exit_context(ctx);
39967+ if (result == -ENOENT) {
39968+ /* object not found */
39969+ if (!IS_DEADDIR(parent))
39970+ d_add(dentry, NULL);
39971+ return NULL;
39972+ }
39973+ return ERR_PTR(result);
39974+ }
39975+
39976+ inode = reiser4_iget(parent->i_sb, &entry.key, 0);
39977+ if (IS_ERR(inode)) {
39978+ context_set_commit_async(ctx);
39979+ reiser4_exit_context(ctx);
39980+ return ERR_PTR(PTR_ERR(inode));
39981+ }
39982+
39983+ /* success */
39984+ check_light_weight(inode, parent);
39985+ new = d_splice_alias(inode, dentry);
39986+ reiser4_iget_complete(inode);
39987+
39988+ /* prevent balance_dirty_pages() from being called: we don't want to
39989+ * do this under directory i_mutex. */
39990+ context_set_commit_async(ctx);
39991+ reiser4_exit_context(ctx);
39992+ return new;
39993+}
39994+
39995+static reiser4_block_nr common_estimate_link(struct inode *parent,
39996+ struct inode *object);
39997+int reiser4_update_dir(struct inode *);
39998+
39999+/**
40000+ * reiser4_link_common - link of inode operations
40001+ * @existing: dentry of object which is to get new name
40002+ * @parent: directory where new name is to be created
40003+ * @newname: new name
40004+ *
40005+ * This is the common implementation of vfs's link method of struct
40006+ * inode_operations.
40007+ */
40008+int reiser4_link_common(struct dentry *existing, struct inode *parent,
40009+ struct dentry *newname)
40010+{
40011+ reiser4_context *ctx;
40012+ int result;
40013+ struct inode *object;
40014+ dir_plugin *parent_dplug;
40015+ reiser4_dir_entry_desc entry;
40016+ reiser4_object_create_data data;
40017+ reiser4_block_nr reserve;
40018+
40019+ ctx = reiser4_init_context(parent->i_sb);
40020+ if (IS_ERR(ctx))
40021+ return PTR_ERR(ctx);
40022+
40023+ assert("nikita-1431", existing != NULL);
40024+ assert("nikita-1432", parent != NULL);
40025+ assert("nikita-1433", newname != NULL);
40026+
40027+ object = existing->d_inode;
40028+ assert("nikita-1434", object != NULL);
40029+
40030+ /* check for race with create_object() */
40031+ if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) {
40032+ context_set_commit_async(ctx);
40033+ reiser4_exit_context(ctx);
40034+ return RETERR(-E_REPEAT);
40035+ }
40036+
40037+ parent_dplug = inode_dir_plugin(parent);
40038+
40039+ memset(&entry, 0, sizeof entry);
40040+ entry.obj = object;
40041+
40042+ data.mode = object->i_mode;
40043+ data.id = inode_file_plugin(object)->h.id;
40044+
40045+ reserve = common_estimate_link(parent, existing->d_inode);
40046+ if ((__s64) reserve < 0) {
40047+ context_set_commit_async(ctx);
40048+ reiser4_exit_context(ctx);
40049+ return reserve;
40050+ }
40051+
40052+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
40053+ context_set_commit_async(ctx);
40054+ reiser4_exit_context(ctx);
40055+ return RETERR(-ENOSPC);
40056+ }
40057+
40058+ /*
40059+ * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
40060+	 * means that link(2) can race against unlink(2) or rename(2), and the
40061+	 * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
40062+	 *
40063+	 * For such an inode we have to undo the special processing done in
40064+	 * reiser4_unlink(), viz. creation of a safe-link.
40065+ */
40066+ if (unlikely(object->i_nlink == 0)) {
40067+ result = safe_link_del(reiser4_tree_by_inode(object),
40068+ get_inode_oid(object), SAFE_UNLINK);
40069+ if (result != 0) {
40070+ context_set_commit_async(ctx);
40071+ reiser4_exit_context(ctx);
40072+ return result;
40073+ }
40074+ }
40075+
40076+ /* increment nlink of @existing and update its stat data */
40077+ result = reiser4_add_nlink(object, parent, 1);
40078+ if (result == 0) {
40079+ /* add entry to the parent */
40080+ result =
40081+ parent_dplug->add_entry(parent, newname, &data, &entry);
40082+ if (result != 0) {
40083+ /* failed to add entry to the parent, decrement nlink
40084+ of @existing */
40085+ reiser4_del_nlink(object, parent, 1);
40086+ /*
40087+			 * now, if that failed, we have a file with too large
40088+			 * an nlink count: a space leak, much better than a
40089+			 * directory entry pointing to nowhere
40090+ */
40091+ }
40092+ }
40093+ if (result == 0) {
40094+ atomic_inc(&object->i_count);
40095+ /*
40096+ * Upon successful completion, link() shall mark for update
40097+ * the st_ctime field of the file. Also, the st_ctime and
40098+ * st_mtime fields of the directory that contains the new
40099+ * entry shall be marked for update. --SUS
40100+ */
40101+ result = reiser4_update_dir(parent);
40102+ }
40103+ if (result == 0)
40104+ d_instantiate(newname, existing->d_inode);
40105+
40106+ context_set_commit_async(ctx);
40107+ reiser4_exit_context(ctx);
40108+ return result;
40109+}
40110+
40111+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
40112+
40113+/**
40114+ * reiser4_unlink_common - unlink of inode operations
40115+ * @parent: inode of directory to remove name from
40116+ * @victim: name to be removed
40117+ *
40118+ * This is the common implementation of vfs's unlink method of struct
40119+ * inode_operations.
40120+ */
40121+int reiser4_unlink_common(struct inode *parent, struct dentry *victim)
40122+{
40123+ reiser4_context *ctx;
40124+ int result;
40125+ struct inode *object;
40126+ file_plugin *fplug;
40127+
40128+ ctx = reiser4_init_context(parent->i_sb);
40129+ if (IS_ERR(ctx))
40130+ return PTR_ERR(ctx);
40131+
40132+ object = victim->d_inode;
40133+ fplug = inode_file_plugin(object);
40134+ assert("nikita-2882", fplug->detach != NULL);
40135+
40136+ result = unlink_check_and_grab(parent, victim);
40137+ if (result != 0) {
40138+ context_set_commit_async(ctx);
40139+ reiser4_exit_context(ctx);
40140+ return result;
40141+ }
40142+
40143+ result = fplug->detach(object, parent);
40144+ if (result == 0) {
40145+ dir_plugin *parent_dplug;
40146+ reiser4_dir_entry_desc entry;
40147+
40148+ parent_dplug = inode_dir_plugin(parent);
40149+ memset(&entry, 0, sizeof entry);
40150+
40151+ /* first, delete directory entry */
40152+ result = parent_dplug->rem_entry(parent, victim, &entry);
40153+ if (result == 0) {
40154+ /*
40155+ * if name was removed successfully, we _have_ to
40156+			 * return 0 from this function, because the upper level
40157+			 * caller (vfs_{rmdir,unlink}) expects this.
40158+ *
40159+ * now that directory entry is removed, update
40160+ * stat-data
40161+ */
40162+ reiser4_del_nlink(object, parent, 1);
40163+ /*
40164+ * Upon successful completion, unlink() shall mark for
40165+ * update the st_ctime and st_mtime fields of the
40166+ * parent directory. Also, if the file's link count is
40167+ * not 0, the st_ctime field of the file shall be
40168+ * marked for update. --SUS
40169+ */
40170+ reiser4_update_dir(parent);
40171+ /* add safe-link for this file */
40172+ if (object->i_nlink == 0)
40173+ safe_link_add(object, SAFE_UNLINK);
40174+ }
40175+ }
40176+
40177+ if (unlikely(result != 0)) {
40178+ if (result != -ENOMEM)
40179+ warning("nikita-3398", "Cannot unlink %llu (%i)",
40180+ (unsigned long long)get_inode_oid(object),
40181+ result);
40182+ /* if operation failed commit pending inode modifications to
40183+ * the stat-data */
40184+ reiser4_update_sd(object);
40185+ reiser4_update_sd(parent);
40186+ }
40187+
40188+ reiser4_release_reserved(object->i_sb);
40189+
40190+	/* @object's i_ctime was updated by the ->rem_link() method. */
40191+
40192+	/* @victim may already have been removed from the disk by this time.
40193+	   The inode is then marked so that iput() won't try to remove its
40194+	   stat data, but the inode itself is still there.
40195+	 */
40196+
40197+ /*
40198+	 * we cannot release the directory semaphore here, because the name
40199+	 * has already been deleted, but the dentry (@victim) still exists.
40200+	 * Prevent balance_dirty_pages() from being called when exiting this
40201+	 * context: we don't want to do that under the directory's i_mutex.
40202+ */
40203+ context_set_commit_async(ctx);
40204+ reiser4_exit_context(ctx);
40205+ return result;
40206+}
40207+
40208+/**
40209+ * reiser4_symlink_common - symlink of inode operations
40210+ * @parent: inode of parent directory
40211+ * @dentry: dentry of object to be created
40212+ * @linkname: string symlink is to contain
40213+ *
40214+ * This is the common implementation of vfs's symlink method of struct
40215+ * inode_operations.
40216+ * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID.
40217+ */
40218+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
40219+ const char *linkname)
40220+{
40221+ reiser4_object_create_data data;
40222+
40223+ memset(&data, 0, sizeof data);
40224+ data.name = linkname;
40225+ data.id = SYMLINK_FILE_PLUGIN_ID;
40226+ data.mode = S_IFLNK | S_IRWXUGO;
40227+ return create_vfs_object(parent, dentry, &data);
40228+}
40229+
40230+/**
40231+ * reiser4_mkdir_common - mkdir of inode operations
40232+ * @parent: inode of parent directory
40233+ * @dentry: dentry of object to be created
40234+ * @mode: the permissions to use
40235+ *
40236+ * This is the common implementation of vfs's mkdir method of struct
40237+ * inode_operations.
40238+ * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID.
40239+ */
40240+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
40241+{
40242+ reiser4_object_create_data data;
40243+
40244+ memset(&data, 0, sizeof data);
40245+ data.mode = S_IFDIR | mode;
40246+ data.id = DIRECTORY_FILE_PLUGIN_ID;
40247+ return create_vfs_object(parent, dentry, &data);
40248+}
40249+
40250+/**
40251+ * reiser4_mknod_common - mknod of inode operations
40252+ * @parent: inode of parent directory
40253+ * @dentry: dentry of object to be created
40254+ * @mode: the permissions to use and file type
40255+ * @rdev: minor and major of new device file
40256+ *
40257+ * This is the common implementation of vfs's mknod method of struct
40258+ * inode_operations.
40259+ * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID.
40260+ */
40261+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
40262+ int mode, dev_t rdev)
40263+{
40264+ reiser4_object_create_data data;
40265+
40266+ memset(&data, 0, sizeof data);
40267+ data.mode = mode;
40268+ data.rdev = rdev;
40269+ data.id = SPECIAL_FILE_PLUGIN_ID;
40270+ return create_vfs_object(parent, dentry, &data);
40271+}
40272+
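+/*
+ * For illustration: a hypothetical wiring of the reiser4_*_common helpers
+ * into a VFS method table. Field names assume the 2.6.24-era struct
+ * inode_operations; the actual tables used by this patch live elsewhere
+ * and may differ.
+ *
+ *	static const struct inode_operations sketch_dir_iops = {
+ *		.link		= reiser4_link_common,
+ *		.unlink		= reiser4_unlink_common,
+ *		.symlink	= reiser4_symlink_common,
+ *		.mkdir		= reiser4_mkdir_common,
+ *		.mknod		= reiser4_mknod_common,
+ *		.permission	= reiser4_permission_common,
+ *		.setattr	= reiser4_setattr_common,
+ *		.getattr	= reiser4_getattr_common,
+ *	};
+ */
+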
40273+/*
40274+ * implementation of vfs's rename method of struct inode_operations for typical
40275+ * directory is in inode_ops_rename.c
40276+ */
40277+
40278+/**
40279+ * reiser4_follow_link_common - follow_link of inode operations
40280+ * @dentry: dentry of symlink
40281+ * @nd: nameidata to store the link target in
40282+ *
40283+ * This is the common implementation of vfs's follow_link method of struct
40284+ * inode_operations.
40285+ * Assumes that inode's i_private points to the content of symbolic link.
40286+ */
40287+void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd)
40288+{
40289+ assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
40290+
40291+ if (!dentry->d_inode->i_private
40292+ || !reiser4_inode_get_flag(dentry->d_inode,
40293+ REISER4_GENERIC_PTR_USED))
40294+ return ERR_PTR(RETERR(-EINVAL));
40295+ nd_set_link(nd, dentry->d_inode->i_private);
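+	/* returning NULL provides no cookie for ->put_link(): the link body
+	 * lives in i_private and needs no per-lookup cleanup */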
40296+ return NULL;
40297+}
40298+
40299+/**
40300+ * reiser4_permission_common - permission of inode operations
40301+ * @inode: inode to check permissions for
40302+ * @mask: mode bits to check permissions for
40303+ * @nameidata: unused
40304+ *
40305+ * Uses generic function to check for rwx permissions.
40306+ */
40307+int reiser4_permission_common(struct inode *inode, int mask,
40308+ struct nameidata *nameidata)
40309+{
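+	/* NULL acl-check callback: plain POSIX mode-bit checks only */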
40310+ return generic_permission(inode, mask, NULL);
40311+}
40312+
40313+static int setattr_reserve(reiser4_tree *);
40314+
40315+/* this is the common implementation of vfs's setattr method of struct
40316+ inode_operations
40317+*/
40318+int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr)
40319+{
40320+ reiser4_context *ctx;
40321+ struct inode *inode;
40322+ int result;
40323+
40324+ inode = dentry->d_inode;
40325+ result = inode_change_ok(inode, attr);
40326+ if (result)
40327+ return result;
40328+
40329+ ctx = reiser4_init_context(inode->i_sb);
40330+ if (IS_ERR(ctx))
40331+ return PTR_ERR(ctx);
40332+
40333+ assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
40334+
40335+ /*
40336+ * grab disk space and call standard inode_setattr().
40337+ */
40338+ result = setattr_reserve(reiser4_tree_by_inode(inode));
40339+ if (!result) {
40340+ if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
40341+ || (attr->ia_valid & ATTR_GID
40342+ && attr->ia_gid != inode->i_gid)) {
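+			/* chown/chgrp: transfer the quota charge to the new
+			 * owner; a failure is reported as -EDQUOT */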
40343+ result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
40344+ if (result) {
40345+ context_set_commit_async(ctx);
40346+ reiser4_exit_context(ctx);
40347+ return result;
40348+ }
40349+ }
40350+ result = inode_setattr(inode, attr);
40351+ if (!result)
40352+ reiser4_update_sd(inode);
40353+ }
40354+
40355+ context_set_commit_async(ctx);
40356+ reiser4_exit_context(ctx);
40357+ return result;
40358+}
40359+
40360+/* this is the common implementation of vfs's getattr method of struct
40361+ inode_operations
40362+*/
40363+int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG,
40364+ struct dentry *dentry, struct kstat *stat)
40365+{
40366+ struct inode *obj;
40367+
40368+ assert("nikita-2298", dentry != NULL);
40369+ assert("nikita-2299", stat != NULL);
40370+ assert("nikita-2300", dentry->d_inode != NULL);
40371+
40372+ obj = dentry->d_inode;
40373+
40374+ stat->dev = obj->i_sb->s_dev;
40375+ stat->ino = oid_to_uino(get_inode_oid(obj));
40376+ stat->mode = obj->i_mode;
40377+	/* don't confuse userland with a huge nlink. This is not entirely
40378+	 * correct, because nlink_t is not necessarily a 16-bit signed type. */
40379+ stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
40380+ stat->uid = obj->i_uid;
40381+ stat->gid = obj->i_gid;
40382+ stat->rdev = obj->i_rdev;
40383+ stat->atime = obj->i_atime;
40384+ stat->mtime = obj->i_mtime;
40385+ stat->ctime = obj->i_ctime;
40386+ stat->size = obj->i_size;
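+	/* st_blocks counts whole VFS blocks, rounded up; e.g. assuming the
+	 * conventional VFS_BLKSIZE of 512 (VFS_BLKSIZE_BITS == 9), a
+	 * 1000-byte file reports (1000 + 511) >> 9 == 2 blocks */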
40387+ stat->blocks =
40388+ (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
40389+ /* "preferred" blocksize for efficient file system I/O */
40390+ stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
40391+
40392+ return 0;
40393+}
40394+
40395+/* Estimate the maximum amount of nodes which might be allocated or changed on
40396+ typical new object creation. Typical creation consists of calling create
40397+ method of file plugin, adding directory entry to parent and update parent
40398+ directory's stat data.
40399+*/
40400+static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, /* parent object */
40401+ struct inode *object
40402+ /* object */ )
40403+{
40404+ assert("vpf-309", parent != NULL);
40405+ assert("vpf-307", object != NULL);
40406+
40407+ return
40408+ /* object creation estimation */
40409+ inode_file_plugin(object)->estimate.create(object) +
40410+ /* stat data of parent directory estimation */
40411+ inode_file_plugin(parent)->estimate.update(parent) +
40412+ /* adding entry estimation */
40413+ inode_dir_plugin(parent)->estimate.add_entry(parent) +
40414+ /* to undo in the case of failure */
40415+ inode_dir_plugin(parent)->estimate.rem_entry(parent);
40416+}
40417+
40418+/* Create child in directory.
40419+
40420+ . get object's plugin
40421+ . get fresh inode
40422+ . initialize inode
40423+ . add object's stat-data
40424+ . initialize object's directory
40425+ . add entry to the parent
40426+ . instantiate dentry
40427+
40428+*/
40429+static int do_create_vfs_child(reiser4_object_create_data * data, /* parameters of new
40430+ object */
40431+ struct inode **retobj)
40432+{
40433+ int result;
40434+
40435+	struct dentry *dentry;	/* new name */
40436+	struct inode *parent;	/* parent object */
40437+
40438+ dir_plugin *par_dir; /* directory plugin on the parent */
40439+ dir_plugin *obj_dir; /* directory plugin on the new object */
40440+ file_plugin *obj_plug; /* object plugin on the new object */
40441+ struct inode *object; /* new object */
40442+ reiser4_block_nr reserve;
40443+
40444+ reiser4_dir_entry_desc entry; /* new directory entry */
40445+
40446+ assert("nikita-1420", data != NULL);
40447+ parent = data->parent;
40448+ dentry = data->dentry;
40449+
40450+ assert("nikita-1418", parent != NULL);
40451+ assert("nikita-1419", dentry != NULL);
40452+
40453+ /* check, that name is acceptable for parent */
40454+ par_dir = inode_dir_plugin(parent);
40455+ if (par_dir->is_name_acceptable &&
40456+ !par_dir->is_name_acceptable(parent,
40457+ dentry->d_name.name,
40458+ (int)dentry->d_name.len))
40459+ return RETERR(-ENAMETOOLONG);
40460+
40461+ result = 0;
40462+ obj_plug = file_plugin_by_id((int)data->id);
40463+ if (obj_plug == NULL) {
40464+ warning("nikita-430", "Cannot find plugin %i", data->id);
40465+ return RETERR(-ENOENT);
40466+ }
40467+ object = new_inode(parent->i_sb);
40468+ if (object == NULL)
40469+ return RETERR(-ENOMEM);
40470+ /* we'll update i_nlink below */
40471+ object->i_nlink = 0;
40472+	/* new_inode() initializes i_ino to an "arbitrary" value. Reset it to
40473+	 * 0 to simplify error handling: if some error occurs before i_ino is
40474+	 * initialized with an oid, i_ino will already be set to a
40475+	 * distinguished value. */
40476+ object->i_ino = 0;
40477+
40478+ /* So that on error iput will be called. */
40479+ *retobj = object;
40480+
40481+ if (DQUOT_ALLOC_INODE(object)) {
40482+ DQUOT_DROP(object);
40483+ object->i_flags |= S_NOQUOTA;
40484+ return RETERR(-EDQUOT);
40485+ }
40486+
40487+ memset(&entry, 0, sizeof entry);
40488+ entry.obj = object;
40489+
40490+ set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE,
40491+ file_plugin_to_plugin(obj_plug));
40492+ result = obj_plug->set_plug_in_inode(object, parent, data);
40493+ if (result) {
40494+ warning("nikita-431", "Cannot install plugin %i on %llx",
40495+ data->id, (unsigned long long)get_inode_oid(object));
40496+ DQUOT_FREE_INODE(object);
40497+ object->i_flags |= S_NOQUOTA;
40498+ return result;
40499+ }
40500+
40501+ /* reget plugin after installation */
40502+ obj_plug = inode_file_plugin(object);
40503+
40504+ if (obj_plug->create_object == NULL) {
40505+ DQUOT_FREE_INODE(object);
40506+ object->i_flags |= S_NOQUOTA;
40507+ return RETERR(-EPERM);
40508+ }
40509+
40510+	/* if any of the hash, tail, sd or permission plugins for the newly
40511+	   created object are not set yet, set them here, inheriting them from
40512+	   the parent directory
40513+	 */
40514+ assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
40515+ result = obj_plug->adjust_to_parent(object,
40516+ parent,
40517+ object->i_sb->s_root->d_inode);
40518+ if (result == 0)
40519+ result = finish_pset(object);
40520+ if (result != 0) {
40521+ warning("nikita-432", "Cannot inherit from %llx to %llx",
40522+ (unsigned long long)get_inode_oid(parent),
40523+ (unsigned long long)get_inode_oid(object));
40524+ DQUOT_FREE_INODE(object);
40525+ object->i_flags |= S_NOQUOTA;
40526+ return result;
40527+ }
40528+
40529+ /* setup inode and file-operations for this inode */
40530+ setup_inode_ops(object, data);
40531+
40532+ /* call file plugin's method to initialize plugin specific part of
40533+ * inode */
40534+ if (obj_plug->init_inode_data)
40535+ obj_plug->init_inode_data(object, data, 1 /*create */ );
40536+
40537+ /* obtain directory plugin (if any) for new object. */
40538+ obj_dir = inode_dir_plugin(object);
40539+ if (obj_dir != NULL && obj_dir->init == NULL) {
40540+ DQUOT_FREE_INODE(object);
40541+ object->i_flags |= S_NOQUOTA;
40542+ return RETERR(-EPERM);
40543+ }
40544+
40545+ reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
40546+
40547+ reserve = estimate_create_vfs_object(parent, object);
40548+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
40549+ DQUOT_FREE_INODE(object);
40550+ object->i_flags |= S_NOQUOTA;
40551+ return RETERR(-ENOSPC);
40552+ }
40553+
40554+	/* mark inode `immutable'. We disable changes to the file being
40555+	   created until a valid directory entry for it is inserted.
40556+	   Otherwise, if the file were expanded and insertion of the directory
40557+	   entry failed, we would have to remove the file, but we only
40558+	   allotted enough space in the transaction to remove an _empty_ file.
40559+	   The 3.x code used to remove stat data in a different transaction,
40560+	   thus possibly leaking disk space on a crash. This only matters if a
40561+	   file can be accessed without a name, for example, by inode number.
40562+	*/
40563+ reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
40564+
40565+	/* create an empty object; this includes allocation of a new objectid.
40566+	   For directories this implies creation of dot and dotdot */
40567+ assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
40568+
40569+ /* mark inode as `loaded'. From this point onward
40570+ reiser4_delete_inode() will try to remove its stat-data. */
40571+ reiser4_inode_set_flag(object, REISER4_LOADED);
40572+
40573+ result = obj_plug->create_object(object, parent, data);
40574+ if (result != 0) {
40575+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
40576+ if (result != -ENAMETOOLONG && result != -ENOMEM)
40577+ warning("nikita-2219",
40578+ "Failed to create sd for %llu",
40579+ (unsigned long long)get_inode_oid(object));
40580+ DQUOT_FREE_INODE(object);
40581+ object->i_flags |= S_NOQUOTA;
40582+ return result;
40583+ }
40584+
40585+ if (obj_dir != NULL)
40586+ result = obj_dir->init(object, parent, data);
40587+ if (result == 0) {
40588+ assert("nikita-434", !reiser4_inode_get_flag(object,
40589+ REISER4_NO_SD));
40590+ /* insert inode into VFS hash table */
40591+ insert_inode_hash(object);
40592+ /* create entry */
40593+ result = par_dir->add_entry(parent, dentry, data, &entry);
40594+ if (result == 0) {
40595+ result = reiser4_add_nlink(object, parent, 0);
40596+ /* If O_CREAT is set and the file did not previously
40597+ exist, upon successful completion, open() shall
40598+ mark for update the st_atime, st_ctime, and
40599+ st_mtime fields of the file and the st_ctime and
40600+ st_mtime fields of the parent directory. --SUS
40601+ */
40602+ /* @object times are already updated by
40603+ reiser4_add_nlink() */
40604+ if (result == 0)
40605+ reiser4_update_dir(parent);
40606+ if (result != 0)
40607+ /* cleanup failure to add nlink */
40608+ par_dir->rem_entry(parent, dentry, &entry);
40609+ }
40610+ if (result != 0)
40611+ /* cleanup failure to add entry */
40612+ obj_plug->detach(object, parent);
40613+ } else if (result != -ENOMEM)
40614+ warning("nikita-2219", "Failed to initialize dir for %llu: %i",
40615+ (unsigned long long)get_inode_oid(object), result);
40616+
40617+ /*
40618+ * update stat-data, committing all pending modifications to the inode
40619+ * fields.
40620+ */
40621+ reiser4_update_sd(object);
40622+ if (result != 0) {
40623+ DQUOT_FREE_INODE(object);
40624+ object->i_flags |= S_NOQUOTA;
40625+ /* if everything was ok (result == 0), parent stat-data is
40626+ * already updated above (update_parent_dir()) */
40627+ reiser4_update_sd(parent);
40628+ /* failure to create entry, remove object */
40629+ obj_plug->delete_object(object);
40630+ }
40631+
40632+ /* file has name now, clear immutable flag */
40633+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
40634+
40635+	/* on error, iput() will call ->delete_inode(). We should keep track
40636+	   of the existence of stat-data for this inode and avoid attempting
40637+	   to remove it in reiser4_delete_inode(). This is accomplished through
40638+	   the REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
40639+	 */
40640+ return result;
40641+}
40642+
40643+/* this is a helper for the common implementations of reiser4_mkdir,
40644+   reiser4_create, reiser4_mknod and reiser4_symlink
40645+*/
40646+static int
40647+create_vfs_object(struct inode *parent,
40648+ struct dentry *dentry, reiser4_object_create_data * data)
40649+{
40650+ reiser4_context *ctx;
40651+ int result;
40652+ struct inode *child;
40653+
40654+ ctx = reiser4_init_context(parent->i_sb);
40655+ if (IS_ERR(ctx))
40656+ return PTR_ERR(ctx);
40657+ context_set_commit_async(ctx);
40658+
40659+ data->parent = parent;
40660+ data->dentry = dentry;
40661+ child = NULL;
40662+ result = do_create_vfs_child(data, &child);
40663+ if (unlikely(result != 0)) {
40664+ if (child != NULL) {
40665+ reiser4_make_bad_inode(child);
40666+ iput(child);
40667+ }
40668+ } else
40669+ d_instantiate(dentry, child);
40670+
40671+ reiser4_exit_context(ctx);
40672+ return result;
40673+}
40674+
40675+/* helper for reiser4_link_common. Estimate disk space necessary to add a link
40676+ from @parent to @object
40677+*/
40678+static reiser4_block_nr common_estimate_link(struct inode *parent, /* parent directory */
40679+ struct inode *object
40680+					 /* object to which new link is being created */
40681+ )
40682+{
40683+ reiser4_block_nr res = 0;
40684+ file_plugin *fplug;
40685+ dir_plugin *dplug;
40686+
40687+ assert("vpf-317", object != NULL);
40688+ assert("vpf-318", parent != NULL);
40689+
40690+ fplug = inode_file_plugin(object);
40691+ dplug = inode_dir_plugin(parent);
40692+ /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */
40693+ /* reiser4_add_nlink(object) */
40694+ res += fplug->estimate.update(object);
40695+ /* add_entry(parent) */
40696+ res += dplug->estimate.add_entry(parent);
40697+ /* reiser4_del_nlink(object) */
40698+ res += fplug->estimate.update(object);
40699+ /* update_dir(parent) */
40700+ res += inode_file_plugin(parent)->estimate.update(parent);
40701+ /* safe-link */
40702+ res += estimate_one_item_removal(reiser4_tree_by_inode(object));
40703+
40704+ return res;
40705+}
40706+
40707+/* Estimate disk space necessary to remove a link between @parent and
40708+ @object.
40709+*/
40710+static reiser4_block_nr estimate_unlink(struct inode *parent, /* parent directory */
40711+ struct inode *object
40712+					/* object whose link is being removed */
40713+ )
40714+{
40715+ reiser4_block_nr res = 0;
40716+ file_plugin *fplug;
40717+ dir_plugin *dplug;
40718+
40719+ assert("vpf-317", object != NULL);
40720+ assert("vpf-318", parent != NULL);
40721+
40722+ fplug = inode_file_plugin(object);
40723+ dplug = inode_dir_plugin(parent);
40724+
40725+ /* rem_entry(parent) */
40726+ res += dplug->estimate.rem_entry(parent);
40727+ /* reiser4_del_nlink(object) */
40728+ res += fplug->estimate.update(object);
40729+ /* update_dir(parent) */
40730+ res += inode_file_plugin(parent)->estimate.update(parent);
40731+ /* fplug->unlink */
40732+ res += fplug->estimate.unlink(object, parent);
40733+ /* safe-link */
40734+ res += estimate_one_insert_item(reiser4_tree_by_inode(object));
40735+
40736+ return res;
40737+}
40738+
40739+/* helper for reiser4_unlink_common. Estimate and grab space for unlink. */
40740+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
40741+{
40742+ file_plugin *fplug;
40743+ struct inode *child;
40744+ int result;
40745+
40746+ result = 0;
40747+ child = victim->d_inode;
40748+ fplug = inode_file_plugin(child);
40749+
40750+ /* check for race with create_object() */
40751+ if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE))
40752+ return RETERR(-E_REPEAT);
40753+ /* object being deleted should have stat data */
40754+ assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
40755+
40756+ /* ask object plugin */
40757+ if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
40758+ return RETERR(-ENOTEMPTY);
40759+
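+	/* the u64 block estimate is narrowed to int here; the sign check
+	 * below treats an overflowing (negative) value as an error */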
40760+ result = (int)estimate_unlink(parent, child);
40761+ if (result < 0)
40762+ return result;
40763+
40764+ return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
40765+}
40766+
40767+/* helper for reiser4_setattr_common */
40768+static int setattr_reserve(reiser4_tree * tree)
40769+{
40770+ assert("vs-1096", is_grab_enabled(get_current_context()));
40771+ return reiser4_grab_space(estimate_one_insert_into_item(tree),
40772+ BA_CAN_COMMIT);
40773+}
40774+
40775+/* helper function. Standards require that, on success, many file-system
40776+   operations update the ctime and mtime of the parent directory. */
40777+int reiser4_update_dir(struct inode *dir)
40778+{
40779+ assert("nikita-2525", dir != NULL);
40780+
40781+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
40782+ return reiser4_update_sd(dir);
40783+}
40784diff -urN linux-2.6.24.orig/fs/reiser4/plugin/inode_ops_rename.c linux-2.6.24/fs/reiser4/plugin/inode_ops_rename.c
40785--- linux-2.6.24.orig/fs/reiser4/plugin/inode_ops_rename.c 1970-01-01 03:00:00.000000000 +0300
40786+++ linux-2.6.24/fs/reiser4/plugin/inode_ops_rename.c 2008-01-25 11:39:07.000224175 +0300
40787@@ -0,0 +1,912 @@
40788+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
40789+ * reiser4/README */
40790+
40791+#include "../inode.h"
40792+#include "../safe_link.h"
40793+
40794+static const char *possible_leak = "Possible disk space leak.";
40795+
40796+/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
40797+
40798+   Helper function called from reiser4_rename_common() */
40799+static int replace_name(struct inode *to_inode, /* inode where @from_coord is
40800+ * to be re-targeted at */
40801+ struct inode *from_dir, /* directory where @from_coord
40802+ * lives */
40803+ struct inode *from_inode, /* inode @from_coord
40804+					 * originally points to */
40805+ coord_t * from_coord, /* where directory entry is in
40806+ * the tree */
40807+ lock_handle * from_lh /* lock handle on @from_coord */ )
40808+{
40809+ item_plugin *from_item;
40810+ int result;
40811+ znode *node;
40812+
40813+ coord_clear_iplug(from_coord);
40814+ node = from_coord->node;
40815+ result = zload(node);
40816+ if (result != 0)
40817+ return result;
40818+ from_item = item_plugin_by_coord(from_coord);
40819+ if (plugin_of_group(item_plugin_by_coord(from_coord),
40820+ DIR_ENTRY_ITEM_TYPE))
40821+ {
40822+ reiser4_key to_key;
40823+
40824+ build_sd_key(to_inode, &to_key);
40825+
40826+ /* everything is found and prepared to change directory entry
40827+ at @from_coord to point to @to_inode.
40828+
40829+		   @to_inode is just about to get a new name, so bump its link
40830+ counter.
40831+
40832+ */
40833+ result = reiser4_add_nlink(to_inode, from_dir, 0);
40834+ if (result != 0) {
40835+ /* Don't issue warning: this may be plain -EMLINK */
40836+ zrelse(node);
40837+ return result;
40838+ }
40839+
40840+ result =
40841+ from_item->s.dir.update_key(from_coord, &to_key, from_lh);
40842+ if (result != 0) {
40843+ reiser4_del_nlink(to_inode, from_dir, 0);
40844+ zrelse(node);
40845+ return result;
40846+ }
40847+
40848+ /* @from_inode just lost its name, he-he.
40849+
40850+		   If @from_inode was a directory, it contained a dotdot entry
40851+		   pointing to @from_dir. @from_dir's i_nlink will be decreased
40852+		   when iput() is called on @from_inode.
40853+
40854+		   If the file-system is not ADG (hard-links are supported on
40855+		   directories), iput(from_inode) will not remove @from_inode,
40856+		   and thus the above is incorrect; but hard-links on
40857+		   directories are problematic in many other respects.
40858+ */
40859+ result = reiser4_del_nlink(from_inode, from_dir, 0);
40860+ if (result != 0) {
40861+ warning("nikita-2330",
40862+ "Cannot remove link from source: %i. %s",
40863+ result, possible_leak);
40864+ }
40865+ /* Has to return success, because entry is already
40866+ * modified. */
40867+ result = 0;
40868+
40869+		/* NOTE-NIKITA consider calling a plugin method instead of
40870+		   accessing inode fields directly. */
40871+ from_dir->i_mtime = CURRENT_TIME;
40872+ } else {
40873+ warning("nikita-2326", "Unexpected item type");
40874+ result = RETERR(-EIO);
40875+ }
40876+ zrelse(node);
40877+ return result;
40878+}
40879+
40880+/* add new entry pointing to @inode into @dir at @coord, locked by @lh
40881+
40882+   Helper function used by reiser4_rename_common(). */
40883+static int add_name(struct inode *inode, /* inode where @coord is to be
40884+ * re-targeted at */
40885+ struct inode *dir, /* directory where @coord lives */
40886+ struct dentry *name, /* new name */
40887+ coord_t * coord, /* where directory entry is in the tree */
40888+ lock_handle * lh, /* lock handle on @coord */
40889+ int is_dir /* true, if @inode is directory */ )
40890+{
40891+ int result;
40892+ reiser4_dir_entry_desc entry;
40893+
40894+ assert("nikita-2333", lh->node == coord->node);
40895+ assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
40896+
40897+ memset(&entry, 0, sizeof entry);
40898+ entry.obj = inode;
40899+ /* build key of directory entry description */
40900+ inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
40901+
40902+	/* ext2 does this in a different order: it first inserts the new
40903+	   entry, then increases the directory nlink. We don't want to do
40904+	   this, because reiser4_add_nlink() calls the ->add_link() plugin
40905+	   method, which can fail for whatever reason, leaving us with
40906+	   cleanup problems.
40907+	 */
40908+ /* @inode is getting new name */
40909+ reiser4_add_nlink(inode, dir, 0);
40910+ /* create @new_name in @new_dir pointing to
40911+ @old_inode */
40912+ result = WITH_COORD(coord,
40913+ inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
40914+ coord,
40915+ lh,
40916+ name,
40917+ &entry));
40918+ if (result != 0) {
40919+ int result2;
40920+ result2 = reiser4_del_nlink(inode, dir, 0);
40921+ if (result2 != 0) {
40922+ warning("nikita-2327",
40923+ "Cannot drop link on %lli %i. %s",
40924+ (unsigned long long)get_inode_oid(inode),
40925+ result2, possible_leak);
40926+ }
40927+ } else
40928+ INODE_INC_FIELD(dir, i_size);
40929+ return result;
40930+}
40931+
40932+static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */
40933+ struct dentry *old_name, /* old name */
40934+ struct inode *new_dir, /* directory where @new is located */
40935+ struct dentry *new_name /* new name */ )
40936+{
40937+ reiser4_block_nr res1, res2;
40938+ dir_plugin *p_parent_old, *p_parent_new;
40939+ file_plugin *p_child_old, *p_child_new;
40940+
40941+ assert("vpf-311", old_dir != NULL);
40942+ assert("vpf-312", new_dir != NULL);
40943+ assert("vpf-313", old_name != NULL);
40944+ assert("vpf-314", new_name != NULL);
40945+
40946+ p_parent_old = inode_dir_plugin(old_dir);
40947+ p_parent_new = inode_dir_plugin(new_dir);
40948+ p_child_old = inode_file_plugin(old_name->d_inode);
40949+ if (new_name->d_inode)
40950+ p_child_new = inode_file_plugin(new_name->d_inode);
40951+ else
40952+ p_child_new = NULL;
40953+
40954+ /* find_entry - can insert one leaf. */
40955+ res1 = res2 = 1;
40956+
40957+ /* replace_name */
40958+ {
40959+ /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */
40960+ res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
40961+ /* update key */
40962+ res1 += 1;
40963+ /* reiser4_del_nlink(p_child_new) */
40964+ if (p_child_new)
40965+ res1 += p_child_new->estimate.update(new_name->d_inode);
40966+ }
40967+
40968+ /* else add_name */
40969+ {
40970+ /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */
40971+ res2 +=
40972+ 2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
40973+ /* reiser4_add_nlink(p_parent_old) */
40974+ res2 += p_child_old->estimate.update(old_name->d_inode);
40975+ /* add_entry(p_parent_new) */
40976+ res2 += p_parent_new->estimate.add_entry(new_dir);
40977+ /* reiser4_del_nlink(p_parent_old) */
40978+ res2 += p_child_old->estimate.update(old_name->d_inode);
40979+ }
40980+
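+	/* reserve for whichever of the two branches above (replace_name vs.
+	 * add_name) turns out to be costlier */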
40981+ res1 = res1 < res2 ? res2 : res1;
40982+
40983+ /* reiser4_write_sd(p_parent_new) */
40984+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
40985+
40986+ /* reiser4_write_sd(p_child_new) */
40987+ if (p_child_new)
40988+ res1 += p_child_new->estimate.update(new_name->d_inode);
40989+
40990+ /* hashed_rem_entry(p_parent_old) */
40991+ res1 += p_parent_old->estimate.rem_entry(old_dir);
40992+
40993+ /* reiser4_del_nlink(p_child_old) */
40994+ res1 += p_child_old->estimate.update(old_name->d_inode);
40995+
40996+ /* replace_name */
40997+ {
40998+ /* reiser4_add_nlink(p_parent_dir_new) */
40999+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
41000+ /* update_key */
41001+ res1 += 1;
41002+ /* reiser4_del_nlink(p_parent_new) */
41003+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
41004+ /* reiser4_del_nlink(p_parent_old) */
41005+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
41006+ }
41007+
41008+ /* reiser4_write_sd(p_parent_old) */
41009+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
41010+
41011+ /* reiser4_write_sd(p_child_old) */
41012+ res1 += p_child_old->estimate.update(old_name->d_inode);
41013+
41014+ return res1;
41015+}
41016+
41017+static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory where @old is located */
41018+ struct dentry *old_name, /* old name */
41019+ struct inode *new_dir, /* directory where @new is located */
41020+ struct dentry *new_name
41021+ /* new name */ )
41022+{
41023+ reiser4_block_nr reserve;
41024+
41025+ reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
41026+
41027+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
41028+ return RETERR(-ENOSPC);
41029+
41030+ return 0;
41031+}
41032+
41033+/* check whether @old_inode and @new_inode can be moved within file system
41034+ * tree. This singles out attempts to rename pseudo-files, for example. */
41035+static int can_rename(struct inode *old_dir, struct inode *old_inode,
41036+ struct inode *new_dir, struct inode *new_inode)
41037+{
41038+ file_plugin *fplug;
41039+ dir_plugin *dplug;
41040+
41041+ assert("nikita-3370", old_inode != NULL);
41042+
41043+ dplug = inode_dir_plugin(new_dir);
41044+ fplug = inode_file_plugin(old_inode);
41045+
41046+ if (dplug == NULL)
41047+ return RETERR(-ENOTDIR);
41048+ else if (new_dir->i_op->create == NULL)
41049+ return RETERR(-EPERM);
41050+ else if (!fplug->can_add_link(old_inode))
41051+ return RETERR(-EMLINK);
41052+ else if (new_inode != NULL) {
41053+ fplug = inode_file_plugin(new_inode);
41054+ if (fplug->can_rem_link != NULL &&
41055+ !fplug->can_rem_link(new_inode))
41056+ return RETERR(-EBUSY);
41057+ }
41058+ return 0;
41059+}
41060+
41061+int reiser4_find_entry(struct inode *, struct dentry *, lock_handle *,
41062+ znode_lock_mode, reiser4_dir_entry_desc *);
41063+int reiser4_update_dir(struct inode *);
41064+
41065+/* this is the common implementation of vfs's rename method of struct
41066+   inode_operations.
41067+   See comments in the body.
41068+
41069+   It is arguable that this function could be made generic, so that it
41070+   would be applicable to any kind of directory plugin that deals with
41071+   directories composed of directory entries. The only obstacle here is
41072+   that we don't have any data-type to represent a directory entry. This
41073+   should be re-considered when more than one different directory plugin
41074+   is implemented.
41075+*/
41076+int reiser4_rename_common(struct inode *old_dir /* directory where @old
41077+ * is located */ ,
41078+ struct dentry *old_name /* old name */ ,
41079+ struct inode *new_dir /* directory where @new
41080+ * is located */ ,
41081+ struct dentry *new_name /* new name */ )
41082+{
41083+ /* From `The Open Group Base Specifications Issue 6'
41084+
41085+ If either the old or new argument names a symbolic link, rename()
41086+ shall operate on the symbolic link itself, and shall not resolve
41087+ the last component of the argument. If the old argument and the new
41088+ argument resolve to the same existing file, rename() shall return
41089+ successfully and perform no other action.
41090+
41091+ [this is done by VFS: vfs_rename()]
41092+
41093+ If the old argument points to the pathname of a file that is not a
41094+ directory, the new argument shall not point to the pathname of a
41095+ directory.
41096+
41097+ [checked by VFS: vfs_rename->may_delete()]
41098+
41099+ If the link named by the new argument exists, it shall
41100+ be removed and old renamed to new. In this case, a link named new
41101+ shall remain visible to other processes throughout the renaming
41102+ operation and refer either to the file referred to by new or old
41103+ before the operation began.
41104+
41105+ [we should assure this]
41106+
41107+ Write access permission is required for
41108+ both the directory containing old and the directory containing new.
41109+
41110+ [checked by VFS: vfs_rename->may_delete(), may_create()]
41111+
41112+ If the old argument points to the pathname of a directory, the new
41113+ argument shall not point to the pathname of a file that is not a
41114+ directory.
41115+
41116+ [checked by VFS: vfs_rename->may_delete()]
41117+
41118+ If the directory named by the new argument exists, it
41119+ shall be removed and old renamed to new. In this case, a link named
41120+ new shall exist throughout the renaming operation and shall refer
41121+ either to the directory referred to by new or old before the
41122+ operation began.
41123+
41124+ [we should assure this]
41125+
41126+ If new names an existing directory, it shall be
41127+ required to be an empty directory.
41128+
41129+ [we should check this]
41130+
41131+ If the old argument points to a pathname of a symbolic link, the
41132+ symbolic link shall be renamed. If the new argument points to a
41133+ pathname of a symbolic link, the symbolic link shall be removed.
41134+
41135+ The new pathname shall not contain a path prefix that names
41136+ old. Write access permission is required for the directory
41137+ containing old and the directory containing new. If the old
41138+ argument points to the pathname of a directory, write access
41139+ permission may be required for the directory named by old, and, if
41140+ it exists, the directory named by new.
41141+
41142+ [checked by VFS: vfs_rename(), vfs_rename_dir()]
41143+
41144+ If the link named by the new argument exists and the file's link
41145+ count becomes 0 when it is removed and no process has the file
41146+ open, the space occupied by the file shall be freed and the file
41147+ shall no longer be accessible. If one or more processes have the
41148+ file open when the last link is removed, the link shall be removed
41149+ before rename() returns, but the removal of the file contents shall
41150+ be postponed until all references to the file are closed.
41151+
41152+ [iput() handles this, but we can do this manually, a la
41153+ reiser4_unlink()]
41154+
41155+ Upon successful completion, rename() shall mark for update the
41156+ st_ctime and st_mtime fields of the parent directory of each file.
41157+
41158+ [N/A]
41159+
41160+ */
41161+ reiser4_context *ctx;
41162+ int result;
41163+ int is_dir; /* is @old_name directory */
41164+
41165+ struct inode *old_inode;
41166+ struct inode *new_inode;
41167+ coord_t *new_coord;
41168+
41169+ struct reiser4_dentry_fsdata *new_fsdata;
41170+ dir_plugin *dplug;
41171+ file_plugin *fplug;
41172+
41173+ reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
41174+ lock_handle *new_lh, *dotdot_lh;
41175+ struct dentry *dotdot_name;
41176+ struct reiser4_dentry_fsdata *dataonstack;
41177+
41178+ ctx = reiser4_init_context(old_dir->i_sb);
41179+ if (IS_ERR(ctx))
41180+ return PTR_ERR(ctx);
41181+
41182+ old_entry = kzalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
41183+ sizeof(*dotdot_name) + sizeof(*dataonstack),
41184+ reiser4_ctx_gfp_mask_get());
41185+ if (!old_entry) {
41186+ context_set_commit_async(ctx);
41187+ reiser4_exit_context(ctx);
41188+ return RETERR(-ENOMEM);
41189+ }
41190+
41191+ new_entry = old_entry + 1;
41192+ dotdot_entry = old_entry + 2;
41193+ new_lh = (lock_handle *)(old_entry + 3);
41194+ dotdot_lh = new_lh + 1;
41195+ dotdot_name = (struct dentry *)(new_lh + 2);
41196+ dataonstack = (struct reiser4_dentry_fsdata *)(dotdot_name + 1);
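+	/* all of the scratch objects above are carved out of the single
+	 * kzalloc'd buffer; this layout must match the size computation in
+	 * the allocation above */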
41197+
41198+ assert("nikita-2318", old_dir != NULL);
41199+ assert("nikita-2319", new_dir != NULL);
41200+ assert("nikita-2320", old_name != NULL);
41201+ assert("nikita-2321", new_name != NULL);
41202+
41203+ old_inode = old_name->d_inode;
41204+ new_inode = new_name->d_inode;
41205+
41206+ dplug = inode_dir_plugin(old_dir);
41207+ fplug = NULL;
41208+
41209+ new_fsdata = reiser4_get_dentry_fsdata(new_name);
41210+ if (IS_ERR(new_fsdata)) {
41211+ kfree(old_entry);
41212+ context_set_commit_async(ctx);
41213+ reiser4_exit_context(ctx);
41214+ return PTR_ERR(new_fsdata);
41215+ }
41216+
41217+ new_coord = &new_fsdata->dec.entry_coord;
41218+ coord_clear_iplug(new_coord);
41219+
41220+ is_dir = S_ISDIR(old_inode->i_mode);
41221+
41222+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
41223+
41224+	/* if the target is an existing non-empty directory, return an error.
41225+
41226+	   This check is done up front, because is_dir_empty() requires a
41227+	   tree traversal and has to be done before locks are taken.
41228+ */
41229+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
41230+ kfree(old_entry);
41231+ context_set_commit_async(ctx);
41232+ reiser4_exit_context(ctx);
41233+ return RETERR(-ENOTEMPTY);
41234+ }
41235+
41236+ result = can_rename(old_dir, old_inode, new_dir, new_inode);
41237+ if (result != 0) {
41238+ kfree(old_entry);
41239+ context_set_commit_async(ctx);
41240+ reiser4_exit_context(ctx);
41241+ return result;
41242+ }
41243+
41244+ result = hashed_rename_estimate_and_grab(old_dir, old_name,
41245+ new_dir, new_name);
41246+ if (result != 0) {
41247+ kfree(old_entry);
41248+ context_set_commit_async(ctx);
41249+ reiser4_exit_context(ctx);
41250+ return result;
41251+ }
41252+
41253+ init_lh(new_lh);
41254+
41255+ /* find entry for @new_name */
41256+ result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK,
41257+ new_entry);
41258+
41259+ if (IS_CBKERR(result)) {
41260+ done_lh(new_lh);
41261+ kfree(old_entry);
41262+ context_set_commit_async(ctx);
41263+ reiser4_exit_context(ctx);
41264+ return result;
41265+ }
41266+
41267+ reiser4_seal_done(&new_fsdata->dec.entry_seal);
41268+
41269+ /* add or replace name for @old_inode as @new_name */
41270+ if (new_inode != NULL) {
41271+ /* target (@new_name) exists. */
41272+ /* Not clear what to do with objects that are
41273+ both directories and files at the same time. */
41274+ if (result == CBK_COORD_FOUND) {
41275+ result = replace_name(old_inode,
41276+ new_dir,
41277+ new_inode, new_coord, new_lh);
41278+ if (result == 0)
41279+ fplug = inode_file_plugin(new_inode);
41280+ } else if (result == CBK_COORD_NOTFOUND) {
41281+			/* VFS told us that @new_name is bound to an existing
41282+			   inode, but we failed to find its directory entry. */
41283+ warning("nikita-2324", "Target not found");
41284+ result = RETERR(-ENOENT);
41285+ }
41286+ } else {
41287+		/* target (@new_name) doesn't exist. */
41288+ if (result == CBK_COORD_NOTFOUND)
41289+ result = add_name(old_inode,
41290+ new_dir,
41291+ new_name, new_coord, new_lh, is_dir);
41292+ else if (result == CBK_COORD_FOUND) {
41293+ /* VFS told us that @new_name is "negative" dentry,
41294+			   but we found a directory entry. */
41295+ warning("nikita-2331", "Target found unexpectedly");
41296+ result = RETERR(-EIO);
41297+ }
41298+ }
41299+
41300+ assert("nikita-3462", ergo(result == 0,
41301+ old_inode->i_nlink >= 2 + !!is_dir));
41302+
41303+	/* We are done with all modifications to @new_dir; release the lock
41304+	   on its node. */
41305+ done_lh(new_lh);
41306+
41307+ if (fplug != NULL) {
41308+ /* detach @new_inode from name-space */
41309+ result = fplug->detach(new_inode, new_dir);
41310+ if (result != 0)
41311+ warning("nikita-2330", "Cannot detach %lli: %i. %s",
41312+ (unsigned long long)get_inode_oid(new_inode),
41313+ result, possible_leak);
41314+ }
41315+
41316+ if (new_inode != NULL)
41317+ reiser4_update_sd(new_inode);
41318+
41319+ if (result == 0) {
41320+ old_entry->obj = old_inode;
41321+
41322+ dplug->build_entry_key(old_dir,
41323+ &old_name->d_name, &old_entry->key);
41324+
41325+		/* At this stage a new name has been introduced for
41326+		   @old_inode, and the i_nlink counters of @old_inode,
41327+		   @new_dir, and @new_inode have been updated.
41328+
41329+		   We want to remove @old_name now. If @old_inode wasn't a
41330+		   directory, this is simple.
41331+		 */
41332+ result = dplug->rem_entry(old_dir, old_name, old_entry);
41333+ if (result != 0 && result != -ENOMEM) {
41334+ warning("nikita-2335",
41335+ "Cannot remove old name: %i", result);
41336+ } else {
41337+ result = reiser4_del_nlink(old_inode, old_dir, 0);
41338+ if (result != 0 && result != -ENOMEM) {
41339+ warning("nikita-2337",
41340+ "Cannot drop link on old: %i", result);
41341+ }
41342+ }
41343+
41344+ if (result == 0 && is_dir) {
41345+ /* @old_inode is directory. We also have to update
41346+ dotdot entry. */
41347+ coord_t *dotdot_coord;
41348+
41349+			memset(dataonstack, 0, sizeof *dataonstack);
41350+			memset(dotdot_entry, 0, sizeof *dotdot_entry);
41351+			dotdot_entry->obj = old_dir;
41352+			memset(dotdot_name, 0, sizeof *dotdot_name);
41353+ dotdot_name->d_name.name = "..";
41354+ dotdot_name->d_name.len = 2;
41355+ /*
41356+			 * point ->d_fsdata at the preallocated scratch area,
41357+			 * avoiding reiser4_get_dentry_fsdata(). Locking is not
41358+			 * needed, because the dentry is private to this thread.
41359+ */
41360+ dotdot_name->d_fsdata = dataonstack;
41361+ init_lh(dotdot_lh);
41362+
41363+ dotdot_coord = &dataonstack->dec.entry_coord;
41364+ coord_clear_iplug(dotdot_coord);
41365+
41366+ result = reiser4_find_entry(old_inode, dotdot_name,
41367+ dotdot_lh, ZNODE_WRITE_LOCK,
41368+ dotdot_entry);
41369+ if (result == 0) {
41370+ /* replace_name() decreases i_nlink on
41371+ * @old_dir */
41372+ result = replace_name(new_dir,
41373+ old_inode,
41374+ old_dir,
41375+ dotdot_coord, dotdot_lh);
41376+ } else
41377+ result = RETERR(-EIO);
41378+ done_lh(dotdot_lh);
41379+ }
41380+ }
41381+ reiser4_update_dir(new_dir);
41382+ reiser4_update_dir(old_dir);
41383+ reiser4_update_sd(old_inode);
41384+ if (result == 0) {
41385+ file_plugin *fplug;
41386+
41387+ if (new_inode != NULL) {
41388+			/* add safe-link for target file (in case we removed
41389+			 * the last reference to the poor fellow) */
41390+ fplug = inode_file_plugin(new_inode);
41391+ if (new_inode->i_nlink == 0)
41392+ result = safe_link_add(new_inode, SAFE_UNLINK);
41393+ }
41394+ }
41395+ kfree(old_entry);
41396+ context_set_commit_async(ctx);
41397+ reiser4_exit_context(ctx);
41398+ return result;
41399+}
41400+
41401+#if 0
41402+int reiser4_rename_common(struct inode *old_dir /* directory where @old
41403+ * is located */ ,
41404+ struct dentry *old_name /* old name */ ,
41405+ struct inode *new_dir /* directory where @new
41406+ * is located */ ,
41407+ struct dentry *new_name /* new name */ )
41408+{
41409+ /* From `The Open Group Base Specifications Issue 6'
41410+
41411+ If either the old or new argument names a symbolic link, rename()
41412+ shall operate on the symbolic link itself, and shall not resolve
41413+ the last component of the argument. If the old argument and the new
41414+ argument resolve to the same existing file, rename() shall return
41415+ successfully and perform no other action.
41416+
41417+ [this is done by VFS: vfs_rename()]
41418+
41419+ If the old argument points to the pathname of a file that is not a
41420+ directory, the new argument shall not point to the pathname of a
41421+ directory.
41422+
41423+ [checked by VFS: vfs_rename->may_delete()]
41424+
41425+ If the link named by the new argument exists, it shall
41426+ be removed and old renamed to new. In this case, a link named new
41427+ shall remain visible to other processes throughout the renaming
41428+ operation and refer either to the file referred to by new or old
41429+ before the operation began.
41430+
41431+ [we should assure this]
41432+
41433+ Write access permission is required for
41434+ both the directory containing old and the directory containing new.
41435+
41436+ [checked by VFS: vfs_rename->may_delete(), may_create()]
41437+
41438+ If the old argument points to the pathname of a directory, the new
41439+ argument shall not point to the pathname of a file that is not a
41440+ directory.
41441+
41442+ [checked by VFS: vfs_rename->may_delete()]
41443+
41444+ If the directory named by the new argument exists, it
41445+ shall be removed and old renamed to new. In this case, a link named
41446+ new shall exist throughout the renaming operation and shall refer
41447+ either to the directory referred to by new or old before the
41448+ operation began.
41449+
41450+ [we should assure this]
41451+
41452+ If new names an existing directory, it shall be
41453+ required to be an empty directory.
41454+
41455+ [we should check this]
41456+
41457+ If the old argument points to a pathname of a symbolic link, the
41458+ symbolic link shall be renamed. If the new argument points to a
41459+ pathname of a symbolic link, the symbolic link shall be removed.
41460+
41461+ The new pathname shall not contain a path prefix that names
41462+ old. Write access permission is required for the directory
41463+ containing old and the directory containing new. If the old
41464+ argument points to the pathname of a directory, write access
41465+ permission may be required for the directory named by old, and, if
41466+ it exists, the directory named by new.
41467+
41468+ [checked by VFS: vfs_rename(), vfs_rename_dir()]
41469+
41470+ If the link named by the new argument exists and the file's link
41471+ count becomes 0 when it is removed and no process has the file
41472+ open, the space occupied by the file shall be freed and the file
41473+ shall no longer be accessible. If one or more processes have the
41474+ file open when the last link is removed, the link shall be removed
41475+ before rename() returns, but the removal of the file contents shall
41476+ be postponed until all references to the file are closed.
41477+
41478+ [iput() handles this, but we can do this manually, a la
41479+ reiser4_unlink()]
41480+
41481+ Upon successful completion, rename() shall mark for update the
41482+ st_ctime and st_mtime fields of the parent directory of each file.
41483+
41484+ [N/A]
41485+
41486+ */
41487+ reiser4_context *ctx;
41488+ int result;
41489+ int is_dir; /* is @old_name directory */
41490+ struct inode *old_inode;
41491+ struct inode *new_inode;
41492+ reiser4_dir_entry_desc old_entry;
41493+ reiser4_dir_entry_desc new_entry;
41494+ coord_t *new_coord;
41495+ struct reiser4_dentry_fsdata *new_fsdata;
41496+ lock_handle new_lh;
41497+ dir_plugin *dplug;
41498+ file_plugin *fplug;
41499+
41500+ ctx = reiser4_init_context(old_dir->i_sb);
41501+ if (IS_ERR(ctx))
41502+ return PTR_ERR(ctx);
41503+
41504+ assert("nikita-2318", old_dir != NULL);
41505+ assert("nikita-2319", new_dir != NULL);
41506+ assert("nikita-2320", old_name != NULL);
41507+ assert("nikita-2321", new_name != NULL);
41508+
41509+ old_inode = old_name->d_inode;
41510+ new_inode = new_name->d_inode;
41511+
41512+ dplug = inode_dir_plugin(old_dir);
41513+ fplug = NULL;
41514+
41515+ new_fsdata = reiser4_get_dentry_fsdata(new_name);
41516+ if (IS_ERR(new_fsdata)) {
41517+ result = PTR_ERR(new_fsdata);
41518+ goto exit;
41519+ }
41520+
41521+ new_coord = &new_fsdata->dec.entry_coord;
41522+ coord_clear_iplug(new_coord);
41523+
41524+ is_dir = S_ISDIR(old_inode->i_mode);
41525+
41526+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
41527+
41528+	/* if the target is an existing non-empty directory, return an error.
41529+
41530+	   This check is done up front, because is_dir_empty() requires a
41531+	   tree traversal and has to be done before locks are taken.
41532+ */
41533+	if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
41534+		result = RETERR(-ENOTEMPTY); goto exit; }
41535+
41536+ result = can_rename(old_dir, old_inode, new_dir, new_inode);
41537+ if (result != 0)
41538+ goto exit;
41539+
41540+ result = hashed_rename_estimate_and_grab(old_dir, old_name,
41541+ new_dir, new_name);
41542+ if (result != 0)
41543+ goto exit;
41544+
41545+ init_lh(&new_lh);
41546+
41547+ /* find entry for @new_name */
41548+ result = reiser4_find_entry(new_dir, new_name, &new_lh,
41549+ ZNODE_WRITE_LOCK, &new_entry);
41550+
41551+ if (IS_CBKERR(result)) {
41552+ done_lh(&new_lh);
41553+ goto exit;
41554+ }
41555+
41556+ reiser4_seal_done(&new_fsdata->dec.entry_seal);
41557+
41558+ /* add or replace name for @old_inode as @new_name */
41559+ if (new_inode != NULL) {
41560+ /* target (@new_name) exists. */
41561+ /* Not clear what to do with objects that are
41562+ both directories and files at the same time. */
41563+ if (result == CBK_COORD_FOUND) {
41564+ result = replace_name(old_inode,
41565+ new_dir,
41566+ new_inode, new_coord, &new_lh);
41567+ if (result == 0)
41568+ fplug = inode_file_plugin(new_inode);
41569+ } else if (result == CBK_COORD_NOTFOUND) {
41570+			/* VFS told us that @new_name is bound to an existing
41571+			   inode, but we failed to find its directory entry. */
41572+ warning("nikita-2324", "Target not found");
41573+ result = RETERR(-ENOENT);
41574+ }
41575+ } else {
41576+		/* target (@new_name) doesn't exist. */
41577+ if (result == CBK_COORD_NOTFOUND)
41578+ result = add_name(old_inode,
41579+ new_dir,
41580+ new_name, new_coord, &new_lh, is_dir);
41581+ else if (result == CBK_COORD_FOUND) {
41582+ /* VFS told us that @new_name is "negative" dentry,
41583+			   but we found a directory entry. */
41584+ warning("nikita-2331", "Target found unexpectedly");
41585+ result = RETERR(-EIO);
41586+ }
41587+ }
41588+
41589+ assert("nikita-3462", ergo(result == 0,
41590+ old_inode->i_nlink >= 2 + !!is_dir));
41591+
41592+	/* We are done with all modifications to @new_dir; release the lock
41593+	   on its node. */
41594+ done_lh(&new_lh);
41595+
41596+ if (fplug != NULL) {
41597+ /* detach @new_inode from name-space */
41598+ result = fplug->detach(new_inode, new_dir);
41599+ if (result != 0)
41600+ warning("nikita-2330", "Cannot detach %lli: %i. %s",
41601+ (unsigned long long)get_inode_oid(new_inode),
41602+ result, possible_leak);
41603+ }
41604+
41605+ if (new_inode != NULL)
41606+ reiser4_update_sd(new_inode);
41607+
41608+ if (result == 0) {
41609+ memset(&old_entry, 0, sizeof old_entry);
41610+ old_entry.obj = old_inode;
41611+
41612+ dplug->build_entry_key(old_dir,
41613+ &old_name->d_name, &old_entry.key);
41614+
41615+		/* At this stage a new name has been introduced for
41616+		   @old_inode, and the i_nlink counters of @old_inode,
41617+		   @new_dir, and @new_inode have been updated.
41618+
41619+		   We want to remove @old_name now. If @old_inode wasn't a
41620+		   directory, this is simple.
41621+		 */
41622+ result = dplug->rem_entry(old_dir, old_name, &old_entry);
41623+ /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
41624+ if (result != 0 && result != -ENOMEM) {
41625+ warning("nikita-2335",
41626+ "Cannot remove old name: %i", result);
41627+ } else {
41628+ result = reiser4_del_nlink(old_inode, old_dir, 0);
41629+ if (result != 0 && result != -ENOMEM) {
41630+ warning("nikita-2337",
41631+ "Cannot drop link on old: %i", result);
41632+ }
41633+ }
41634+
41635+ if (result == 0 && is_dir) {
41636+ /* @old_inode is directory. We also have to update
41637+ dotdot entry. */
41638+ coord_t *dotdot_coord;
41639+ lock_handle dotdot_lh;
41640+ struct dentry dotdot_name;
41641+ reiser4_dir_entry_desc dotdot_entry;
41642+ struct reiser4_dentry_fsdata dataonstack;
41643+ struct reiser4_dentry_fsdata *fsdata;
41644+
41645+ memset(&dataonstack, 0, sizeof dataonstack);
41646+ memset(&dotdot_entry, 0, sizeof dotdot_entry);
41647+ dotdot_entry.obj = old_dir;
41648+ memset(&dotdot_name, 0, sizeof dotdot_name);
41649+ dotdot_name.d_name.name = "..";
41650+ dotdot_name.d_name.len = 2;
41651+ /*
41652+ * allocate ->d_fsdata on the stack to avoid using
41653+ * reiser4_get_dentry_fsdata(). Locking is not needed,
41654+ * because dentry is private to the current thread.
41655+ */
41656+ dotdot_name.d_fsdata = &dataonstack;
41657+ init_lh(&dotdot_lh);
41658+
41659+ fsdata = &dataonstack;
41660+ dotdot_coord = &fsdata->dec.entry_coord;
41661+ coord_clear_iplug(dotdot_coord);
41662+
41663+ result = reiser4_find_entry(old_inode,
41664+ &dotdot_name,
41665+ &dotdot_lh,
41666+ ZNODE_WRITE_LOCK,
41667+ &dotdot_entry);
41668+ if (result == 0) {
41669+ /* replace_name() decreases i_nlink on
41670+ * @old_dir */
41671+ result = replace_name(new_dir,
41672+ old_inode,
41673+ old_dir,
41674+ dotdot_coord, &dotdot_lh);
41675+ } else
41676+ result = RETERR(-EIO);
41677+ done_lh(&dotdot_lh);
41678+ }
41679+ }
41680+ reiser4_update_dir(new_dir);
41681+ reiser4_update_dir(old_dir);
41682+ reiser4_update_sd(old_inode);
41683+ if (result == 0) {
41684+ file_plugin *fplug;
41685+
41686+ if (new_inode != NULL) {
41687+			/* add safe-link for target file (in case we removed
41688+			 * the last reference to the poor fellow) */
41689+ fplug = inode_file_plugin(new_inode);
41690+ if (new_inode->i_nlink == 0)
41691+ result = safe_link_add(new_inode, SAFE_UNLINK);
41692+ }
41693+ }
41694+ exit:
41695+ context_set_commit_async(ctx);
41696+ reiser4_exit_context(ctx);
41697+ return result;
41698+}
41699+#endif
41700diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/acl.h linux-2.6.24/fs/reiser4/plugin/item/acl.h
41701--- linux-2.6.24.orig/fs/reiser4/plugin/item/acl.h 1970-01-01 03:00:00.000000000 +0300
41702+++ linux-2.6.24/fs/reiser4/plugin/item/acl.h 2008-01-25 11:39:07.000224175 +0300
41703@@ -0,0 +1,66 @@
41704+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
41705+
41706+/* Directory entry. */
41707+
41708+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
41709+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
41710+
41711+#include "../../forward.h"
41712+#include "../../dformat.h"
41713+#include "../../kassign.h"
41714+#include "../../key.h"
41715+
41716+#include <linux/fs.h>
41717+#include <linux/dcache.h> /* for struct dentry */
41718+
41719+typedef struct directory_entry_format {
41720+	/* key of the object's stat-data. It's not necessary to store the
41721+	   whole key here, because it's always a stat-data key, so the minor
41722+	   packing locality and offset could be omitted. But this relies on
41723+	   a particular key allocation scheme for stat-data, so, for
41724+	   extensibility's sake, the whole key is stored here.
41725+
41726+	   We store the key as an array of bytes, because we don't want
41727+	   8-byte alignment of dir entries.
41728+ */
41729+ obj_key_id id;
41730+ /* file name. Null terminated string. */
41731+ d8 name[0];
41732+} directory_entry_format;
41733+
41734+void print_de(const char *prefix, coord_t * coord);
41735+int extract_key_de(const coord_t * coord, reiser4_key * key);
41736+int update_key_de(const coord_t * coord, const reiser4_key * key,
41737+ lock_handle * lh);
41738+char *extract_name_de(const coord_t * coord, char *buf);
41739+unsigned extract_file_type_de(const coord_t * coord);
41740+int add_entry_de(struct inode *dir, coord_t * coord,
41741+ lock_handle * lh, const struct dentry *name,
41742+ reiser4_dir_entry_desc * entry);
41743+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
41744+ lock_handle * lh, reiser4_dir_entry_desc * entry);
41745+int max_name_len_de(const struct inode *dir);
41746+
41747+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
41748+
41749+char *extract_dent_name(const coord_t * coord,
41750+ directory_entry_format * dent, char *buf);
41751+
41752+#if REISER4_LARGE_KEY
41753+#define DE_NAME_BUF_LEN (24)
41754+#else
41755+#define DE_NAME_BUF_LEN (16)
41756+#endif
41757+
41758+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
41759+#endif
41760+
41761+/* Make Linus happy.
41762+ Local variables:
41763+ c-indentation-style: "K&R"
41764+ mode-name: "LC"
41765+ c-basic-offset: 8
41766+ tab-width: 8
41767+ fill-column: 120
41768+ End:
41769+*/
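directory_entry_format above ends in a zero-length array, so each entry is allocated as one block: the fixed-size key id followed immediately by the NUL-terminated name. A standalone sketch of that allocation pattern, with a placeholder byte array standing in for obj_key_id and the C99 flexible-array spelling replacing the older name[0] idiom (all toy_* names are hypothetical):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Same shape as directory_entry_format: fixed header, then the
	 * name stored inline with no padding between them. */
	struct toy_dirent {
		unsigned char id[16];		/* placeholder for obj_key_id */
		char name[];			/* NUL-terminated, inline */
	};

	static struct toy_dirent *toy_dirent_new(const char *name)
	{
		size_t len = strlen(name);
		struct toy_dirent *de = malloc(sizeof *de + len + 1);

		if (de == NULL)
			return NULL;
		memset(de->id, 0, sizeof de->id);
		memcpy(de->name, name, len + 1);	/* name plus terminator */
		return de;
	}

	int main(void)
	{
		struct toy_dirent *de = toy_dirent_new("example");

		if (de != NULL) {
			printf("fixed part %zu bytes, whole entry %zu bytes\n",
			       sizeof *de, sizeof *de + strlen(de->name) + 1);
			free(de);
		}
		return 0;
	}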
41770diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/blackbox.c linux-2.6.24/fs/reiser4/plugin/item/blackbox.c
41771--- linux-2.6.24.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 03:00:00.000000000 +0300
41772+++ linux-2.6.24/fs/reiser4/plugin/item/blackbox.c 2008-01-25 11:39:07.004225206 +0300
41773@@ -0,0 +1,142 @@
41774+/* Copyright 2003 by Hans Reiser, licensing governed by
41775+ * reiser4/README */
41776+
41777+/* Black box item implementation */
41778+
41779+#include "../../forward.h"
41780+#include "../../debug.h"
41781+#include "../../dformat.h"
41782+#include "../../kassign.h"
41783+#include "../../coord.h"
41784+#include "../../tree.h"
41785+#include "../../lock.h"
41786+
41787+#include "blackbox.h"
41788+#include "item.h"
41789+#include "../plugin.h"
41790+
41791+int
41792+store_black_box(reiser4_tree * tree,
41793+ const reiser4_key * key, void *data, int length)
41794+{
41795+ int result;
41796+ reiser4_item_data idata;
41797+ coord_t coord;
41798+ lock_handle lh;
41799+
41800+ memset(&idata, 0, sizeof idata);
41801+
41802+ idata.data = data;
41803+ idata.user = 0;
41804+ idata.length = length;
41805+ idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
41806+
41807+ init_lh(&lh);
41808+ result = insert_by_key(tree, key,
41809+ &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
41810+
41811+ assert("nikita-3413",
41812+ ergo(result == 0,
41813+ WITH_COORD(&coord,
41814+ item_length_by_coord(&coord) == length)));
41815+
41816+ done_lh(&lh);
41817+ return result;
41818+}
41819+
41820+int
41821+load_black_box(reiser4_tree * tree,
41822+ reiser4_key * key, void *data, int length, int exact)
41823+{
41824+ int result;
41825+ coord_t coord;
41826+ lock_handle lh;
41827+
41828+ init_lh(&lh);
41829+ result = coord_by_key(tree, key,
41830+ &coord, &lh, ZNODE_READ_LOCK,
41831+ exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
41832+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
41833+
41834+ if (result == 0) {
41835+ int ilen;
41836+
41837+ result = zload(coord.node);
41838+ if (result == 0) {
41839+ ilen = item_length_by_coord(&coord);
41840+ if (ilen <= length) {
41841+ memcpy(data, item_body_by_coord(&coord), ilen);
41842+ unit_key_by_coord(&coord, key);
41843+ } else if (exact) {
41844+ /*
41845+ * item is larger than buffer provided by the
41846+ * user. Only issue a warning if @exact is
41847+ * set. If @exact is false, we are iterating
41848+ * over all safe-links and here we are reaching
41849+ * the end of the iteration.
41850+ */
41851+ warning("nikita-3415",
41852+ "Wrong black box length: %i > %i",
41853+ ilen, length);
41854+ result = RETERR(-EIO);
41855+ }
41856+ zrelse(coord.node);
41857+ }
41858+ }
41859+
41860+ done_lh(&lh);
41861+ return result;
41862+
41863+}
41864+
41865+int
41866+update_black_box(reiser4_tree * tree,
41867+ const reiser4_key * key, void *data, int length)
41868+{
41869+ int result;
41870+ coord_t coord;
41871+ lock_handle lh;
41872+
41873+ init_lh(&lh);
41874+ result = coord_by_key(tree, key,
41875+ &coord, &lh, ZNODE_READ_LOCK,
41876+ FIND_EXACT,
41877+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
41878+ if (result == 0) {
41879+ int ilen;
41880+
41881+ result = zload(coord.node);
41882+ if (result == 0) {
41883+ ilen = item_length_by_coord(&coord);
41884+ if (length <= ilen) {
41885+ memcpy(item_body_by_coord(&coord), data,
41886+ length);
41887+ } else {
41888+ warning("nikita-3437",
41889+ "Wrong black box length: %i < %i",
41890+ ilen, length);
41891+ result = RETERR(-EIO);
41892+ }
41893+ zrelse(coord.node);
41894+ }
41895+ }
41896+
41897+ done_lh(&lh);
41898+ return result;
41899+
41900+}
41901+
41902+int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
41903+{
41904+ return reiser4_cut_tree(tree, key, key, NULL, 1);
41905+}
41906+
41907+/* Make Linus happy.
41908+ Local variables:
41909+ c-indentation-style: "K&R"
41910+ mode-name: "LC"
41911+ c-basic-offset: 8
41912+ tab-width: 8
41913+ fill-column: 120
41914+ End:
41915+*/
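store/load/update/kill above form a small fixed-record store keyed by reiser4 keys, with load_black_box() refusing a record larger than the caller's buffer when @exact is set. The same semantics over a flat array instead of the balanced tree, as a self-contained userspace toy (the TOY_* constants and toy_* names are made up):

	#include <stdio.h>
	#include <string.h>

	#define TOY_SLOTS  8
	#define TOY_RECLEN 16

	/* One slot per key; the real code keys records in the tree. */
	struct toy_box {
		unsigned long long key;
		int used;
		int length;
		char data[TOY_RECLEN];
	};

	static struct toy_box boxes[TOY_SLOTS];

	static struct toy_box *toy_find(unsigned long long key)
	{
		int i;

		for (i = 0; i < TOY_SLOTS; i++)
			if (boxes[i].used && boxes[i].key == key)
				return &boxes[i];
		return NULL;
	}

	/* cf. store_black_box(): insert a new record under @key */
	static int toy_store(unsigned long long key, const void *data,
			     int length)
	{
		int i;

		if (length > TOY_RECLEN || toy_find(key) != NULL)
			return -1;
		for (i = 0; i < TOY_SLOTS; i++)
			if (!boxes[i].used) {
				boxes[i].used = 1;
				boxes[i].key = key;
				boxes[i].length = length;
				memcpy(boxes[i].data, data, length);
				return 0;
			}
		return -1;
	}

	/* cf. load_black_box() with @exact set: refuse a record that
	 * would overflow the caller's buffer */
	static int toy_load(unsigned long long key, void *data, int length)
	{
		struct toy_box *b = toy_find(key);

		if (b == NULL || b->length > length)
			return -1;
		memcpy(data, b->data, b->length);
		return b->length;
	}

	/* cf. kill_black_box(): cut the record out */
	static void toy_kill(unsigned long long key)
	{
		struct toy_box *b = toy_find(key);

		if (b != NULL)
			b->used = 0;
	}

	int main(void)
	{
		char buf[TOY_RECLEN];
		int n;

		toy_store(42, "safe-link", 10);
		n = toy_load(42, buf, sizeof buf);
		if (n > 0)
			printf("loaded %d bytes: %s\n", n, buf);
		toy_kill(42);
		printf("after kill: %d\n", toy_load(42, buf, sizeof buf));
		return 0;
	}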
41916diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/blackbox.h linux-2.6.24/fs/reiser4/plugin/item/blackbox.h
41917--- linux-2.6.24.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 03:00:00.000000000 +0300
41918+++ linux-2.6.24/fs/reiser4/plugin/item/blackbox.h 2008-01-25 11:39:07.004225206 +0300
41919@@ -0,0 +1,33 @@
41920+/* Copyright 2003 by Hans Reiser, licensing governed by
41921+ * reiser4/README */
41922+
41923+/* "Black box" item for containing fixed-width, user-supplied data */
41924+
41925+#if !defined( __FS_REISER4_BLACK_BOX_H__ )
41926+#define __FS_REISER4_BLACK_BOX_H__
41927+
41928+#include "../../forward.h"
41929+#include "../../dformat.h"
41930+#include "../../kassign.h"
41931+#include "../../key.h"
41932+
41933+extern int store_black_box(reiser4_tree * tree,
41934+ const reiser4_key * key, void *data, int length);
41935+extern int load_black_box(reiser4_tree * tree,
41936+ reiser4_key * key, void *data, int length, int exact);
41937+extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
41938+extern int update_black_box(reiser4_tree * tree,
41939+ const reiser4_key * key, void *data, int length);
41940+
41941+/* __FS_REISER4_BLACK_BOX_H__ */
41942+#endif
41943+
41944+/* Make Linus happy.
41945+ Local variables:
41946+ c-indentation-style: "K&R"
41947+ mode-name: "LC"
41948+ c-basic-offset: 8
41949+ tab-width: 8
41950+ fill-column: 120
41951+ End:
41952+*/
41953diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/cde.c linux-2.6.24/fs/reiser4/plugin/item/cde.c
41954--- linux-2.6.24.orig/fs/reiser4/plugin/item/cde.c 1970-01-01 03:00:00.000000000 +0300
41955+++ linux-2.6.24/fs/reiser4/plugin/item/cde.c 2008-01-25 11:39:07.004225206 +0300
41956@@ -0,0 +1,1008 @@
41957+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
41958+
41959+/* Directory entry implementation */
41960+
41961+/* DESCRIPTION:
41962+
41963+ This is "compound" directory item plugin implementation. This directory
41964+ item type is compound (as opposed to the "simple directory item" in
41965+ fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
41966+ entries.
41967+
41968+   The reason behind this decision is disk space efficiency: all directory
41969+   entries inside the same directory share an identical fragment in their
41970+   keys. This, of course, depends on the key assignment policy. In our default
41971+   key assignment policy, all directory entries have the same locality, which
41972+   is equal to the object id of their directory.
41973+
41974+   Composing a directory item out of several directory entries for the same
41975+   directory allows us to store that key fragment only once. That is, an ad
41976+   hoc form of key compression (stem compression) is implemented here,
41977+   because general key compression is not supposed to be implemented in
41978+   v4.0.
41979+
41980+   Another decision that was made regarding all directory item plugins is
41981+   that they will store entry keys unaligned. This again is for the sake of
41982+   disk space efficiency.
41983+
41984+   It should be noted that storing keys unaligned increases CPU consumption,
41985+   at least on some architectures.
41986+
41987+ Internal on-disk structure of the compound directory item is the following:
41988+
41989+ HEADER cde_item_format. Here number of entries is stored.
41990+ ENTRY_HEADER_0 cde_unit_header. Here part of entry key and
41991+ ENTRY_HEADER_1 offset of entry body are stored.
41992+ ENTRY_HEADER_2 (basically two last parts of key)
41993+ ...
41994+ ENTRY_HEADER_N
41995+ ENTRY_BODY_0 directory_entry_format. Here part of stat data key and
41996+ ENTRY_BODY_1 NUL-terminated name are stored.
41997+ ENTRY_BODY_2 (part of the stat-data key, in the
41998+ sense that since all SDs have
41999+ zero offset, this offset is not
42000+ stored on disk).
42001+ ...
42002+ ENTRY_BODY_N
42003+
42004+   When it comes to balancing, each directory entry in a compound directory
42005+   item is a unit, that is, something that can be cut from one item and pasted
42006+   into another item of the same type. Handling of unit cut and paste is the
42007+   major reason for the complexity of the code below.
42008+
42009+*/
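With the layout above, finding entry @idx is two lookups: read header[idx].offset, then index the raw item body by it, which is all offset_of() and entry_at() below do. A compressed sketch of that addressing over one flat buffer (host-endian shorts stand in for the unaligned on-disk d16 fields; everything toy_* is hypothetical):

	#include <stdio.h>
	#include <string.h>

	/* One flat buffer shaped like a cde item:
	 *   [num_of_entries][header 0..N-1][body 0..N-1]
	 * Each header stores its body's offset from the item start. */
	static unsigned char item[64];

	#define HDR(i) ((unsigned short *)(item + sizeof(unsigned short)) + (i))

	static char *toy_entry_at(int idx)
	{
		return (char *)item + *HDR(idx);
	}

	int main(void)
	{
		unsigned short n = 2;
		unsigned short body0 = sizeof n + 2 * sizeof(unsigned short);

		memcpy(item, &n, sizeof n);
		*HDR(0) = body0;
		*HDR(1) = body0 + (unsigned short)strlen("alpha") + 1;
		strcpy(toy_entry_at(0), "alpha");
		strcpy(toy_entry_at(1), "beta");
		printf("entry 0 at offset %u: %s\n",
		       (unsigned)*HDR(0), toy_entry_at(0));
		printf("entry 1 at offset %u: %s\n",
		       (unsigned)*HDR(1), toy_entry_at(1));
		return 0;
	}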
42010+
42011+#include "../../forward.h"
42012+#include "../../debug.h"
42013+#include "../../dformat.h"
42014+#include "../../kassign.h"
42015+#include "../../key.h"
42016+#include "../../coord.h"
42017+#include "sde.h"
42018+#include "cde.h"
42019+#include "item.h"
42020+#include "../node/node.h"
42021+#include "../plugin.h"
42022+#include "../../znode.h"
42023+#include "../../carry.h"
42024+#include "../../tree.h"
42025+#include "../../inode.h"
42026+
42027+#include <linux/fs.h> /* for struct inode */
42028+#include <linux/dcache.h> /* for struct dentry */
42029+#include <linux/quotaops.h>
42030+
42031+#if 0
42032+#define CHECKME(coord) \
42033+({ \
42034+ const char *message; \
42035+ coord_t dup; \
42036+ \
42037+ coord_dup_nocheck(&dup, (coord)); \
42038+ dup.unit_pos = 0; \
42039+ assert("nikita-2871", cde_check(&dup, &message) == 0); \
42040+})
42041+#else
42042+#define CHECKME(coord) noop
42043+#endif
42044+
42045+/* return body of compound directory item at @coord */
42046+static inline cde_item_format *formatted_at(const coord_t * coord)
42047+{
42048+ assert("nikita-1282", coord != NULL);
42049+ return item_body_by_coord(coord);
42050+}
42051+
42052+/* return entry header at @coord */
42053+static inline cde_unit_header *header_at(const coord_t *
42054+ coord /* coord of item */ ,
42055+ int idx /* index of unit */ )
42056+{
42057+ assert("nikita-1283", coord != NULL);
42058+ return &formatted_at(coord)->entry[idx];
42059+}
42060+
42061+/* return number of units in compound directory item at @coord */
42062+static int units(const coord_t * coord /* coord of item */ )
42063+{
42064+ return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
42065+}
42066+
42067+/* return offset of the body of @idx-th entry in @coord */
42068+static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
42069+ int idx /* index of unit */ )
42070+{
42071+ if (idx < units(coord))
42072+ return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
42073+ else if (idx == units(coord))
42074+ return item_length_by_coord(coord);
42075+ else
42076+ impossible("nikita-1308", "Wrong idx");
42077+ return 0;
42078+}
42079+
42080+/* set offset of the body of @idx-th entry in @coord */
42081+static void set_offset(const coord_t * coord /* coord of item */ ,
42082+ int idx /* index of unit */ ,
42083+ unsigned int offset /* new offset */ )
42084+{
42085+ put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
42086+}
42087+
42088+static void adj_offset(const coord_t * coord /* coord of item */ ,
42089+ int idx /* index of unit */ ,
42090+ int delta /* offset change */ )
42091+{
42092+ d16 *doffset;
42093+ __u16 offset;
42094+
42095+ doffset = &header_at(coord, idx)->offset;
42096+ offset = le16_to_cpu(get_unaligned(doffset));
42097+ offset += delta;
42098+ put_unaligned(cpu_to_le16((__u16) offset), doffset);
42099+}
42100+
42101+/* return pointer to @offset-th byte from the beginning of @coord */
42102+static char *address(const coord_t * coord /* coord of item */ ,
42103+ int offset)
42104+{
42105+ return ((char *)item_body_by_coord(coord)) + offset;
42106+}
42107+
42108+/* return pointer to the body of @idx-th entry in @coord */
42109+static directory_entry_format *entry_at(const coord_t * coord /* coord of
42110+ * item */ ,
42111+ int idx /* index of unit */ )
42112+{
42113+ return (directory_entry_format *) address(coord,
42114+ (int)offset_of(coord, idx));
42115+}
42116+
42117+/* return number of unit referenced by @coord */
42118+static int idx_of(const coord_t * coord /* coord of item */ )
42119+{
42120+ assert("nikita-1285", coord != NULL);
42121+ return coord->unit_pos;
42122+}
42123+
42124+/* find position where entry with @entry_key would be inserted into @coord */
42125+static int find(const coord_t * coord /* coord of item */ ,
42126+ const reiser4_key * entry_key /* key to look for */ ,
42127+ cmp_t * last /* result of last comparison */ )
42128+{
42129+ int entries;
42130+
42131+ int left;
42132+ int right;
42133+
42134+ cde_unit_header *header;
42135+
42136+ assert("nikita-1295", coord != NULL);
42137+ assert("nikita-1296", entry_key != NULL);
42138+ assert("nikita-1297", last != NULL);
42139+
42140+ entries = units(coord);
42141+ left = 0;
42142+ right = entries - 1;
42143+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
42144+ int median;
42145+
42146+ median = (left + right) >> 1;
42147+
42148+ header = header_at(coord, median);
42149+ *last = de_id_key_cmp(&header->hash, entry_key);
42150+ switch (*last) {
42151+ case LESS_THAN:
42152+ left = median;
42153+ break;
42154+ case GREATER_THAN:
42155+ right = median;
42156+ break;
42157+ case EQUAL_TO:{
42158+ do {
42159+ median--;
42160+ header--;
42161+ } while (median >= 0 &&
42162+ de_id_key_cmp(&header->hash,
42163+ entry_key) == EQUAL_TO);
42164+ return median + 1;
42165+ }
42166+ }
42167+ }
42168+ header = header_at(coord, left);
42169+ for (; left < entries; ++left, ++header) {
42170+ prefetch(header + 1);
42171+ *last = de_id_key_cmp(&header->hash, entry_key);
42172+ if (*last != LESS_THAN)
42173+ break;
42174+ }
42175+ if (left < entries)
42176+ return left;
42177+ else
42178+ return RETERR(-ENOENT);
42179+
42180+}
42181+
42182+/* expand @coord so as to accommodate insertion of @no new entries starting
42183+   from @pos, with total body size @size. */
42184+static int expand_item(const coord_t * coord /* coord of item */ ,
42185+ int pos /* unit position */ , int no /* number of new
42186+ * units*/ ,
42187+ int size /* total size of new units' data */ ,
42188+ unsigned int data_size /* free space already reserved
42189+ * in the item for insertion */ )
42190+{
42191+ int entries;
42192+ cde_unit_header *header;
42193+ char *dent;
42194+ int i;
42195+
42196+ assert("nikita-1310", coord != NULL);
42197+ assert("nikita-1311", pos >= 0);
42198+ assert("nikita-1312", no > 0);
42199+ assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
42200+ assert("nikita-1343",
42201+ item_length_by_coord(coord) >=
42202+ (int)(size + data_size + no * sizeof *header));
42203+
42204+ entries = units(coord);
42205+
42206+ if (pos == entries)
42207+ dent = address(coord, size);
42208+ else
42209+ dent = (char *)entry_at(coord, pos);
42210+	/* place where the new header will be */
42211+ header = header_at(coord, pos);
42212+ /* free space for new entry headers */
42213+ memmove(header + no, header,
42214+ (unsigned)(address(coord, size) - (char *)header));
42215+	/* if adding to the end, initialise the first new header */
42216+ if (pos == entries) {
42217+ set_offset(coord, pos, (unsigned)size);
42218+ }
42219+
42220+ /* adjust entry pointer and size */
42221+ dent = dent + no * sizeof *header;
42222+ size += no * sizeof *header;
42223+ /* free space for new entries */
42224+ memmove(dent + data_size, dent,
42225+ (unsigned)(address(coord, size) - dent));
42226+
42227+ /* increase counter */
42228+ entries += no;
42229+ put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
42230+
42231+ /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
42232+ bytes. */
42233+ for (i = 0; i <= pos; ++i)
42234+ adj_offset(coord, i, no * sizeof *header);
42235+ /* [ pos + no ... +\infty ) entries were shifted by ( no *
42236+ sizeof *header + data_size ) bytes */
42237+ for (i = pos + no; i < entries; ++i)
42238+ adj_offset(coord, i, no * sizeof *header + data_size);
42239+ return 0;
42240+}
42241+
42242+/* insert new @entry into item */
42243+static int expand(const coord_t * coord /* coord of item */ ,
42244+ struct cde_entry * entry /* entry to insert */ ,
42245+ int len /* length of @entry data */ ,
42246+ int *pos /* position to insert */ ,
42247+ reiser4_dir_entry_desc * dir_entry /* parameters for new
42248+ * entry */ )
42249+{
42250+ cmp_t cmp_res;
42251+ int datasize;
42252+
42253+ *pos = find(coord, &dir_entry->key, &cmp_res);
42254+ if (*pos < 0)
42255+ *pos = units(coord);
42256+
42257+ datasize = sizeof(directory_entry_format);
42258+ if (is_longname(entry->name->name, entry->name->len))
42259+ datasize += entry->name->len + 1;
42260+
42261+ expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
42262+ datasize);
42263+ return 0;
42264+}
42265+
42266+/* paste body of @entry into item */
42267+static int paste_entry(const coord_t * coord /* coord of item */ ,
42268+ struct cde_entry * entry /* new entry */ ,
42269+ int pos /* position to insert */ ,
42270+ reiser4_dir_entry_desc * dir_entry /* parameters for
42271+ * new entry */ )
42272+{
42273+ cde_unit_header *header;
42274+ directory_entry_format *dent;
42275+ const char *name;
42276+ int len;
42277+
42278+ header = header_at(coord, pos);
42279+ dent = entry_at(coord, pos);
42280+
42281+ build_de_id_by_key(&dir_entry->key, &header->hash);
42282+ build_inode_key_id(entry->obj, &dent->id);
42283+	/* AUDIT unsafe strcpy() operation! It should be replaced with
42284+	   the much less CPU-hungry
42285+	   memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len );
42286+
42287+	   A more important issue is that there should be a way to figure out
42288+	   the amount of space in dent -> name and to check that we are
42289+	   not going to overwrite more than we are supposed to */
42290+ name = entry->name->name;
42291+ len = entry->name->len;
42292+ if (is_longname(name, len)) {
42293+ strcpy((unsigned char *)dent->name, name);
42294+ put_unaligned(0, &dent->name[len]);
42295+ }
42296+ return 0;
42297+}
42298+
42299+/* estimate how much space is necessary in item to insert/paste set of entries
42300+ described in @data. */
42301+int estimate_cde(const coord_t * coord /* coord of item */ ,
42302+ const reiser4_item_data * data /* parameters for new item */ )
42303+{
42304+ struct cde_entry_data *e;
42305+ int result;
42306+ int i;
42307+
42308+ e = (struct cde_entry_data *) data->data;
42309+
42310+ assert("nikita-1288", e != NULL);
42311+ assert("nikita-1289", e->num_of_entries >= 0);
42312+
42313+ if (coord == NULL)
42314+ /* insert */
42315+ result = sizeof(cde_item_format);
42316+ else
42317+ /* paste */
42318+ result = 0;
42319+
42320+ result += e->num_of_entries *
42321+ (sizeof(cde_unit_header) + sizeof(directory_entry_format));
42322+ for (i = 0; i < e->num_of_entries; ++i) {
42323+ const char *name;
42324+ int len;
42325+
42326+ name = e->entry[i].name->name;
42327+ len = e->entry[i].name->len;
42328+ assert("nikita-2054", strlen(name) == len);
42329+ if (is_longname(name, len))
42330+ result += len + 1;
42331+ }
42332+ ((reiser4_item_data *) data)->length = result;
42333+ return result;
42334+}
42335+
42336+/* ->nr_units() method for this item plugin. */
42337+pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
42338+{
42339+ return units(coord);
42340+}
42341+
42342+/* ->unit_key() method for this item plugin. */
42343+reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
42344+ reiser4_key * key /* resulting key */ )
42345+{
42346+ assert("nikita-1452", coord != NULL);
42347+ assert("nikita-1345", idx_of(coord) < units(coord));
42348+ assert("nikita-1346", key != NULL);
42349+
42350+ item_key_by_coord(coord, key);
42351+ extract_key_from_de_id(extract_dir_id_from_key(key),
42352+ &header_at(coord, idx_of(coord))->hash, key);
42353+ return key;
42354+}
42355+
42356+/* mergeable_cde(): implementation of ->mergeable() item method.
42357+
42358+   Two directory items are mergeable iff they are from the same
42359+   directory. It's that simple.
42360+
42361+*/
42362+int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
42363+ const coord_t * p2 /* coord of second item */ )
42364+{
42365+ reiser4_key k1;
42366+ reiser4_key k2;
42367+
42368+ assert("nikita-1339", p1 != NULL);
42369+ assert("nikita-1340", p2 != NULL);
42370+
42371+ return
42372+ (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
42373+ (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
42374+ extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
42375+
42376+}
42377+
42378+/* ->max_key_inside() method for this item plugin. */
42379+reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
42380+ reiser4_key * result /* resulting key */ )
42381+{
42382+ assert("nikita-1342", coord != NULL);
42383+
42384+ item_key_by_coord(coord, result);
42385+ set_key_ordering(result, get_key_ordering(reiser4_max_key()));
42386+ set_key_fulloid(result, get_key_fulloid(reiser4_max_key()));
42387+ set_key_offset(result, get_key_offset(reiser4_max_key()));
42388+ return result;
42389+}
42390+
42391+/* @data contains data which are to be put into tree */
42392+int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
42393+ const reiser4_key * key /* key to check */ ,
42394+ const reiser4_item_data * data /* parameters of new
42395+ * item/unit being
42396+ * created */ )
42397+{
42398+ reiser4_key item_key;
42399+
42400+ /* FIXME-VS: do not rely on anything but iplug field of @data. Only
42401+ data->iplug is initialized */
42402+ assert("vs-457", data && data->iplug);
42403+/* assert( "vs-553", data -> user == 0 );*/
42404+ item_key_by_coord(coord, &item_key);
42405+
42406+ return (item_plugin_by_coord(coord) == data->iplug) &&
42407+ (extract_dir_id_from_key(&item_key) ==
42408+ extract_dir_id_from_key(key));
42409+}
42410+
42411+#if REISER4_DEBUG
42412+/* cde_check ->check() method for compressed directory items
42413+
42414+   used for debugging; every item should implement here the most complete
42415+   consistency check of the item that its author can
42416+   construct
42417+*/
42418+int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
42419+ const char **error /* where to store error message */)
42420+{
42421+ int i;
42422+ int result;
42423+ char *item_start;
42424+ char *item_end;
42425+ reiser4_key key;
42426+
42427+ coord_t c;
42428+
42429+ assert("nikita-1357", coord != NULL);
42430+ assert("nikita-1358", error != NULL);
42431+
42432+ if (!ergo(coord->item_pos != 0,
42433+ is_dot_key(item_key_by_coord(coord, &key)))) {
42434+ *error = "CDE doesn't start with dot";
42435+ return -1;
42436+ }
42437+ item_start = item_body_by_coord(coord);
42438+ item_end = item_start + item_length_by_coord(coord);
42439+
42440+ coord_dup(&c, coord);
42441+ result = 0;
42442+ for (i = 0; i < units(coord); ++i) {
42443+ directory_entry_format *entry;
42444+
42445+ if ((char *)(header_at(coord, i) + 1) >
42446+ item_end - units(coord) * sizeof *entry) {
42447+ *error = "CDE header is out of bounds";
42448+ result = -1;
42449+ break;
42450+ }
42451+ entry = entry_at(coord, i);
42452+ if ((char *)entry < item_start + sizeof(cde_item_format)) {
42453+ *error = "CDE header is too low";
42454+ result = -1;
42455+ break;
42456+ }
42457+ if ((char *)(entry + 1) > item_end) {
42458+ *error = "CDE header is too high";
42459+ result = -1;
42460+ break;
42461+ }
42462+ }
42463+
42464+ return result;
42465+}
42466+#endif
42467+
42468+/* ->init() method for this item plugin. */
42469+int init_cde(coord_t * coord /* coord of item */ ,
42470+ coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */
42471+ UNUSED_ARG)
42472+{
42473+ put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
42474+ return 0;
42475+}
42476+
42477+/* ->lookup() method for this item plugin. */
42478+lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
42479+ lookup_bias bias /* search bias */ ,
42480+ coord_t * coord /* coord of item to lookup in */ )
42481+{
42482+ cmp_t last_comp;
42483+ int pos;
42484+
42485+ reiser4_key utmost_key;
42486+
42487+ assert("nikita-1293", coord != NULL);
42488+ assert("nikita-1294", key != NULL);
42489+
42490+ CHECKME(coord);
42491+
42492+ if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
42493+ coord->unit_pos = 0;
42494+ coord->between = BEFORE_UNIT;
42495+ return CBK_COORD_NOTFOUND;
42496+ }
42497+ pos = find(coord, key, &last_comp);
42498+ if (pos >= 0) {
42499+ coord->unit_pos = (int)pos;
42500+ switch (last_comp) {
42501+ case EQUAL_TO:
42502+ coord->between = AT_UNIT;
42503+ return CBK_COORD_FOUND;
42504+ case GREATER_THAN:
42505+ coord->between = BEFORE_UNIT;
42506+ return RETERR(-ENOENT);
42507+ case LESS_THAN:
42508+ default:
42509+ impossible("nikita-1298", "Broken find");
42510+ return RETERR(-EIO);
42511+ }
42512+ } else {
42513+ coord->unit_pos = units(coord) - 1;
42514+ coord->between = AFTER_UNIT;
42515+ return (bias ==
42516+ FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
42517+ CBK_COORD_NOTFOUND;
42518+ }
42519+}
42520+
42521+/* ->paste() method for this item plugin. */
42522+int paste_cde(coord_t * coord /* coord of item */ ,
42523+ reiser4_item_data * data /* parameters of new unit being
42524+ * inserted */ ,
42525+ carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
42526+{
42527+ struct cde_entry_data *e;
42528+ int result;
42529+ int i;
42530+
42531+ CHECKME(coord);
42532+ e = (struct cde_entry_data *) data->data;
42533+
42534+ result = 0;
42535+ for (i = 0; i < e->num_of_entries; ++i) {
42536+ int pos;
42537+ int phantom_size;
42538+
42539+ phantom_size = data->length;
42540+ if (units(coord) == 0)
42541+ phantom_size -= sizeof(cde_item_format);
42542+
42543+ result =
42544+ expand(coord, e->entry + i, phantom_size, &pos, data->arg);
42545+ if (result != 0)
42546+ break;
42547+ result = paste_entry(coord, e->entry + i, pos, data->arg);
42548+ if (result != 0)
42549+ break;
42550+ }
42551+ CHECKME(coord);
42552+ return result;
42553+}
42554+
42555+/* amount of space occupied by all entries starting from @idx, both headers
42556+   and bodies. */
42557+static unsigned int part_size(const coord_t * coord /* coord of item */ ,
42558+ int idx /* index of unit */ )
42559+{
42560+ assert("nikita-1299", coord != NULL);
42561+ assert("nikita-1300", idx < (int)units(coord));
42562+
42563+ return sizeof(cde_item_format) +
42564+ (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
42565+ idx + 1) -
42566+ offset_of(coord, 0);
42567+}
42568+
42569+/* how many units of @source (but not more than @want) can be merged with
42570+   the item in the @target node. If pend == append, we try to append the last
42571+   item of @target with the first units of @source. If pend == prepend, we try
42572+   to "prepend" the first item in @target with the last units of @source. The
42573+   @target node has @free_space bytes of free space. The total size of those
42574+   units is returned via @size */
42575+int can_shift_cde(unsigned free_space /* free space in item */ ,
42576+ coord_t * coord /* coord of source item */ ,
42577+ znode * target /* target node */ ,
42578+ shift_direction pend /* shift direction */ ,
42579+ unsigned *size /* resulting number of shifted bytes */ ,
42580+ unsigned want /* maximal number of bytes to shift */ )
42581+{
42582+ int shift;
42583+
42584+ CHECKME(coord);
42585+ if (want == 0) {
42586+ *size = 0;
42587+ return 0;
42588+ }
42589+
42590+ /* pend == SHIFT_LEFT <==> shifting to the left */
42591+ if (pend == SHIFT_LEFT) {
42592+ for (shift = min((int)want - 1, units(coord)); shift >= 0;
42593+ --shift) {
42594+ *size = part_size(coord, shift);
42595+ if (target != NULL)
42596+ *size -= sizeof(cde_item_format);
42597+ if (*size <= free_space)
42598+ break;
42599+ }
42600+ shift = shift + 1;
42601+ } else {
42602+ int total_size;
42603+
42604+ assert("nikita-1301", pend == SHIFT_RIGHT);
42605+
42606+ total_size = item_length_by_coord(coord);
42607+ for (shift = units(coord) - want - 1; shift < units(coord) - 1;
42608+ ++shift) {
42609+ *size = total_size - part_size(coord, shift);
42610+ if (target == NULL)
42611+ *size += sizeof(cde_item_format);
42612+ if (*size <= free_space)
42613+ break;
42614+ }
42615+ shift = units(coord) - shift - 1;
42616+ }
42617+ if (shift == 0)
42618+ *size = 0;
42619+ CHECKME(coord);
42620+ return shift;
42621+}
42622+
42623+/* ->copy_units() method for this item plugin. */
42624+void copy_units_cde(coord_t * target /* coord of target item */ ,
42625+ coord_t * source /* coord of source item */ ,
42626+ unsigned from /* starting unit */ ,
42627+ unsigned count /* how many units to copy */ ,
42628+ shift_direction where_is_free_space /* shift direction */ ,
42629+ unsigned free_space /* free space in item */ )
42630+{
42631+ char *header_from;
42632+ char *header_to;
42633+
42634+ char *entry_from;
42635+ char *entry_to;
42636+
42637+ int pos_in_target;
42638+ int data_size;
42639+ int data_delta;
42640+ int i;
42641+
42642+ assert("nikita-1303", target != NULL);
42643+ assert("nikita-1304", source != NULL);
42644+ assert("nikita-1305", (int)from < units(source));
42645+ assert("nikita-1307", (int)(from + count) <= units(source));
42646+
42647+ if (where_is_free_space == SHIFT_LEFT) {
42648+ assert("nikita-1453", from == 0);
42649+ pos_in_target = units(target);
42650+ } else {
42651+ assert("nikita-1309", (int)(from + count) == units(source));
42652+ pos_in_target = 0;
42653+ memmove(item_body_by_coord(target),
42654+ (char *)item_body_by_coord(target) + free_space,
42655+ item_length_by_coord(target) - free_space);
42656+ }
42657+
42658+ CHECKME(target);
42659+ CHECKME(source);
42660+
42661+ /* expand @target */
42662+ data_size =
42663+ offset_of(source, (int)(from + count)) - offset_of(source,
42664+ (int)from);
42665+
42666+ if (units(target) == 0)
42667+ free_space -= sizeof(cde_item_format);
42668+
42669+ expand_item(target, pos_in_target, (int)count,
42670+ (int)(item_length_by_coord(target) - free_space),
42671+ (unsigned)data_size);
42672+
42673+ /* copy first @count units of @source into @target */
42674+ data_delta =
42675+ offset_of(target, pos_in_target) - offset_of(source, (int)from);
42676+
42677+ /* copy entries */
42678+ entry_from = (char *)entry_at(source, (int)from);
42679+ entry_to = (char *)entry_at(source, (int)(from + count));
42680+ memmove(entry_at(target, pos_in_target), entry_from,
42681+ (unsigned)(entry_to - entry_from));
42682+
42683+ /* copy headers */
42684+ header_from = (char *)header_at(source, (int)from);
42685+ header_to = (char *)header_at(source, (int)(from + count));
42686+ memmove(header_at(target, pos_in_target), header_from,
42687+ (unsigned)(header_to - header_from));
42688+
42689+ /* update offsets */
42690+ for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
42691+ adj_offset(target, i, data_delta);
42692+ CHECKME(target);
42693+ CHECKME(source);
42694+}
42695+
42696+/* ->cut_units() method for this item plugin. */
42697+int cut_units_cde(coord_t * coord /* coord of item */ ,
42698+ pos_in_node_t from /* start unit pos */ ,
42699+ pos_in_node_t to /* stop unit pos */ ,
42700+ struct carry_cut_data *cdata UNUSED_ARG,
42701+ reiser4_key * smallest_removed, reiser4_key * new_first)
42702+{
42703+ char *header_from;
42704+ char *header_to;
42705+
42706+ char *entry_from;
42707+ char *entry_to;
42708+
42709+ int size;
42710+ int entry_delta;
42711+ int header_delta;
42712+ int i;
42713+
42714+ unsigned count;
42715+
42716+ CHECKME(coord);
42717+
42718+ count = to - from + 1;
42719+
42720+ assert("nikita-1454", coord != NULL);
42721+ assert("nikita-1455", (int)(from + count) <= units(coord));
42722+
42723+ if (smallest_removed)
42724+ unit_key_by_coord(coord, smallest_removed);
42725+
42726+ if (new_first) {
42727+ coord_t next;
42728+
42729+ /* not everything is cut from item head */
42730+ assert("vs-1527", from == 0);
42731+ assert("vs-1528", to < units(coord) - 1);
42732+
42733+ coord_dup(&next, coord);
42734+ next.unit_pos++;
42735+ unit_key_by_coord(&next, new_first);
42736+ }
42737+
42738+ size = item_length_by_coord(coord);
42739+ if (count == (unsigned)units(coord)) {
42740+ return size;
42741+ }
42742+
42743+ header_from = (char *)header_at(coord, (int)from);
42744+ header_to = (char *)header_at(coord, (int)(from + count));
42745+
42746+ entry_from = (char *)entry_at(coord, (int)from);
42747+ entry_to = (char *)entry_at(coord, (int)(from + count));
42748+
42749+ /* move headers */
42750+ memmove(header_from, header_to,
42751+ (unsigned)(address(coord, size) - header_to));
42752+
42753+ header_delta = header_to - header_from;
42754+
42755+ entry_from -= header_delta;
42756+ entry_to -= header_delta;
42757+ size -= header_delta;
42758+
42759+ /* copy entries */
42760+ memmove(entry_from, entry_to,
42761+ (unsigned)(address(coord, size) - entry_to));
42762+
42763+ entry_delta = entry_to - entry_from;
42764+ size -= entry_delta;
42765+
42766+ /* update offsets */
42767+
42768+ for (i = 0; i < (int)from; ++i)
42769+ adj_offset(coord, i, -header_delta);
42770+
42771+ for (i = from; i < units(coord) - (int)count; ++i)
42772+ adj_offset(coord, i, -header_delta - entry_delta);
42773+
42774+ put_unaligned(cpu_to_le16((__u16) units(coord) - count),
42775+ &formatted_at(coord)->num_of_entries);
42776+
42777+ if (from == 0) {
42778+		/* entries were removed from the head - move the rest to the right */
42779+ memmove((char *)item_body_by_coord(coord) +
42780+ header_delta + entry_delta, item_body_by_coord(coord),
42781+ (unsigned)size);
42782+ if (REISER4_DEBUG)
42783+ memset(item_body_by_coord(coord), 0,
42784+ (unsigned)header_delta + entry_delta);
42785+ } else {
42786+ /* freed space is already at the end of item */
42787+ if (REISER4_DEBUG)
42788+ memset((char *)item_body_by_coord(coord) + size, 0,
42789+ (unsigned)header_delta + entry_delta);
42790+ }
42791+
42792+ return header_delta + entry_delta;
42793+}
42794+
42795+int kill_units_cde(coord_t * coord /* coord of item */ ,
42796+ pos_in_node_t from /* start unit pos */ ,
42797+ pos_in_node_t to /* stop unit pos */ ,
42798+ struct carry_kill_data *kdata UNUSED_ARG,
42799+ reiser4_key * smallest_removed, reiser4_key * new_first)
42800+{
42801+ return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
42802+}
42803+
42804+/* ->s.dir.extract_key() method for this item plugin. */
42805+int extract_key_cde(const coord_t * coord /* coord of item */ ,
42806+ reiser4_key * key /* resulting key */ )
42807+{
42808+ directory_entry_format *dent;
42809+
42810+ assert("nikita-1155", coord != NULL);
42811+ assert("nikita-1156", key != NULL);
42812+
42813+ dent = entry_at(coord, idx_of(coord));
42814+ return extract_key_from_id(&dent->id, key);
42815+}
42816+
42817+int
42818+update_key_cde(const coord_t * coord, const reiser4_key * key,
42819+ lock_handle * lh UNUSED_ARG)
42820+{
42821+ directory_entry_format *dent;
42822+ obj_key_id obj_id;
42823+ int result;
42824+
42825+ assert("nikita-2344", coord != NULL);
42826+ assert("nikita-2345", key != NULL);
42827+
42828+ dent = entry_at(coord, idx_of(coord));
42829+ result = build_obj_key_id(key, &obj_id);
42830+ if (result == 0) {
42831+ dent->id = obj_id;
42832+ znode_make_dirty(coord->node);
42833+ }
42834+ return 0;
42835+}
42836+
42837+/* ->s.dir.extract_name() method for this item plugin. */
42838+char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
42839+{
42840+ directory_entry_format *dent;
42841+
42842+ assert("nikita-1157", coord != NULL);
42843+
42844+ dent = entry_at(coord, idx_of(coord));
42845+ return extract_dent_name(coord, dent, buf);
42846+}
42847+
42848+static int cde_bytes(int pasting, const reiser4_item_data * data)
42849+{
42850+ int result;
42851+
42852+ result = data->length;
42853+ if (!pasting)
42854+ result -= sizeof(cde_item_format);
42855+ return result;
42856+}
42857+
42858+/* ->s.dir.add_entry() method for this item plugin */
42859+int add_entry_cde(struct inode *dir /* directory object */ ,
42860+ coord_t * coord /* coord of item */ ,
42861+ lock_handle * lh /* lock handle for insertion */ ,
42862+ const struct dentry *name /* name to insert */ ,
42863+ reiser4_dir_entry_desc * dir_entry /* parameters of new
42864+ * directory entry */ )
42865+{
42866+ reiser4_item_data data;
42867+ struct cde_entry entry;
42868+ struct cde_entry_data edata;
42869+ int result;
42870+
42871+ assert("nikita-1656", coord->node == lh->node);
42872+ assert("nikita-1657", znode_is_write_locked(coord->node));
42873+
42874+ edata.num_of_entries = 1;
42875+ edata.entry = &entry;
42876+
42877+ entry.dir = dir;
42878+ entry.obj = dir_entry->obj;
42879+ entry.name = &name->d_name;
42880+
42881+ data.data = (char *)&edata;
42882+ data.user = 0; /* &edata is not user space */
42883+ data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
42884+ data.arg = dir_entry;
42885+ assert("nikita-1302", data.iplug != NULL);
42886+
42887+ result = is_dot_key(&dir_entry->key);
42888+ data.length = estimate_cde(result ? coord : NULL, &data);
42889+
42890+ /* NOTE-NIKITA quota plugin? */
42891+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
42892+ return RETERR(-EDQUOT);
42893+
42894+ if (result)
42895+ result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
42896+ else
42897+ result = reiser4_resize_item(coord, &data, &dir_entry->key,
42898+ lh, 0);
42899+ return result;
42900+}
42901+
42902+/* ->s.dir.rem_entry() */
42903+int rem_entry_cde(struct inode *dir /* directory of item */ ,
42904+ const struct qstr *name, coord_t * coord /* coord of item */ ,
42905+ lock_handle * lh UNUSED_ARG /* lock handle for
42906+ * removal */ ,
42907+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
42908+ * directory entry
42909+ * being removed */ )
42910+{
42911+ coord_t shadow;
42912+ int result;
42913+ int length;
42914+ ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
42915+
42916+ assert("nikita-2870", strlen(name->name) == name->len);
42917+ assert("nikita-2869",
42918+ !strcmp(name->name, extract_name_cde(coord, buf)));
42919+
42920+ length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
42921+ if (is_longname(name->name, name->len))
42922+ length += name->len + 1;
42923+
42924+ if (inode_get_bytes(dir) < length) {
42925+		warning("nikita-2628", "Dir is broken: %llu: %llu",
42926+ (unsigned long long)get_inode_oid(dir),
42927+ inode_get_bytes(dir));
42928+
42929+ return RETERR(-EIO);
42930+ }
42931+
42932+	/* cut_node() is supposed to take pointers to _different_
42933+	   coords, because it will modify them without regard to
42934+	   possible aliasing. To work around this, create a temporary copy
42935+	   of @coord.
42936+	 */
42937+ coord_dup(&shadow, coord);
42938+ result =
42939+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
42940+ if (result == 0) {
42941+ /* NOTE-NIKITA quota plugin? */
42942+ DQUOT_FREE_SPACE_NODIRTY(dir, length);
42943+ }
42944+ return result;
42945+}
42946+
42947+/* ->s.dir.max_name_len() method for this item plugin */
42948+int max_name_len_cde(const struct inode *dir /* directory */ )
42949+{
42950+ return
42951+ reiser4_tree_by_inode(dir)->nplug->max_item_size() -
42952+ sizeof(directory_entry_format) - sizeof(cde_item_format) -
42953+ sizeof(cde_unit_header) - 2;
42954+}
42955+
42956+/* Make Linus happy.
42957+ Local variables:
42958+ c-indentation-style: "K&R"
42959+ mode-name: "LC"
42960+ c-basic-offset: 8
42961+ tab-width: 8
42962+ fill-column: 120
42963+ End:
42964+*/
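find() in cde.c above narrows by binary search only while the window is wider than REISER4_SEQ_SEARCH_BREAK, finishes with a prefetch-friendly linear scan, and on an exact hit walks back to the leftmost equal unit. The same control flow on a plain int array, as a standalone sketch (the break value 4 is arbitrary, not the kernel's constant):

	#include <stdio.h>

	#define SEQ_SEARCH_BREAK 4	/* arbitrary stand-in */

	/* Return the index of @key (leftmost match) or its insertion
	 * point; -1 if it is greater than every element. Mirrors the
	 * control flow of find() in cde.c. */
	static int toy_find(const int *a, int n, int key)
	{
		int left = 0, right = n - 1;

		while (right - left >= SEQ_SEARCH_BREAK) {
			int median = (left + right) >> 1;

			if (a[median] < key)
				left = median;
			else if (a[median] > key)
				right = median;
			else {
				/* step back to the leftmost equal element */
				while (median > 0 && a[median - 1] == key)
					median--;
				return median;
			}
		}
		for (; left < n; left++)
			if (a[left] >= key)
				return left;
		return -1;	/* the patch returns RETERR(-ENOENT) here */
	}

	int main(void)
	{
		int a[] = { 1, 3, 3, 3, 7, 9, 11, 13, 15, 17 };
		int n = sizeof a / sizeof a[0];

		printf("find 3 -> %d\n", toy_find(a, n, 3));	/* leftmost 3 */
		printf("find 8 -> %d\n", toy_find(a, n, 8));	/* insertion point */
		printf("find 99 -> %d\n", toy_find(a, n, 99));	/* not found */
		return 0;
	}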
42965diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/cde.h linux-2.6.24/fs/reiser4/plugin/item/cde.h
42966--- linux-2.6.24.orig/fs/reiser4/plugin/item/cde.h 1970-01-01 03:00:00.000000000 +0300
42967+++ linux-2.6.24/fs/reiser4/plugin/item/cde.h 2008-01-25 11:39:07.004225206 +0300
42968@@ -0,0 +1,87 @@
42969+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
42970+
42971+/* Compound directory item. See cde.c for description. */
42972+
42973+#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
42974+#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
42975+
42976+#include "../../forward.h"
42977+#include "../../kassign.h"
42978+#include "../../dformat.h"
42979+
42980+#include <linux/fs.h> /* for struct inode */
42981+#include <linux/dcache.h> /* for struct dentry, etc */
42982+
42983+typedef struct cde_unit_header {
42984+ de_id hash;
42985+ d16 offset;
42986+} cde_unit_header;
42987+
42988+typedef struct cde_item_format {
42989+ d16 num_of_entries;
42990+ cde_unit_header entry[0];
42991+} cde_item_format;
42992+
42993+struct cde_entry {
42994+ const struct inode *dir;
42995+ const struct inode *obj;
42996+ const struct qstr *name;
42997+};
42998+
42999+struct cde_entry_data {
43000+ int num_of_entries;
43001+ struct cde_entry *entry;
43002+};
43003+
43004+/* plugin->item.b.* */
43005+reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
43006+int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
43007+ const reiser4_item_data *);
43008+int mergeable_cde(const coord_t * p1, const coord_t * p2);
43009+pos_in_node_t nr_units_cde(const coord_t * coord);
43010+reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
43011+int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
43012+void print_cde(const char *prefix, coord_t * coord);
43013+int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
43014+lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
43015+ coord_t * coord);
43016+int paste_cde(coord_t * coord, reiser4_item_data * data,
43017+ carry_plugin_info * info UNUSED_ARG);
43018+int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
43019+ shift_direction pend, unsigned *size, unsigned want);
43020+void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
43021+ unsigned count, shift_direction where_is_free_space,
43022+ unsigned free_space);
43023+int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
43024+ struct carry_cut_data *, reiser4_key * smallest_removed,
43025+ reiser4_key * new_first);
43026+int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
43027+ struct carry_kill_data *, reiser4_key * smallest_removed,
43028+ reiser4_key * new_first);
43029+void print_cde(const char *prefix, coord_t * coord);
43030+int reiser4_check_cde(const coord_t * coord, const char **error);
43031+
43032+/* plugin->u.item.s.dir.* */
43033+int extract_key_cde(const coord_t * coord, reiser4_key * key);
43034+int update_key_cde(const coord_t * coord, const reiser4_key * key,
43035+ lock_handle * lh);
43036+char *extract_name_cde(const coord_t * coord, char *buf);
43037+int add_entry_cde(struct inode *dir, coord_t * coord,
43038+ lock_handle * lh, const struct dentry *name,
43039+ reiser4_dir_entry_desc * entry);
43040+int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
43041+ lock_handle * lh, reiser4_dir_entry_desc * entry);
43042+int max_name_len_cde(const struct inode *dir);
43043+
43044+/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
43045+#endif
43046+
43047+/* Make Linus happy.
43048+ Local variables:
43049+ c-indentation-style: "K&R"
43050+ mode-name: "LC"
43051+ c-basic-offset: 8
43052+ tab-width: 8
43053+ fill-column: 120
43054+ End:
43055+*/
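estimate_cde() charges the item header once on insert, then one unit header and one entry body per name, and only long names additionally pay len+1 bytes, since short names are encoded in the key. A recomputation of that cost model as a userspace sketch; the struct sizes and the 15-character cut-off are assumptions, not the real build values:

	#include <stdio.h>
	#include <string.h>

	#define SIZEOF_CDE_ITEM_FORMAT  2	/* d16 num_of_entries */
	#define SIZEOF_CDE_UNIT_HEADER 10	/* de_id + d16 offset (assumed) */
	#define SIZEOF_DIRENT_FORMAT   16	/* obj_key_id (assumed) */

	/* Stand-in for is_longname(): short names cost no body bytes;
	 * the cut-off here is illustrative only. */
	static int toy_is_longname(const char *name)
	{
		return strlen(name) > 15;
	}

	/* Mirrors estimate_cde(): item header once on insert, a unit
	 * header plus an entry body per name, len+1 for long names. */
	static int toy_estimate(int inserting, const char **names, int n)
	{
		int i, size = inserting ? SIZEOF_CDE_ITEM_FORMAT : 0;

		size += n * (SIZEOF_CDE_UNIT_HEADER + SIZEOF_DIRENT_FORMAT);
		for (i = 0; i < n; i++)
			if (toy_is_longname(names[i]))
				size += (int)strlen(names[i]) + 1;
		return size;
	}

	int main(void)
	{
		const char *names[] = { "short", "a-name-over-the-long-cutoff" };

		printf("insert: %d bytes\n", toy_estimate(1, names, 2));
		printf("paste:  %d bytes\n", toy_estimate(0, names, 2));
		return 0;
	}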
43056diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/ctail.c linux-2.6.24/fs/reiser4/plugin/item/ctail.c
43057--- linux-2.6.24.orig/fs/reiser4/plugin/item/ctail.c 1970-01-01 03:00:00.000000000 +0300
43058+++ linux-2.6.24/fs/reiser4/plugin/item/ctail.c 2008-01-25 11:39:07.008226236 +0300
43059@@ -0,0 +1,1613 @@
43060+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
43061+
43062+/* ctails (aka "clustered tails") are items for cryptcompress objects */
43063+
43064+/* DESCRIPTION:
43065+
43066+Each cryptcompress object is stored on disk as a set of clusters sliced
43067+into ctails.
43068+
43069+Internal on-disk structure:
43070+
43071+ HEADER (1) The disk cluster shift is stored here
43072+ BODY
43073+*/
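Since the one-byte header stores a shift, the cluster size and the cluster index of a byte offset are single shift operations, exactly what disk_cluster_size() and clust_by_coord() below compute. A two-operation demonstration (the shift and offset values are arbitrary):

	#include <stdio.h>

	int main(void)
	{
		int shift = 4;			/* example cluster shift */
		long long csize = 1LL << shift;	/* cf. disk_cluster_size() */
		long long off = 70000;		/* arbitrary byte offset */

		/* cf. clust_by_coord() and the boundary test in
		 * is_disk_cluster_key() */
		printf("cluster size %lld, offset %lld -> cluster %lld, in-cluster %lld\n",
		       csize, off, off >> shift, off & (csize - 1));
		return 0;
	}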
43074+
43075+#include "../../forward.h"
43076+#include "../../debug.h"
43077+#include "../../dformat.h"
43078+#include "../../kassign.h"
43079+#include "../../key.h"
43080+#include "../../coord.h"
43081+#include "item.h"
43082+#include "../node/node.h"
43083+#include "../plugin.h"
43084+#include "../object.h"
43085+#include "../../znode.h"
43086+#include "../../carry.h"
43087+#include "../../tree.h"
43088+#include "../../inode.h"
43089+#include "../../super.h"
43090+#include "../../context.h"
43091+#include "../../page_cache.h"
43092+#include "../cluster.h"
43093+#include "../../flush.h"
43094+#include "../../tree_walk.h"
43095+
43096+#include <linux/pagevec.h>
43097+#include <linux/swap.h>
43098+#include <linux/fs.h>
43099+
43100+/* return body of ctail item at @coord */
43101+static ctail_item_format *ctail_formatted_at(const coord_t * coord)
43102+{
43103+ assert("edward-60", coord != NULL);
43104+ return item_body_by_coord(coord);
43105+}
43106+
43107+static int cluster_shift_by_coord(const coord_t * coord)
43108+{
43109+ return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
43110+}
43111+
43112+static inline void dclust_set_extension_shift(hint_t * hint)
43113+{
43114+ assert("edward-1270",
43115+ item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
43116+ hint->ext_coord.extension.ctail.shift =
43117+ cluster_shift_by_coord(&hint->ext_coord.coord);
43118+}
43119+
43120+static loff_t off_by_coord(const coord_t * coord)
43121+{
43122+ reiser4_key key;
43123+ return get_key_offset(item_key_by_coord(coord, &key));
43124+}
43125+
43126+int coord_is_unprepped_ctail(const coord_t * coord)
43127+{
43128+ assert("edward-1233", coord != NULL);
43129+ assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
43130+ assert("edward-1235",
43131+ ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
43132+ nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
43133+
43134+ return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
43135+}
43136+
43137+static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
43138+{
43139+ int shift;
43140+
43141+ if (inode != NULL) {
43142+ shift = inode_cluster_shift(inode);
43143+ assert("edward-1236",
43144+ ergo(!coord_is_unprepped_ctail(coord),
43145+ shift == cluster_shift_by_coord(coord)));
43146+ } else {
43147+ assert("edward-1237", !coord_is_unprepped_ctail(coord));
43148+ shift = cluster_shift_by_coord(coord);
43149+ }
43150+ return off_by_coord(coord) >> shift;
43151+}
43152+
43153+static int disk_cluster_size(const coord_t * coord)
43154+{
43155+ assert("edward-1156",
43156+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
43157+	/* calculation of the disk cluster size
43158+	   is meaningless if the ctail is unprepped */
43159+ assert("edward-1238", !coord_is_unprepped_ctail(coord));
43160+
43161+ return 1 << cluster_shift_by_coord(coord);
43162+}
43163+
43164+/* true if the key is of first disk cluster item */
43165+static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
43166+{
43167+ assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
43168+
43169+ return coord_is_unprepped_ctail(coord) ||
43170+ ((get_key_offset(key) &
43171+ ((loff_t) disk_cluster_size(coord) - 1)) == 0);
43172+}
43173+
43174+static char *first_unit(coord_t * coord)
43175+{
43176+ /* FIXME: warning: pointer of type `void *' used in arithmetic */
43177+ return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
43178+}
43179+
43180+/* plugin->u.item.b.max_key_inside :
43181+ tail_max_key_inside */
43182+
43183+/* plugin->u.item.b.can_contain_key */
43184+int
43185+can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
43186+ const reiser4_item_data * data)
43187+{
43188+ reiser4_key item_key;
43189+
43190+ if (item_plugin_by_coord(coord) != data->iplug)
43191+ return 0;
43192+
43193+ item_key_by_coord(coord, &item_key);
43194+ if (get_key_locality(key) != get_key_locality(&item_key) ||
43195+ get_key_objectid(key) != get_key_objectid(&item_key))
43196+ return 0;
43197+ if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
43198+ get_key_offset(key))
43199+ return 0;
43200+ if (is_disk_cluster_key(key, coord))
43201+ return 0;
43202+ return 1;
43203+}
43204+
43205+/* plugin->u.item.b.mergeable */
43206+int mergeable_ctail(const coord_t * p1, const coord_t * p2)
43207+{
43208+ reiser4_key key1, key2;
43209+
43210+ assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
43211+ assert("edward-61", plugin_of_group(item_plugin_by_coord(p1),
43212+ UNIX_FILE_METADATA_ITEM_TYPE));
43213+
43214+ if (item_id_by_coord(p2) != CTAIL_ID) {
43215+ /* second item is of another type */
43216+ return 0;
43217+ }
43218+
43219+ item_key_by_coord(p1, &key1);
43220+ item_key_by_coord(p2, &key2);
43221+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
43222+ get_key_objectid(&key1) != get_key_objectid(&key2) ||
43223+ get_key_type(&key1) != get_key_type(&key2)) {
43224+ /* items of different objects */
43225+ return 0;
43226+ }
43227+ if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
43228+ /* not adjacent items */
43229+ return 0;
43230+ if (is_disk_cluster_key(&key2, p2))
43231+ return 0;
43232+ return 1;
43233+}
43234+
43235+/* plugin->u.item.b.nr_units */
43236+pos_in_node_t nr_units_ctail(const coord_t * coord)
43237+{
43238+ return (item_length_by_coord(coord) -
43239+ sizeof(ctail_formatted_at(coord)->cluster_shift));
43240+}
43241+
43242+/* plugin->u.item.b.estimate:
43243+ estimate how much space is needed to insert/paste @data->length bytes
43244+ into ctail at @coord */
43245+int estimate_ctail(const coord_t * coord /* coord of item */ ,
43246+ const reiser4_item_data *
43247+ data /* parameters for new item */ )
43248+{
43249+ if (coord == NULL)
43250+ /* insert */
43251+ return (sizeof(ctail_item_format) + data->length);
43252+ else
43253+ /* paste */
43254+ return data->length;
43255+}
43256+
43257+/* ->init() method for this item plugin. */
43258+int init_ctail(coord_t * to /* coord of item */ ,
43259+ coord_t * from /* old_item */ ,
43260+ reiser4_item_data * data /* structure used for insertion */ )
43261+{
43262+ int cluster_shift; /* cpu value to convert */
43263+
43264+ if (data) {
43265+ assert("edward-463", data->length > sizeof(ctail_item_format));
43266+ cluster_shift = *((int *)(data->arg));
43267+ data->length -= sizeof(ctail_item_format);
43268+ } else {
43269+ assert("edward-464", from != NULL);
43270+ assert("edward-855", ctail_ok(from));
43271+ cluster_shift = (int)(cluster_shift_by_coord(from));
43272+ }
43273+ put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
43274+ assert("edward-856", ctail_ok(to));
43275+ return 0;
43276+}
43277+
43278+/* plugin->u.item.b.lookup:
43279+ NULL: We are looking for item keys only */
43280+
43281+#if REISER4_DEBUG
43282+int ctail_ok(const coord_t * coord)
43283+{
43284+ return coord_is_unprepped_ctail(coord) ||
43285+ cluster_shift_ok(cluster_shift_by_coord(coord));
43286+}
43287+
43288+/* plugin->u.item.b.check */
43289+int check_ctail(const coord_t * coord, const char **error)
43290+{
43291+ if (!ctail_ok(coord)) {
43292+ if (error)
43293+ *error = "bad cluster shift in ctail";
43294+ return 1;
43295+ }
43296+ return 0;
43297+}
43298+#endif
43299+
43300+/* plugin->u.item.b.paste */
43301+int
43302+paste_ctail(coord_t * coord, reiser4_item_data * data,
43303+ carry_plugin_info * info UNUSED_ARG)
43304+{
43305+ unsigned old_nr_units;
43306+
43307+ assert("edward-268", data->data != NULL);
43308+ /* copy only from kernel space */
43309+ assert("edward-66", data->user == 0);
43310+
43311+ old_nr_units =
43312+ item_length_by_coord(coord) - sizeof(ctail_item_format) -
43313+ data->length;
43314+
43315+ /* ctail items never get pasted in the middle */
43316+
43317+ if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
43318+
43319+		/* paste at the beginning when creating a new item */
43320+ assert("edward-450",
43321+ item_length_by_coord(coord) ==
43322+ data->length + sizeof(ctail_item_format));
43323+ assert("edward-451", old_nr_units == 0);
43324+ } else if (coord->unit_pos == old_nr_units - 1
43325+ && coord->between == AFTER_UNIT) {
43326+
43327+ /* paste at the end */
43328+ coord->unit_pos++;
43329+ } else
43330+ impossible("edward-453", "bad paste position");
43331+
43332+ memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
43333+
43334+ assert("edward-857", ctail_ok(coord));
43335+
43336+ return 0;
43337+}
43338+
43339+/* plugin->u.item.b.fast_paste */
43340+
43341+/* plugin->u.item.b.can_shift
43342+ number of units is returned via return value, number of bytes via @size. For
43343+ ctail items they coincide */
43344+int
43345+can_shift_ctail(unsigned free_space, coord_t * source,
43346+ znode * target, shift_direction direction UNUSED_ARG,
43347+ unsigned *size /* number of bytes */ , unsigned want)
43348+{
43349+	/* make sure that we do not want to shift more than we have */
43350+ assert("edward-68", want > 0 && want <= nr_units_ctail(source));
43351+
43352+ *size = min(want, free_space);
43353+
43354+ if (!target) {
43355+ /* new item will be created */
43356+ if (*size <= sizeof(ctail_item_format)) {
43357+ *size = 0;
43358+ return 0;
43359+ }
43360+ return *size - sizeof(ctail_item_format);
43361+ }
43362+ return *size;
43363+}
43364+
43365+/* plugin->u.item.b.copy_units
43366+ cooperates with ->can_shift() */
43367+void
43368+copy_units_ctail(coord_t * target, coord_t * source,
43369+ unsigned from, unsigned count /* units */ ,
43370+ shift_direction where_is_free_space,
43371+ unsigned free_space /* bytes */ )
43372+{
43373+ /* make sure that item @target is expanded already */
43374+ assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
43375+ assert("edward-70", free_space == count || free_space == count + 1);
43376+
43377+ assert("edward-858", ctail_ok(source));
43378+
43379+ if (where_is_free_space == SHIFT_LEFT) {
43380+		/* append the first @count bytes of @source to item @target:
43381+		   this restriction came from ordinary tails */
43382+ assert("edward-71", from == 0);
43383+ assert("edward-860", ctail_ok(target));
43384+
43385+ memcpy(first_unit(target) + nr_units_ctail(target) - count,
43386+ first_unit(source), count);
43387+ } else {
43388+		/* the target item has already been moved to the right */
43389+ reiser4_key key;
43390+
43391+ assert("edward-72", nr_units_ctail(source) == from + count);
43392+
43393+ if (free_space == count) {
43394+ init_ctail(target, source, NULL);
43395+ } else {
43396+ /* new item has been created */
43397+ assert("edward-862", ctail_ok(target));
43398+ }
43399+ memcpy(first_unit(target), first_unit(source) + from, count);
43400+
43401+ assert("edward-863", ctail_ok(target));
43402+
43403+ /* new units are inserted before first unit in an item,
43404+ therefore, we have to update item key */
43405+ item_key_by_coord(source, &key);
43406+ set_key_offset(&key, get_key_offset(&key) + from);
43407+
43408+ node_plugin_by_node(target->node)->update_item_key(target, &key,
43409+ NULL /*info */);
43410+ }
43411+}
43412+
43413+/* plugin->u.item.b.create_hook */
43414+int create_hook_ctail(const coord_t * coord, void *arg)
43415+{
43416+ assert("edward-864", znode_is_loaded(coord->node));
43417+
43418+ znode_set_convertible(coord->node);
43419+ return 0;
43420+}
43421+
43422+/* plugin->u.item.b.kill_hook */
43423+int kill_hook_ctail(const coord_t * coord, pos_in_node_t from,
43424+ pos_in_node_t count, carry_kill_data * kdata)
43425+{
43426+ struct inode *inode;
43427+
43428+ assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
43429+ assert("edward-291", znode_is_write_locked(coord->node));
43430+
43431+ inode = kdata->inode;
43432+ if (inode) {
43433+ reiser4_key key;
43434+ struct cryptcompress_info * info;
43435+ cloff_t index;
43436+
43437+ item_key_by_coord(coord, &key);
43438+ info = cryptcompress_inode_data(inode);
43439+ index = off_to_clust(get_key_offset(&key), inode);
43440+
43441+ if (from == 0) {
43442+ info->trunc_index = index;
43443+ if (is_disk_cluster_key(&key, coord)) {
43444+ /*
43445+ * first item of disk cluster is to be killed
43446+ */
43447+ truncate_complete_page_cluster(
43448+ inode, index, kdata->params.truncate);
43449+ inode_sub_bytes(inode,
43450+ inode_cluster_size(inode));
43451+ }
43452+ }
43453+ }
43454+ return 0;
43455+}
43456+
43457+/* for shift_hook_ctail(),
43458+   return true if the first disk cluster item has a dirty child
43459+*/
43460+static int ctail_convertible(const coord_t * coord)
43461+{
43462+ int result;
43463+ reiser4_key key;
43464+ jnode *child = NULL;
43465+
43466+ assert("edward-477", coord != NULL);
43467+ assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
43468+
43469+ if (coord_is_unprepped_ctail(coord))
43470+ /* unprepped ctail should be converted */
43471+ return 1;
43472+
43473+ item_key_by_coord(coord, &key);
43474+ child = jlookup(current_tree,
43475+ get_key_objectid(&key),
43476+ off_to_pg(off_by_coord(coord)));
43477+ if (!child)
43478+ return 0;
43479+ result = JF_ISSET(child, JNODE_DIRTY);
43480+ jput(child);
43481+ return result;
43482+}
43483+
43484+/* FIXME-EDWARD */
43485+/* plugin->u.item.b.shift_hook */
43486+int shift_hook_ctail(const coord_t * item /* coord of item */ ,
43487+ unsigned from UNUSED_ARG /* start unit */ ,
43488+ unsigned count UNUSED_ARG /* stop unit */ ,
43489+ znode * old_node /* old parent */ )
43490+{
43491+ assert("edward-479", item != NULL);
43492+ assert("edward-480", item->node != old_node);
43493+
43494+ if (!znode_convertible(old_node) || znode_convertible(item->node))
43495+ return 0;
43496+ if (ctail_convertible(item))
43497+ znode_set_convertible(item->node);
43498+ return 0;
43499+}
43500+
43501+static int
43502+cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
43503+ int cut, void *p, reiser4_key * smallest_removed,
43504+ reiser4_key * new_first)
43505+{
43506+ pos_in_node_t count; /* number of units to cut */
43507+ char *item;
43508+
43509+ count = to - from + 1;
43510+ item = item_body_by_coord(coord);
43511+
43512+ assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
43513+
43514+ if (smallest_removed) {
43515+ /* store smallest key removed */
43516+ item_key_by_coord(coord, smallest_removed);
43517+ set_key_offset(smallest_removed,
43518+ get_key_offset(smallest_removed) + from);
43519+ }
43520+
43521+ if (new_first) {
43522+ assert("vs-1531", from == 0);
43523+
43524+ item_key_by_coord(coord, new_first);
43525+ set_key_offset(new_first,
43526+ get_key_offset(new_first) + from + count);
43527+ }
43528+
43529+ if (!cut)
43530+ kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
43531+
43532+ if (from == 0) {
43533+ if (count != nr_units_ctail(coord)) {
43534+			/* part of the item is removed, so move the free space
43535+			   to the beginning of the item and update the item key */
43536+ reiser4_key key;
43537+ memcpy(item + to + 1, item, sizeof(ctail_item_format));
43538+ item_key_by_coord(coord, &key);
43539+ set_key_offset(&key, get_key_offset(&key) + count);
43540+ node_plugin_by_node(coord->node)->update_item_key(coord,
43541+ &key,
43542+ NULL);
43543+ } else {
43544+			/* cut_units should not be called to cut everything */
43545+ assert("vs-1532", ergo(cut, 0));
43546+			/* the whole item is cut, so more than just the space
43547+			   occupied by the units gets freed */
43548+ count += sizeof(ctail_item_format);
43549+ }
43550+ if (REISER4_DEBUG)
43551+ memset(item, 0, count);
43552+ } else if (REISER4_DEBUG)
43553+ memset(item + sizeof(ctail_item_format) + from, 0, count);
43554+ return count;
43555+}
43556+
43557+/* plugin->u.item.b.cut_units */
43558+int
43559+cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
43560+ carry_cut_data * cdata, reiser4_key * smallest_removed,
43561+ reiser4_key * new_first)
43562+{
43563+ return cut_or_kill_ctail_units(item, from, to, 1, NULL,
43564+ smallest_removed, new_first);
43565+}
43566+
43567+/* plugin->u.item.b.kill_units */
43568+int
43569+kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
43570+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
43571+ reiser4_key * new_first)
43572+{
43573+ return cut_or_kill_ctail_units(item, from, to, 0, kdata,
43574+ smallest_removed, new_first);
43575+}
43576+
43577+/* plugin->u.item.s.file.read */
43578+int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
43579+{
43580+ uf_coord_t *uf_coord;
43581+ coord_t *coord;
43582+
43583+ uf_coord = &hint->ext_coord;
43584+ coord = &uf_coord->coord;
43585+ assert("edward-127", f->user == 0);
43586+ assert("edward-129", coord && coord->node);
43587+ assert("edward-130", coord_is_existing_unit(coord));
43588+ assert("edward-132", znode_is_loaded(coord->node));
43589+
43590+ /* start read only from the beginning of ctail */
43591+ assert("edward-133", coord->unit_pos == 0);
43592+ /* read only whole ctails */
43593+ assert("edward-135", nr_units_ctail(coord) <= f->length);
43594+
43595+ assert("edward-136", reiser4_schedulable());
43596+ assert("edward-886", ctail_ok(coord));
43597+
43598+ if (f->data)
43599+ memcpy(f->data, (char *)first_unit(coord),
43600+ (size_t) nr_units_ctail(coord));
43601+
43602+ dclust_set_extension_shift(hint);
43603+ mark_page_accessed(znode_page(coord->node));
43604+ move_flow_forward(f, nr_units_ctail(coord));
43605+
43606+ return 0;
43607+}
43608+
43609+/**
43610+ * Prepare a transform stream with the plain text for page
43611+ * @page, taking synchronization issues into account.
43612+ */
43613+static int ctail_read_disk_cluster(struct cluster_handle * clust,
43614+ struct inode * inode, struct page * page,
43615+ znode_lock_mode mode)
43616+{
43617+ int result;
43618+
43619+ assert("edward-1450", mode == ZNODE_READ_LOCK || ZNODE_WRITE_LOCK);
43620+ assert("edward-671", clust->hint != NULL);
43621+ assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
43622+ assert("edward-672", cryptcompress_inode_ok(inode));
43623+ assert("edward-1527", PageLocked(page));
43624+
43625+ unlock_page(page);
43626+
43627+ /* set input stream */
43628+ result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
43629+ if (result) {
43630+ lock_page(page);
43631+ return result;
43632+ }
43633+ result = find_disk_cluster(clust, inode, 1 /* read items */, mode);
43634+ lock_page(page);
43635+ if (result)
43636+ return result;
43637+ /*
43638+ * at this point we have locked position in the tree
43639+ */
43640+ assert("edward-1528", znode_is_any_locked(clust->hint->lh.node));
43641+
43642+ if (page->mapping != inode->i_mapping) {
43643+ /* page was truncated */
43644+ reiser4_unset_hint(clust->hint);
43645+ reset_cluster_params(clust);
43646+ return AOP_TRUNCATED_PAGE;
43647+ }
43648+ if (PageUptodate(page)) {
43649+ /* disk cluster can be obsolete, don't use it! */
43650+ reiser4_unset_hint(clust->hint);
43651+ reset_cluster_params(clust);
43652+ return 0;
43653+ }
43654+ if (clust->dstat == FAKE_DISK_CLUSTER ||
43655+ clust->dstat == UNPR_DISK_CLUSTER ||
43656+ clust->dstat == TRNC_DISK_CLUSTER) {
43657+ /*
43658+ * this information about disk cluster will be valid
43659+ * as long as we keep the position in the tree locked
43660+ */
43661+ tfm_cluster_set_uptodate(&clust->tc);
43662+ return 0;
43663+ }
43664+ /* now prepare output stream.. */
43665+ result = grab_coa(&clust->tc, inode_compression_plugin(inode));
43666+ if (result)
43667+ return result;
43668+ /* ..and fill this with plain text */
43669+ result = reiser4_inflate_cluster(clust, inode);
43670+ if (result)
43671+ return result;
43672+ /*
43673+ * The stream is ready! It won't be obsolete as
43674+ * long as we keep last disk cluster item locked.
43675+ */
43676+ tfm_cluster_set_uptodate(&clust->tc);
43677+ return 0;
43678+}
43679+
43680+/*
43681+ * fill one page with plain text.
43682+ */
43683+int do_readpage_ctail(struct inode * inode, struct cluster_handle * clust,
43684+ struct page *page, znode_lock_mode mode)
43685+{
43686+ int ret;
43687+ unsigned cloff;
43688+ char *data;
43689+ size_t to_page;
43690+ struct tfm_cluster * tc = &clust->tc;
43691+
43692+ assert("edward-212", PageLocked(page));
43693+
43694+ if (unlikely(page->mapping != inode->i_mapping))
43695+ return AOP_TRUNCATED_PAGE;
43696+ if (PageUptodate(page))
43697+ goto exit;
43698+ to_page = pbytes(page_index(page), inode);
43699+ if (to_page == 0) {
43700+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
43701+ SetPageUptodate(page);
43702+ goto exit;
43703+ }
43704+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
43705+ clust->index = pg_to_clust(page->index, inode);
43706+
43707+ /* this will unlock/lock the page */
43708+ ret = ctail_read_disk_cluster(clust, inode, page, mode);
43709+
43710+ assert("edward-212", PageLocked(page));
43711+ if (ret)
43712+ return ret;
43713+
43714+ /* refresh bytes */
43715+ to_page = pbytes(page_index(page), inode);
43716+ if (to_page == 0) {
43717+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
43718+ SetPageUptodate(page);
43719+ goto exit;
43720+ }
43721+ }
43722+ if (PageUptodate(page))
43723+		/* somebody else filled it already */
43724+ goto exit;
43725+
43726+ assert("edward-119", tfm_cluster_is_uptodate(tc));
43727+ assert("edward-1529", znode_is_any_locked(clust->hint->lh.node));
43728+
43729+ switch (clust->dstat) {
43730+ case UNPR_DISK_CLUSTER:
43731+ BUG_ON(1);
43732+ case TRNC_DISK_CLUSTER:
43733+ /*
43734+ * Race with truncate!
43735+ * We resolve it in favour of the last one (the only way,
43736+ * as in this case plain text is unrecoverable)
43737+ */
43738+ case FAKE_DISK_CLUSTER:
43739+ /* fill the page by zeroes */
43740+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
43741+ SetPageUptodate(page);
43742+ break;
43743+ case PREP_DISK_CLUSTER:
43744+		/* fill the page from the transform stream holding the plain text */
43745+ assert("edward-1058", !PageUptodate(page));
43746+ assert("edward-120", tc->len <= inode_cluster_size(inode));
43747+
43748+ /* page index in this logical cluster */
43749+ cloff = pg_to_off_to_cloff(page->index, inode);
43750+
43751+ data = kmap(page);
43752+ memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, to_page);
43753+ memset(data + to_page, 0, (size_t) PAGE_CACHE_SIZE - to_page);
43754+ flush_dcache_page(page);
43755+ kunmap(page);
43756+ SetPageUptodate(page);
43757+ break;
43758+ default:
43759+ impossible("edward-1169", "bad disk cluster state");
43760+ }
43761+ exit:
43762+ return 0;
43763+}
43764+
43765+/* plugin->u.item.s.file.readpage */
43766+int readpage_ctail(void *vp, struct page *page)
43767+{
43768+ int result;
43769+ hint_t * hint;
43770+ struct cluster_handle * clust = vp;
43771+
43772+ assert("edward-114", clust != NULL);
43773+ assert("edward-115", PageLocked(page));
43774+ assert("edward-116", !PageUptodate(page));
43775+ assert("edward-118", page->mapping && page->mapping->host);
43776+ assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
43777+
43778+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
43779+ if (hint == NULL) {
43780+ unlock_page(page);
43781+ return RETERR(-ENOMEM);
43782+ }
43783+ clust->hint = hint;
43784+ result = load_file_hint(clust->file, hint);
43785+ if (result) {
43786+ kfree(hint);
43787+ unlock_page(page);
43788+ return result;
43789+ }
43790+ assert("vs-25", hint->ext_coord.lh == &hint->lh);
43791+
43792+ result = do_readpage_ctail(page->mapping->host, clust, page,
43793+ ZNODE_READ_LOCK);
43794+ assert("edward-213", PageLocked(page));
43795+ assert("edward-1163", ergo(!result, PageUptodate(page)));
43796+
43797+ unlock_page(page);
43798+ done_lh(&hint->lh);
43799+ hint->ext_coord.valid = 0;
43800+ save_file_hint(clust->file, hint);
43801+ kfree(hint);
43802+ tfm_cluster_clr_uptodate(&clust->tc);
43803+
43804+ return result;
43805+}
43806+
43807+/* Helper function for ->readpages() */
43808+static int ctail_read_page_cluster(struct cluster_handle * clust,
43809+ struct inode *inode)
43810+{
43811+ int i;
43812+ int result;
43813+ assert("edward-779", clust != NULL);
43814+ assert("edward-1059", clust->win == NULL);
43815+ assert("edward-780", inode != NULL);
43816+
43817+ result = prepare_page_cluster(inode, clust, READ_OP);
43818+ if (result)
43819+ return result;
43820+
43821+ assert("edward-781", !tfm_cluster_is_uptodate(&clust->tc));
43822+
43823+ for (i = 0; i < clust->nr_pages; i++) {
43824+ struct page *page = clust->pages[i];
43825+ lock_page(page);
43826+ result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
43827+ unlock_page(page);
43828+ if (result)
43829+ break;
43830+ }
43831+ tfm_cluster_clr_uptodate(&clust->tc);
43832+ put_page_cluster(clust, inode, READ_OP);
43833+ return result;
43834+}
43835+
43836+/* filler for read_cache_pages() */
43837+static int ctail_readpages_filler(void * data, struct page * page)
43838+{
43839+ int ret = 0;
43840+ struct cluster_handle * clust = data;
43841+ struct inode * inode = clust->file->f_dentry->d_inode;
43842+
43843+ assert("edward-1525", page->mapping == inode->i_mapping);
43844+
43845+ if (PageUptodate(page)) {
43846+ unlock_page(page);
43847+ return 0;
43848+ }
43849+ if (pbytes(page_index(page), inode) == 0) {
43850+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
43851+ SetPageUptodate(page);
43852+ unlock_page(page);
43853+ return 0;
43854+ }
43855+ move_cluster_forward(clust, inode, page->index);
43856+ unlock_page(page);
43857+ /*
43858+ * read the whole page cluster
43859+ */
43860+ ret = ctail_read_page_cluster(clust, inode);
43861+
43862+ assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
43863+ return ret;
43864+}
43865+
43866+/*
43867+ * We populate a bit more than the upper readahead layer suggests:
43868+ * with each nominated page we read the whole page cluster
43869+ * that page belongs to.
43870+ */
43871+int readpages_ctail(struct file *file, struct address_space *mapping,
43872+ struct list_head *pages)
43873+{
43874+ int ret = 0;
43875+ hint_t *hint;
43876+ struct cluster_handle clust;
43877+ struct inode *inode = mapping->host;
43878+
43879+ assert("edward-1521", inode == file->f_dentry->d_inode);
43880+
43881+ cluster_init_read(&clust, NULL);
43882+ clust.file = file;
43883+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
43884+ if (hint == NULL) {
43885+ warning("vs-28", "failed to allocate hint");
43886+ ret = RETERR(-ENOMEM);
43887+ goto exit1;
43888+ }
43889+ clust.hint = hint;
43890+ ret = load_file_hint(clust.file, hint);
43891+ if (ret) {
43892+ warning("edward-1522", "failed to load hint");
43893+ goto exit2;
43894+ }
43895+ assert("vs-26", hint->ext_coord.lh == &hint->lh);
43896+ ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
43897+ if (ret) {
43898+ warning("edward-1523", "failed to alloc pgset");
43899+ goto exit3;
43900+ }
43901+ ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
43902+
43903+ assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
43904+ exit3:
43905+ done_lh(&hint->lh);
43906+ save_file_hint(file, hint);
43907+ hint->ext_coord.valid = 0;
43908+ exit2:
43909+ kfree(hint);
43910+ exit1:
43911+ put_cluster_handle(&clust);
43912+ return ret;
43913+}
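/* Editor's sketch, not part of the original patch: the rounding effect of
 * the filler above. With 4K pages and a 64K logical cluster
 * (cluster_nrpages() == 16), a readahead list that nominates page 21
 * makes ctail_read_page_cluster() populate pages 16..31. The helper
 * below is hypothetical; it assumes nrpages is a power of two.
 */
static pgoff_t first_page_of_cluster(pgoff_t index, unsigned nrpages)
{
	return index & ~((pgoff_t)nrpages - 1);	/* 21 & ~15 == 16 */
}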
43914+
43915+/*
43916+ plugin->u.item.s.file.append_key
43917+ key of the first item of the next disk cluster
43918+*/
43919+reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
43920+{
43921+ assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
43922+ assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
43923+
43924+ item_key_by_coord(coord, key);
43925+ set_key_offset(key, ((__u64) (clust_by_coord(coord, NULL)) + 1)
43926+ << cluster_shift_by_coord(coord));
43927+ return key;
43928+}
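/* Editor's note, a worked instance of the key arithmetic above: with a
 * cluster shift of 16 (64K disk clusters), an item in logical cluster
 * n = 2 gets the append key offset (2 + 1) << 16 = 196608, i.e. the
 * first byte of cluster 3.
 */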
43929+
43930+static int insert_unprepped_ctail(struct cluster_handle * clust,
43931+ struct inode *inode)
43932+{
43933+ int result;
43934+ char buf[UCTAIL_NR_UNITS];
43935+ reiser4_item_data data;
43936+ reiser4_key key;
43937+ int shift = (int)UCTAIL_SHIFT;
43938+
43939+ memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
43940+ result = key_by_inode_cryptcompress(inode,
43941+ clust_to_off(clust->index, inode),
43942+ &key);
43943+ if (result)
43944+ return result;
43945+ data.user = 0;
43946+ data.iplug = item_plugin_by_id(CTAIL_ID);
43947+ data.arg = &shift;
43948+ data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
43949+ data.data = buf;
43950+
43951+ result = insert_by_coord(&clust->hint->ext_coord.coord,
43952+ &data, &key, clust->hint->ext_coord.lh, 0);
43953+ return result;
43954+}
43955+
43956+static int
43957+insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f,
43958+ int cluster_shift)
43959+{
43960+ int result;
43961+ carry_pool *pool;
43962+ carry_level *lowest_level;
43963+ reiser4_item_data *data;
43964+ carry_op *op;
43965+
43966+ pool =
43967+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
43968+ sizeof(*data));
43969+ if (IS_ERR(pool))
43970+ return PTR_ERR(pool);
43971+ lowest_level = (carry_level *) (pool + 1);
43972+ init_carry_level(lowest_level, pool);
43973+ data = (reiser4_item_data *) (lowest_level + 3);
43974+
43975+ assert("edward-466", coord->between == AFTER_ITEM
43976+ || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
43977+ || coord->between == EMPTY_NODE
43978+ || coord->between == BEFORE_UNIT);
43979+
43980+ if (coord->between == AFTER_UNIT) {
43981+ coord->unit_pos = 0;
43982+ coord->between = AFTER_ITEM;
43983+ }
43984+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
43985+ 0 /* operate directly on coord -> node */);
43986+ if (IS_ERR(op) || (op == NULL)) {
43987+ done_carry_pool(pool);
43988+ return RETERR(op ? PTR_ERR(op) : -EIO);
43989+ }
43990+ data->user = 0;
43991+ data->iplug = item_plugin_by_id(CTAIL_ID);
43992+ data->arg = &cluster_shift;
43993+
43994+ data->length = 0;
43995+ data->data = NULL;
43996+
43997+ op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
43998+ op->u.insert_flow.insert_point = coord;
43999+ op->u.insert_flow.flow = f;
44000+ op->u.insert_flow.data = data;
44001+ op->u.insert_flow.new_nodes = 0;
44002+
44003+ lowest_level->track_type = CARRY_TRACK_CHANGE;
44004+ lowest_level->tracked = lh;
44005+
44006+ result = reiser4_carry(lowest_level, NULL);
44007+ done_carry_pool(pool);
44008+
44009+ return result;
44010+}
44011+
44012+/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */
44013+static int insert_cryptcompress_flow_in_place(coord_t * coord,
44014+ lock_handle * lh, flow_t * f,
44015+ int cluster_shift)
44016+{
44017+ int ret;
44018+ coord_t pos;
44019+ lock_handle lock;
44020+
44021+ assert("edward-484",
44022+ coord->between == AT_UNIT || coord->between == AFTER_ITEM);
44023+ assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
44024+
44025+ coord_dup(&pos, coord);
44026+ pos.unit_pos = 0;
44027+ pos.between = AFTER_ITEM;
44028+
44029+ init_lh(&lock);
44030+ copy_lh(&lock, lh);
44031+
44032+ ret = insert_cryptcompress_flow(&pos, &lock, f, cluster_shift);
44033+ done_lh(&lock);
44034+ assert("edward-1347", znode_is_write_locked(lh->node));
44035+ assert("edward-1228", !ret);
44036+ return ret;
44037+}
44038+
44039+/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */
44040+static int overwrite_ctail(coord_t * coord, flow_t * f)
44041+{
44042+ unsigned count;
44043+
44044+ assert("edward-269", f->user == 0);
44045+ assert("edward-270", f->data != NULL);
44046+ assert("edward-271", f->length > 0);
44047+ assert("edward-272", coord_is_existing_unit(coord));
44048+ assert("edward-273", coord->unit_pos == 0);
44049+ assert("edward-274", znode_is_write_locked(coord->node));
44050+ assert("edward-275", reiser4_schedulable());
44051+ assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
44052+ assert("edward-1243", ctail_ok(coord));
44053+
44054+ count = nr_units_ctail(coord);
44055+
44056+ if (count > f->length)
44057+ count = f->length;
44058+ memcpy(first_unit(coord), f->data, count);
44059+ move_flow_forward(f, count);
44060+ coord->unit_pos += count;
44061+ return 0;
44062+}
44063+
44064+/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
44065+ cut ctail (part or whole) starting from next unit position */
44066+static int cut_ctail(coord_t * coord)
44067+{
44068+ coord_t stop;
44069+
44070+ assert("edward-435", coord->between == AT_UNIT &&
44071+ coord->item_pos < coord_num_items(coord) &&
44072+ coord->unit_pos <= coord_num_units(coord));
44073+
44074+ if (coord->unit_pos == coord_num_units(coord))
44075+ /* nothing to cut */
44076+ return 0;
44077+ coord_dup(&stop, coord);
44078+ stop.unit_pos = coord_last_unit_pos(coord);
44079+
44080+ return cut_node_content(coord, &stop, NULL, NULL, NULL);
44081+}
44082+
44083+int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
44084+ struct inode * inode)
44085+{
44086+ int result;
44087+ assert("edward-1244", inode != NULL);
44088+ assert("edward-1245", clust->hint != NULL);
44089+ assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
44090+ assert("edward-1247", clust->reserved == 1);
44091+
44092+ result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
44093+ if (cbk_errored(result))
44094+ return result;
44095+ assert("edward-1249", result == CBK_COORD_NOTFOUND);
44096+ assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
44097+
44098+ assert("edward-1295",
44099+ clust->hint->ext_coord.lh->node ==
44100+ clust->hint->ext_coord.coord.node);
44101+
44102+ coord_set_between_clusters(&clust->hint->ext_coord.coord);
44103+
44104+ result = insert_unprepped_ctail(clust, inode);
44105+ all_grabbed2free();
44106+
44107+ assert("edward-1251", !result);
44108+ assert("edward-1252", cryptcompress_inode_ok(inode));
44109+ assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
44110+ assert("edward-1254",
44111+ reiser4_clustered_blocks(reiser4_get_current_sb()));
44112+ assert("edward-1255",
44113+ znode_convertible(clust->hint->ext_coord.coord.node));
44114+
44115+ return result;
44116+}
44117+
44118+static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode)
44119+{
44120+ int result = 0;
44121+ struct convert_item_info * info;
44122+
44123+ assert("edward-468", pos != NULL);
44124+ assert("edward-469", pos->sq != NULL);
44125+ assert("edward-845", item_convert_data(pos) != NULL);
44126+
44127+ info = item_convert_data(pos);
44128+ assert("edward-679", info->flow.data != NULL);
44129+
44130+ switch (mode) {
44131+ case CRC_APPEND_ITEM:
44132+ assert("edward-1229", info->flow.length != 0);
44133+ assert("edward-1256",
44134+ cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
44135+ result =
44136+ insert_cryptcompress_flow_in_place(&pos->coord,
44137+ &pos->lock,
44138+ &info->flow,
44139+ info->cluster_shift);
44140+ break;
44141+ case CRC_OVERWRITE_ITEM:
44142+ assert("edward-1230", info->flow.length != 0);
44143+ overwrite_ctail(&pos->coord, &info->flow);
44144+ if (info->flow.length != 0)
44145+ break;
44146+ case CRC_CUT_ITEM:
44147+ assert("edward-1231", info->flow.length == 0);
44148+ result = cut_ctail(&pos->coord);
44149+ break;
44150+ default:
44151+ result = RETERR(-EIO);
44152+ impossible("edward-244", "bad convert mode");
44153+ }
44154+ return result;
44155+}
44156+
44157+/* plugin->u.item.f.scan */
44158+int scan_ctail(flush_scan * scan)
44159+{
44160+ int result = 0;
44161+ struct page *page;
44162+ struct inode *inode;
44163+ jnode *node = scan->node;
44164+
44165+ assert("edward-227", scan->node != NULL);
44166+ assert("edward-228", jnode_is_cluster_page(scan->node));
44167+ assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
44168+
44169+ page = jnode_page(node);
44170+ inode = page->mapping->host;
44171+
44172+ if (!reiser4_scanning_left(scan))
44173+ return result;
44174+ if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
44175+ znode_make_dirty(scan->parent_lock.node);
44176+
44177+ if (!znode_convertible(scan->parent_lock.node)) {
44178+ if (JF_ISSET(scan->node, JNODE_DIRTY))
44179+ znode_set_convertible(scan->parent_lock.node);
44180+ else {
44181+ warning("edward-681",
44182+ "cluster page is already processed");
44183+ return -EAGAIN;
44184+ }
44185+ }
44186+ return result;
44187+}
44188+
44189+/* Returns true if convert info should be attached; the leftmost child is then kept referenced in pos->child */
44190+static int should_attach_convert_idata(flush_pos_t * pos)
44191+{
44192+ int result;
44193+ assert("edward-431", pos != NULL);
44194+ assert("edward-432", pos->child == NULL);
44195+ assert("edward-619", znode_is_write_locked(pos->coord.node));
44196+ assert("edward-470",
44197+ item_plugin_by_coord(&pos->coord) ==
44198+ item_plugin_by_id(CTAIL_ID));
44199+
44200+ /* check for leftmost child */
44201+ utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
44202+
44203+ if (!pos->child)
44204+ return 0;
44205+ spin_lock_jnode(pos->child);
44206+ result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
44207+ pos->child->atom == ZJNODE(pos->coord.node)->atom);
44208+ spin_unlock_jnode(pos->child);
44209+ if (!result && pos->child) {
44210+		/* the existing child is not to be attached; release it */
44211+ jput(pos->child);
44212+ pos->child = NULL;
44213+ }
44214+ return result;
44215+}
44216+
44217+/**
44218+ * Collect all needed information about the object here,
44219+ * as the in-memory inode can be evicted from memory before
44220+ * the disk update completes.
44221+ */
44222+static int init_convert_data_ctail(struct convert_item_info * idata,
44223+ struct inode *inode)
44224+{
44225+ assert("edward-813", idata != NULL);
44226+ assert("edward-814", inode != NULL);
44227+
44228+ idata->cluster_shift = inode_cluster_shift(inode);
44229+ idata->d_cur = DC_FIRST_ITEM;
44230+ idata->d_next = DC_INVALID_STATE;
44231+
44232+ return 0;
44233+}
44234+
44235+static int alloc_item_convert_data(struct convert_info * sq)
44236+{
44237+ assert("edward-816", sq != NULL);
44238+ assert("edward-817", sq->itm == NULL);
44239+
44240+ sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get());
44241+ if (sq->itm == NULL)
44242+ return RETERR(-ENOMEM);
44243+ return 0;
44244+}
44245+
44246+static void free_item_convert_data(struct convert_info * sq)
44247+{
44248+ assert("edward-818", sq != NULL);
44249+ assert("edward-819", sq->itm != NULL);
44250+ assert("edward-820", sq->iplug != NULL);
44251+
44252+ kfree(sq->itm);
44253+ sq->itm = NULL;
44254+ return;
44255+}
44256+
44257+static int alloc_convert_data(flush_pos_t * pos)
44258+{
44259+ assert("edward-821", pos != NULL);
44260+ assert("edward-822", pos->sq == NULL);
44261+
44262+ pos->sq = kmalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get());
44263+ if (!pos->sq)
44264+ return RETERR(-ENOMEM);
44265+ memset(pos->sq, 0, sizeof(*pos->sq));
44266+ cluster_init_write(&pos->sq->clust, NULL);
44267+ return 0;
44268+}
44269+
44270+void free_convert_data(flush_pos_t * pos)
44271+{
44272+ struct convert_info *sq;
44273+
44274+ assert("edward-823", pos != NULL);
44275+ assert("edward-824", pos->sq != NULL);
44276+
44277+ sq = pos->sq;
44278+ if (sq->itm)
44279+ free_item_convert_data(sq);
44280+ put_cluster_handle(&sq->clust);
44281+ kfree(pos->sq);
44282+ pos->sq = NULL;
44283+ return;
44284+}
44285+
44286+static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
44287+{
44288+ struct convert_info *sq;
44289+
44290+ assert("edward-825", pos != NULL);
44291+ assert("edward-826", pos->sq != NULL);
44292+ assert("edward-827", item_convert_data(pos) != NULL);
44293+ assert("edward-828", inode != NULL);
44294+
44295+ sq = pos->sq;
44296+
44297+ memset(sq->itm, 0, sizeof(*sq->itm));
44298+
44299+ /* iplug->init_convert_data() */
44300+ return init_convert_data_ctail(sq->itm, inode);
44301+}
44302+
44303+/* create and attach disk cluster info used by 'convert' phase of the flush
44304+ squalloc() */
44305+static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
44306+{
44307+ int ret = 0;
44308+ struct convert_item_info *info;
44309+ struct cluster_handle *clust;
44310+ file_plugin *fplug = inode_file_plugin(inode);
44311+ compression_plugin *cplug = inode_compression_plugin(inode);
44312+
44313+ assert("edward-248", pos != NULL);
44314+ assert("edward-249", pos->child != NULL);
44315+ assert("edward-251", inode != NULL);
44316+ assert("edward-682", cryptcompress_inode_ok(inode));
44317+ assert("edward-252",
44318+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
44319+ assert("edward-473",
44320+ item_plugin_by_coord(&pos->coord) ==
44321+ item_plugin_by_id(CTAIL_ID));
44322+
44323+ if (!pos->sq) {
44324+ ret = alloc_convert_data(pos);
44325+ if (ret)
44326+ return ret;
44327+ }
44328+ clust = &pos->sq->clust;
44329+ ret = grab_coa(&clust->tc, cplug);
44330+ if (ret)
44331+ goto err;
44332+ ret = set_cluster_by_page(clust,
44333+ jnode_page(pos->child),
44334+ MAX_CLUSTER_NRPAGES);
44335+ if (ret)
44336+ goto err;
44337+
44338+ assert("edward-829", pos->sq != NULL);
44339+ assert("edward-250", item_convert_data(pos) == NULL);
44340+
44341+ pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
44342+
44343+ ret = alloc_item_convert_data(pos->sq);
44344+ if (ret)
44345+ goto err;
44346+ ret = init_item_convert_data(pos, inode);
44347+ if (ret)
44348+ goto err;
44349+ info = item_convert_data(pos);
44350+
44351+ ret = checkout_logical_cluster(clust, pos->child, inode);
44352+ if (ret)
44353+ goto err;
44354+
44355+ reiser4_deflate_cluster(clust, inode);
44356+ inc_item_convert_count(pos);
44357+
44358+ /* prepare flow for insertion */
44359+ fplug->flow_by_inode(inode,
44360+ (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
44361+ 0 /* kernel space */ ,
44362+ clust->tc.len,
44363+ clust_to_off(clust->index, inode),
44364+ WRITE_OP, &info->flow);
44365+ jput(pos->child);
44366+ return 0;
44367+ err:
44368+ jput(pos->child);
44369+ free_convert_data(pos);
44370+ return ret;
44371+}
44372+
44373+/* clear up disk cluster info */
44374+static void detach_convert_idata(struct convert_info * sq)
44375+{
44376+ struct convert_item_info *info;
44377+
44378+ assert("edward-253", sq != NULL);
44379+ assert("edward-840", sq->itm != NULL);
44380+
44381+ info = sq->itm;
44382+ assert("edward-1212", info->flow.length == 0);
44383+
44384+ free_item_convert_data(sq);
44385+ return;
44386+}
44387+
44388+/* plugin->u.item.f.utmost_child */
44389+
44390+/* This function sets the leftmost child of the first cluster item,
44391+   if the child exists, and NULL otherwise.
44392+ NOTE-EDWARD: Do not call this for RIGHT_SIDE */
44393+
44394+int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
44395+{
44396+ reiser4_key key;
44397+
44398+ item_key_by_coord(coord, &key);
44399+
44400+ assert("edward-257", coord != NULL);
44401+ assert("edward-258", child != NULL);
44402+ assert("edward-259", side == LEFT_SIDE);
44403+ assert("edward-260",
44404+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
44405+
44406+ if (!is_disk_cluster_key(&key, coord))
44407+ *child = NULL;
44408+ else
44409+ *child = jlookup(current_tree,
44410+ get_key_objectid(item_key_by_coord
44411+ (coord, &key)),
44412+ off_to_pg(get_key_offset(&key)));
44413+ return 0;
44414+}
44415+
44416+/* Returns true if @p2 is the item that immediately follows @p1
44417+   in the _same_ disk cluster.
44418+   A disk cluster is a set of items. If ->clustered() != NULL,
44419+   the whole disk cluster should be read/modified along with each item
44420+*/
44421+
44422+/* Go rightward and check for the next disk cluster item; set
44423+ * d_next to DC_CHAINED_ITEM if such an item exists.
44424+ * If the current position is the last item, go to the right neighbor.
44425+ * Skip empty nodes. Note that right neighbors may not be in
44426+ * the slum because of races. If so, make them dirty and
44427+ * convertible.
44428+ */
44429+static int next_item_dc_stat(flush_pos_t * pos)
44430+{
44431+ int ret = 0;
44432+ int stop = 0;
44433+ znode *cur;
44434+ coord_t coord;
44435+ lock_handle lh;
44436+ lock_handle right_lock;
44437+
44438+ assert("edward-1232", !node_is_empty(pos->coord.node));
44439+ assert("edward-1014",
44440+ pos->coord.item_pos < coord_num_items(&pos->coord));
44441+ assert("edward-1015", chaining_data_present(pos));
44442+ assert("edward-1017",
44443+ item_convert_data(pos)->d_next == DC_INVALID_STATE);
44444+
44445+ item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
44446+
44447+ if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
44448+ return ret;
44449+ if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
44450+ return ret;
44451+
44452+	/* Check the next slum item.
44453+	 * Note that it cannot be killed by a concurrent truncate,
44454+	 * as the truncate would need the lock held by us.
44455+ */
44456+ init_lh(&right_lock);
44457+ cur = pos->coord.node;
44458+
44459+ while (!stop) {
44460+ init_lh(&lh);
44461+ ret = reiser4_get_right_neighbor(&lh,
44462+ cur,
44463+ ZNODE_WRITE_LOCK,
44464+ GN_CAN_USE_UPPER_LEVELS);
44465+ if (ret)
44466+ break;
44467+ ret = zload(lh.node);
44468+ if (ret) {
44469+ done_lh(&lh);
44470+ break;
44471+ }
44472+ coord_init_before_first_item(&coord, lh.node);
44473+
44474+ if (node_is_empty(lh.node)) {
44475+ znode_make_dirty(lh.node);
44476+ znode_set_convertible(lh.node);
44477+ stop = 0;
44478+ } else if (same_disk_cluster(&pos->coord, &coord)) {
44479+
44480+ item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
44481+
44482+ if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
44483+ /*
44484+ warning("edward-1024",
44485+ "next slum item mergeable, "
44486+ "but znode %p isn't dirty\n",
44487+ lh.node);
44488+ */
44489+ znode_make_dirty(lh.node);
44490+ }
44491+ if (!znode_convertible(lh.node)) {
44492+ /*
44493+ warning("edward-1272",
44494+ "next slum item mergeable, "
44495+ "but znode %p isn't convertible\n",
44496+ lh.node);
44497+ */
44498+ znode_set_convertible(lh.node);
44499+ }
44500+ stop = 1;
44501+ } else
44502+ stop = 1;
44503+ zrelse(lh.node);
44504+ done_lh(&right_lock);
44505+ copy_lh(&right_lock, &lh);
44506+ done_lh(&lh);
44507+ cur = right_lock.node;
44508+ }
44509+ done_lh(&right_lock);
44510+
44511+ if (ret == -E_NO_NEIGHBOR)
44512+ ret = 0;
44513+ return ret;
44514+}
44515+
44516+static int
44517+assign_convert_mode(struct convert_item_info * idata,
44518+ cryptcompress_write_mode_t * mode)
44519+{
44520+ int result = 0;
44521+
44522+ assert("edward-1025", idata != NULL);
44523+
44524+ if (idata->flow.length) {
44525+ /* append or overwrite */
44526+ switch (idata->d_cur) {
44527+ case DC_FIRST_ITEM:
44528+ case DC_CHAINED_ITEM:
44529+ *mode = CRC_OVERWRITE_ITEM;
44530+ break;
44531+ case DC_AFTER_CLUSTER:
44532+ *mode = CRC_APPEND_ITEM;
44533+ break;
44534+ default:
44535+ impossible("edward-1018", "wrong current item state");
44536+ }
44537+ } else {
44538+ /* cut or invalidate */
44539+ switch (idata->d_cur) {
44540+ case DC_FIRST_ITEM:
44541+ case DC_CHAINED_ITEM:
44542+ *mode = CRC_CUT_ITEM;
44543+ break;
44544+ case DC_AFTER_CLUSTER:
44545+ result = 1;
44546+ break;
44547+ default:
44548+ impossible("edward-1019", "wrong current item state");
44549+ }
44550+ }
44551+ return result;
44552+}
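/* Editor's summary of the mapping implemented above (no new behaviour):
 *
 *	flow.length	d_cur				mode
 *	-----------	---------------------------	------------------
 *	> 0		DC_FIRST_/DC_CHAINED_ITEM	CRC_OVERWRITE_ITEM
 *	> 0		DC_AFTER_CLUSTER		CRC_APPEND_ITEM
 *	== 0		DC_FIRST_/DC_CHAINED_ITEM	CRC_CUT_ITEM
 *	== 0		DC_AFTER_CLUSTER		done (returns 1)
 */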
44553+
44554+/* plugin->u.item.f.convert */
44555+/* write ctail in guessed mode */
44556+int convert_ctail(flush_pos_t * pos)
44557+{
44558+ int result;
44559+ int nr_items;
44560+ cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM;
44561+
44562+ assert("edward-1020", pos != NULL);
44563+ assert("edward-1213", coord_num_items(&pos->coord) != 0);
44564+ assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
44565+ assert("edward-1258", ctail_ok(&pos->coord));
44566+ assert("edward-261", pos->coord.node != NULL);
44567+
44568+ nr_items = coord_num_items(&pos->coord);
44569+ if (!chaining_data_present(pos)) {
44570+ if (should_attach_convert_idata(pos)) {
44571+ /* attach convert item info */
44572+ struct inode *inode;
44573+
44574+ assert("edward-264", pos->child != NULL);
44575+ assert("edward-265", jnode_page(pos->child) != NULL);
44576+ assert("edward-266",
44577+ jnode_page(pos->child)->mapping != NULL);
44578+
44579+ inode = jnode_page(pos->child)->mapping->host;
44580+
44581+ assert("edward-267", inode != NULL);
44582+
44583+			/* attach item convert info by the child, then put the child */
44584+ result = attach_convert_idata(pos, inode);
44585+ pos->child = NULL;
44586+ if (result == -E_REPEAT) {
44587+				/* the jnode became clean, or there are no dirty
44588+				   pages (nothing to update in the disk cluster) */
44589+ warning("edward-1021",
44590+ "convert_ctail: nothing to attach");
44591+ return 0;
44592+ }
44593+ if (result != 0)
44594+ return result;
44595+ } else
44596+ /* unconvertible */
44597+ return 0;
44598+ } else {
44599+ /* use old convert info */
44600+
44601+ struct convert_item_info *idata;
44602+
44603+ idata = item_convert_data(pos);
44604+
44605+ result = assign_convert_mode(idata, &mode);
44606+ if (result) {
44607+ /* disk cluster is over,
44608+ nothing to update anymore */
44609+ detach_convert_idata(pos->sq);
44610+ return 0;
44611+ }
44612+ }
44613+
44614+ assert("edward-433", chaining_data_present(pos));
44615+ assert("edward-1022",
44616+ pos->coord.item_pos < coord_num_items(&pos->coord));
44617+
44618+ /* check if next item is of current disk cluster */
44619+ result = next_item_dc_stat(pos);
44620+ if (result) {
44621+ detach_convert_idata(pos->sq);
44622+ return result;
44623+ }
44624+ result = do_convert_ctail(pos, mode);
44625+ if (result) {
44626+ detach_convert_idata(pos->sq);
44627+ return result;
44628+ }
44629+ switch (mode) {
44630+ case CRC_CUT_ITEM:
44631+ assert("edward-1214", item_convert_data(pos)->flow.length == 0);
44632+ assert("edward-1215",
44633+ coord_num_items(&pos->coord) == nr_items ||
44634+ coord_num_items(&pos->coord) == nr_items - 1);
44635+ if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
44636+ break;
44637+ if (coord_num_items(&pos->coord) != nr_items) {
44638+ /* the item was killed, no more chained items */
44639+ detach_convert_idata(pos->sq);
44640+ if (!node_is_empty(pos->coord.node))
44641+ /* make sure the next item will be scanned */
44642+ coord_init_before_item(&pos->coord);
44643+ break;
44644+ }
44645+ case CRC_APPEND_ITEM:
44646+ assert("edward-434", item_convert_data(pos)->flow.length == 0);
44647+ detach_convert_idata(pos->sq);
44648+ break;
44649+ case CRC_OVERWRITE_ITEM:
44650+ if (coord_is_unprepped_ctail(&pos->coord)) {
44651+			/* convert unprepped ctail to prepped one */
44652+ assert("edward-1259",
44653+ cluster_shift_ok(item_convert_data(pos)->
44654+ cluster_shift));
44655+ put_unaligned((d8)item_convert_data(pos)->cluster_shift,
44656+ &ctail_formatted_at(&pos->coord)->
44657+ cluster_shift);
44658+ }
44659+ break;
44660+ }
44661+ return result;
44662+}
44663+
44664+/* Make Linus happy.
44665+ Local variables:
44666+ c-indentation-style: "K&R"
44667+ mode-name: "LC"
44668+ c-basic-offset: 8
44669+ tab-width: 8
44670+ fill-column: 120
44671+ End:
44672+*/
44673diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/ctail.h linux-2.6.24/fs/reiser4/plugin/item/ctail.h
44674--- linux-2.6.24.orig/fs/reiser4/plugin/item/ctail.h 1970-01-01 03:00:00.000000000 +0300
44675+++ linux-2.6.24/fs/reiser4/plugin/item/ctail.h 2008-01-25 11:39:07.008226236 +0300
44676@@ -0,0 +1,102 @@
44677+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44678+
44679+/* Ctail items are fragments (or bodies) of a special type providing
44680+   optimal storage of encrypted and/or compressed files. */
44681+
44682+
44683+#if !defined( __FS_REISER4_CTAIL_H__ )
44684+#define __FS_REISER4_CTAIL_H__
44685+
44686+/* Disk format of ctail item */
44687+typedef struct ctail_item_format {
44688+ /* packed shift;
44689+ if its value is different from UCTAIL_SHIFT (see below), then
44690+ size of disk cluster is calculated as (1 << cluster_shift) */
44691+ d8 cluster_shift;
44692+ /* ctail body */
44693+ d8 body[0];
44694+} __attribute__ ((packed)) ctail_item_format;
44695+
44696+/* "Unprepped" disk cluster is represented by a single ctail item
44697+ with the following "magic" attributes: */
44698+/* "magic" cluster_shift */
44699+#define UCTAIL_SHIFT 0xff
44700+/* How many units unprepped ctail item has */
44701+#define UCTAIL_NR_UNITS 1
44702+
44703+/* The following is a set of various item states in a disk cluster.
44704+ Disk cluster is a set of items whose keys belong to the interval
44705+ [dc_key , dc_key + disk_cluster_size - 1] */
44706+typedef enum {
44707+ DC_INVALID_STATE = 0,
44708+ DC_FIRST_ITEM = 1,
44709+ DC_CHAINED_ITEM = 2,
44710+ DC_AFTER_CLUSTER = 3
44711+} dc_item_stat;
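/* Editor's sketch, not part of the original patch: hypothetical helpers
 * spelling out the two definitions above -- how cluster_shift encodes the
 * disk cluster size, and what the key interval of a disk cluster means
 * (items of one disk cluster also share the key's objectid).
 */
static inline int dc_is_unprepped(d8 shift)
{
	return shift == UCTAIL_SHIFT;	/* the 0xff "magic" */
}

static inline __u64 dc_size(d8 shift)
{
	/* meaningful for prepped disk clusters only */
	return (__u64)1 << shift;
}

/* an offset belongs to the disk cluster starting at @dc_key_off iff: */
static inline int off_in_dcluster(__u64 off, __u64 dc_key_off, __u64 size)
{
	return off >= dc_key_off && off <= dc_key_off + size - 1;
}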
44712+
44713+/* ctail-specific extension.
44714+ In particular this describes parameters of disk cluster an item belongs to */
44715+struct ctail_coord_extension {
44716+ int shift; /* this contains cluster_shift extracted from
44717+ ctail_item_format (above), or UCTAIL_SHIFT
44718+ (the last one is the "magic" of unprepped disk clusters)*/
44719+ int dsize; /* size of a prepped disk cluster */
44720+ int ncount; /* count of nodes occupied by a disk cluster */
44721+};
44722+
44723+struct cut_list;
44724+
44725+/* plugin->item.b.* */
44726+int can_contain_key_ctail(const coord_t *, const reiser4_key *,
44727+ const reiser4_item_data *);
44728+int mergeable_ctail(const coord_t * p1, const coord_t * p2);
44729+pos_in_node_t nr_units_ctail(const coord_t * coord);
44730+int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
44731+void print_ctail(const char *prefix, coord_t * coord);
44732+lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
44733+
44734+int paste_ctail(coord_t * coord, reiser4_item_data * data,
44735+ carry_plugin_info * info UNUSED_ARG);
44736+int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
44737+int can_shift_ctail(unsigned free_space, coord_t * coord,
44738+ znode * target, shift_direction pend, unsigned *size,
44739+ unsigned want);
44740+void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
44741+ unsigned count, shift_direction where_is_free_space,
44742+ unsigned free_space);
44743+int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
44744+ carry_cut_data *, reiser4_key * smallest_removed,
44745+ reiser4_key * new_first);
44746+int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
44747+ carry_kill_data *, reiser4_key * smallest_removed,
44748+ reiser4_key * new_first);
44749+int ctail_ok(const coord_t * coord);
44750+int check_ctail(const coord_t * coord, const char **error);
44751+
44752+/* plugin->u.item.s.* */
44753+int read_ctail(struct file *, flow_t *, hint_t *);
44754+int readpage_ctail(void *, struct page *);
44755+int readpages_ctail(struct file *, struct address_space *, struct list_head *);
44756+reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
44757+int create_hook_ctail(const coord_t * coord, void *arg);
44758+int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
44759+ carry_kill_data *);
44760+int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
44761+
44762+/* plugin->u.item.f */
44763+int utmost_child_ctail(const coord_t *, sideof, jnode **);
44764+int scan_ctail(flush_scan *);
44765+int convert_ctail(flush_pos_t *);
44766+size_t inode_scaled_cluster_size(struct inode *);
44767+
44768+#endif /* __FS_REISER4_CTAIL_H__ */
44769+
44770+/* Make Linus happy.
44771+ Local variables:
44772+ c-indentation-style: "K&R"
44773+ mode-name: "LC"
44774+ c-basic-offset: 8
44775+ tab-width: 8
44776+ fill-column: 120
44777+ End:
44778+*/
44779diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/extent.c linux-2.6.24/fs/reiser4/plugin/item/extent.c
44780--- linux-2.6.24.orig/fs/reiser4/plugin/item/extent.c 1970-01-01 03:00:00.000000000 +0300
44781+++ linux-2.6.24/fs/reiser4/plugin/item/extent.c 2008-01-25 11:39:07.008226236 +0300
44782@@ -0,0 +1,197 @@
44783+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44784+
44785+#include "item.h"
44786+#include "../../key.h"
44787+#include "../../super.h"
44788+#include "../../carry.h"
44789+#include "../../inode.h"
44790+#include "../../page_cache.h"
44791+#include "../../flush.h"
44792+#include "../object.h"
44793+
44794+/* prepare structure reiser4_item_data; it is used to put one extent unit into the tree */
44795+/* Audited by: green(2002.06.13) */
44796+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
44797+ int nr_extents)
44798+{
44799+ data->data = ext_unit;
44800+ /* data->data is kernel space */
44801+ data->user = 0;
44802+ data->length = sizeof(reiser4_extent) * nr_extents;
44803+ data->arg = NULL;
44804+ data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
44805+ return data;
44806+}
44807+
44808+/* how many bytes are addressed by the first @nr extents of the extent item */
44809+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr)
44810+{
44811+ pos_in_node_t i;
44812+ reiser4_block_nr blocks;
44813+ reiser4_extent *ext;
44814+
44815+ ext = item_body_by_coord(coord);
44816+ assert("vs-263", nr <= nr_units_extent(coord));
44817+
44818+ blocks = 0;
44819+ for (i = 0; i < nr; i++, ext++) {
44820+ blocks += extent_get_width(ext);
44821+ }
44822+
44823+ return blocks * current_blocksize;
44824+}
44825+
44826+extent_state state_of_extent(reiser4_extent * ext)
44827+{
44828+ switch ((int)extent_get_start(ext)) {
44829+ case 0:
44830+ return HOLE_EXTENT;
44831+ case 1:
44832+ return UNALLOCATED_EXTENT;
44833+ default:
44834+ break;
44835+ }
44836+ return ALLOCATED_EXTENT;
44837+}
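/* Editor's note on the encoding above: the extent start field doubles as
 * a state tag. Start 0 marks a hole and start 1 an unallocated extent
 * (presumably the HOLE_EXTENT_START and UNALLOCATED_EXTENT_START
 * constants used further below); any other value is the real disk
 * address of an allocated extent.
 */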
44838+
44839+int extent_is_unallocated(const coord_t * item)
44840+{
44841+ assert("jmacd-5133", item_is_extent(item));
44842+
44843+ return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
44844+}
44845+
44846+/* set extent's start and width */
44847+void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start,
44848+ reiser4_block_nr width)
44849+{
44850+ extent_set_start(ext, start);
44851+ extent_set_width(ext, width);
44852+}
44853+
44854+/**
44855+ * reiser4_replace_extent - replace extent and paste 1 or 2 after it
44856+ * @h: replace handle carrying the coord and lock handle of the unit to
44857+ * be overwritten, the replacement extent @h->overwrite, the new units
44858+ * prepared in @h->new_extents, the paste key and the paste flags
44859+ * @return_inserted_position: selects what @h->coord and @h->lh are set
44860+ * to on return, see below
44861+ *
44862+ * Overwrites one extent unit and pastes one or two more after the
44863+ * overwritten one. If @return_inserted_position is 1, @h->coord and
44864+ * @h->lh are returned set to the first of the newly inserted units;
44865+ * if it is 0, they are returned set to the extent which was overwritten.
44868+ */
44869+int reiser4_replace_extent(struct replace_handle *h,
44870+ int return_inserted_position)
44871+{
44872+ int result;
44873+ znode *orig_znode;
44874+ /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */
44875+
44876+ assert("vs-990", coord_is_existing_unit(h->coord));
44877+ assert("vs-1375", znode_is_write_locked(h->coord->node));
44878+ assert("vs-1426", extent_get_width(&h->overwrite) != 0);
44879+ assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
44880+ assert("vs-1427", ergo(h->nr_new_extents == 2,
44881+ extent_get_width(&h->new_extents[1]) != 0));
44882+
44883+ /* compose structure for paste */
44884+ init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
44885+
44886+ coord_dup(&h->coord_after, h->coord);
44887+ init_lh(&h->lh_after);
44888+ copy_lh(&h->lh_after, h->lh);
44889+ reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
44890+ reiser4_tap_monitor(&h->watch);
44891+
44892+ ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
44893+ orig_znode = h->coord->node;
44894+
44895+#if REISER4_DEBUG
44896+ /* make sure that key is set properly */
44897+ unit_key_by_coord(h->coord, &h->tmp);
44898+ set_key_offset(&h->tmp,
44899+ get_key_offset(&h->tmp) +
44900+ extent_get_width(&h->overwrite) * current_blocksize);
44901+ assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
44902+#endif
44903+
44904+ /* set insert point after unit to be replaced */
44905+ h->coord->between = AFTER_UNIT;
44906+
44907+ result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
44908+ &h->paste_key, &h->item, h->flags);
44909+ if (!result) {
44910+ /* now we have to replace the unit after which new units were
44911+ inserted. Its position is tracked by @watch */
44912+ reiser4_extent *ext;
44913+ znode *node;
44914+
44915+ node = h->coord_after.node;
44916+ if (node != orig_znode) {
44917+ coord_clear_iplug(&h->coord_after);
44918+ result = zload(node);
44919+ }
44920+
44921+ if (likely(!result)) {
44922+ ext = extent_by_coord(&h->coord_after);
44923+
44924+ assert("vs-987", znode_is_loaded(node));
44925+ assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
44926+
44927+ /* overwrite extent unit */
44928+ memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
44929+ znode_make_dirty(node);
44930+
44931+ if (node != orig_znode)
44932+ zrelse(node);
44933+
44934+ if (return_inserted_position == 0) {
44935+ /* coord and lh are to be set to overwritten
44936+ extent */
44937+ assert("vs-1662",
44938+ WITH_DATA(node, !memcmp(&h->overwrite,
44939+ extent_by_coord(
44940+ &h->coord_after),
44941+ sizeof(reiser4_extent))));
44942+
44943+ *h->coord = h->coord_after;
44944+ done_lh(h->lh);
44945+ copy_lh(h->lh, &h->lh_after);
44946+ } else {
44947+ /* h->coord and h->lh are to be set to first of
44948+ inserted units */
44949+ assert("vs-1663",
44950+ WITH_DATA(h->coord->node,
44951+ !memcmp(&h->new_extents[0],
44952+ extent_by_coord(h->coord),
44953+ sizeof(reiser4_extent))));
44954+ assert("vs-1664", h->lh->node == h->coord->node);
44955+ }
44956+ }
44957+ }
44958+ reiser4_tap_done(&h->watch);
44959+
44960+ return result;
44961+}
44962+
44963+lock_handle *znode_lh(znode *node)
44964+{
44965+ assert("vs-1371", znode_is_write_locked(node));
44966+ assert("vs-1372", znode_is_wlocked_once(node));
44967+ return list_entry(node->lock.owners.next, lock_handle, owners_link);
44968+}
44969+
44970+/*
44971+ * Local variables:
44972+ * c-indentation-style: "K&R"
44973+ * mode-name: "LC"
44974+ * c-basic-offset: 8
44975+ * tab-width: 8
44976+ * fill-column: 79
44977+ * scroll-step: 1
44978+ * End:
44979+ */
44980diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.24/fs/reiser4/plugin/item/extent_file_ops.c
44981--- linux-2.6.24.orig/fs/reiser4/plugin/item/extent_file_ops.c 1970-01-01 03:00:00.000000000 +0300
44982+++ linux-2.6.24/fs/reiser4/plugin/item/extent_file_ops.c 2008-01-25 11:40:16.698169785 +0300
44983@@ -0,0 +1,1450 @@
44984+/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44985+
44986+#include "item.h"
44987+#include "../../inode.h"
44988+#include "../../page_cache.h"
44989+#include "../object.h"
44990+
44991+#include <linux/quotaops.h>
44992+#include <linux/swap.h>
44993+
44994+static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
44995+{
44996+ reiser4_extent *ext;
44997+
44998+ ext = (reiser4_extent *) (zdata(node) + offset);
44999+ return ext;
45000+}
45001+
45002+/**
45003+ * check_uf_coord - verify coord extension
45004+ * @uf_coord:
45005+ * @key:
45006+ *
45007+ * Makes sure that all fields of @uf_coord are set properly. If @key is
45008+ * specified, also check that @uf_coord corresponds to it.
45009+ */
45010+static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
45011+{
45012+#if REISER4_DEBUG
45013+ const coord_t *coord;
45014+ const struct extent_coord_extension *ext_coord;
45015+ reiser4_extent *ext;
45016+
45017+ coord = &uf_coord->coord;
45018+ ext_coord = &uf_coord->extension.extent;
45019+ ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
45020+
45021+ assert("",
45022+ WITH_DATA(coord->node,
45023+ (uf_coord->valid == 1 &&
45024+ coord_is_iplug_set(coord) &&
45025+ item_is_extent(coord) &&
45026+ ext_coord->nr_units == nr_units_extent(coord) &&
45027+ ext == extent_by_coord(coord) &&
45028+ ext_coord->width == extent_get_width(ext) &&
45029+ coord->unit_pos < ext_coord->nr_units &&
45030+ ext_coord->pos_in_unit < ext_coord->width &&
45031+ memcmp(ext, &ext_coord->extent,
45032+ sizeof(reiser4_extent)) == 0)));
45033+ if (key) {
45034+ reiser4_key coord_key;
45035+
45036+ unit_key_by_coord(&uf_coord->coord, &coord_key);
45037+ set_key_offset(&coord_key,
45038+ get_key_offset(&coord_key) +
45039+ (uf_coord->extension.extent.
45040+ pos_in_unit << PAGE_CACHE_SHIFT));
45041+ assert("", keyeq(key, &coord_key));
45042+ }
45043+#endif
45044+}
45045+
45046+static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
45047+{
45048+ check_uf_coord(uf_coord, NULL);
45049+
45050+ return ext_by_offset(uf_coord->coord.node,
45051+ uf_coord->extension.extent.ext_offset);
45052+}
45053+
45054+#if REISER4_DEBUG
45055+
45056+/**
45057+ * offset_is_in_unit
45058+ *
45059+ * Return 1 if offset @off is inside of the extent unit pointed to by
45060+ * @coord; set pos_in_unit inside of the unit correspondingly.
45061+ */
45064+static int offset_is_in_unit(const coord_t *coord, loff_t off)
45065+{
45066+ reiser4_key unit_key;
45067+ __u64 unit_off;
45068+ reiser4_extent *ext;
45069+
45070+ ext = extent_by_coord(coord);
45071+
45072+ unit_key_extent(coord, &unit_key);
45073+ unit_off = get_key_offset(&unit_key);
45074+ if (off < unit_off)
45075+ return 0;
45076+ if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
45077+ return 0;
45078+ return 1;
45079+}
45080+
45081+static int
45082+coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
45083+{
45084+ reiser4_key item_key;
45085+
45086+ assert("vs-771", coord_is_existing_unit(coord));
45087+ assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
45088+ assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
45089+
45090+ return offset_is_in_unit(coord, get_key_offset(key));
45091+}
45092+
45093+#endif
45094+
45095+/**
45096+ * can_append -
45097+ * @key:
45098+ * @coord:
45099+ *
45100+ * Returns 1 if @key is equal to the append key of the item @coord is set to
45101+ */
45102+static int can_append(const reiser4_key *key, const coord_t *coord)
45103+{
45104+ reiser4_key append_key;
45105+
45106+ return keyeq(key, append_key_extent(coord, &append_key));
45107+}
45108+
45109+/**
45110+ * append_hole
45111+ * @coord:
45112+ * @lh:
45113+ * @key:
45114+ *
45115+ */
45116+static int append_hole(coord_t *coord, lock_handle *lh,
45117+ const reiser4_key *key)
45118+{
45119+ reiser4_key append_key;
45120+ reiser4_block_nr hole_width;
45121+ reiser4_extent *ext, new_ext;
45122+ reiser4_item_data idata;
45123+
45124+ /* last item of file may have to be appended with hole */
45125+ assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
45126+ assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
45127+
45128+ /* key of first byte which is not addressed by this extent */
45129+ append_key_extent(coord, &append_key);
45130+
45131+ assert("", keyle(&append_key, key));
45132+
45133+ /*
45134+	 * the extent item has to be appended with a hole; calculate the
45135+	 * length of that hole
45136+ */
45137+ hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
45138+ current_blocksize - 1) >> current_blocksize_bits);
45139+ assert("vs-954", hole_width > 0);
45140+
45141+ /* set coord after last unit */
45142+ coord_init_after_item_end(coord);
45143+
45144+ /* get last extent in the item */
45145+ ext = extent_by_coord(coord);
45146+ if (state_of_extent(ext) == HOLE_EXTENT) {
45147+ /*
45148+		 * the last extent of the file is a hole extent. Widen it by
45149+		 * @hole_width blocks. Note that we do not worry about
45150+		 * overflow: the extent width is 64 bits
45151+ */
45152+ reiser4_set_extent(ext, HOLE_EXTENT_START,
45153+ extent_get_width(ext) + hole_width);
45154+ znode_make_dirty(coord->node);
45155+ return 0;
45156+ }
45157+
45158+ /* append last item of the file with hole extent unit */
45159+ assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
45160+ state_of_extent(ext) == UNALLOCATED_EXTENT));
45161+
45162+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
45163+ init_new_extent(&idata, &new_ext, 1);
45164+ return insert_into_item(coord, lh, &append_key, &idata, 0);
45165+}
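/* Editor's worked example of the hole-width arithmetic above, assuming
 * 4096-byte blocks (current_blocksize_bits == 12): appending at key
 * offset 20480 when the item's append key is 8192 gives
 * hole_width = (20480 - 8192 + 4095) >> 12 = 3 blocks.
 */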
45166+
45167+/**
45168+ * check_jnodes
45169+ * @twig: longterm locked twig node
45170+ * @key:
45171+ *
45172+ */
45173+static void check_jnodes(znode *twig, const reiser4_key *key, int count)
45174+{
45175+#if REISER4_DEBUG
45176+ coord_t c;
45177+ reiser4_key node_key, jnode_key;
45178+
45179+ jnode_key = *key;
45180+
45181+ assert("", twig != NULL);
45182+ assert("", znode_get_level(twig) == TWIG_LEVEL);
45183+ assert("", znode_is_write_locked(twig));
45184+
45185+ zload(twig);
45186+ /* get the smallest key in twig node */
45187+ coord_init_first_unit(&c, twig);
45188+ unit_key_by_coord(&c, &node_key);
45189+ assert("", keyle(&node_key, &jnode_key));
45190+
45191+ coord_init_last_unit(&c, twig);
45192+ unit_key_by_coord(&c, &node_key);
45193+ if (item_plugin_by_coord(&c)->s.file.append_key)
45194+ item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
45195+ set_key_offset(&jnode_key,
45196+ get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
45197+ assert("", keylt(&jnode_key, &node_key));
45198+ zrelse(twig);
45199+#endif
45200+}
45201+
45202+/**
45203+ * append_last_extent - append last file item
45204+ * @uf_coord: coord to start insertion from
45205+ * @jnodes: array of jnodes
45206+ * @count: number of jnodes in the array
45207+ *
45208+ * There is already at least one extent item of the file in the tree. Append
45209+ * the last of them with an unallocated extent unit of width @count. Assign
45210+ * fake block numbers to the jnodes corresponding to the inserted extent.
45211+ */
45212+static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45213+ jnode **jnodes, int count)
45214+{
45215+ int result;
45216+ reiser4_extent new_ext;
45217+ reiser4_item_data idata;
45218+ coord_t *coord;
45219+ struct extent_coord_extension *ext_coord;
45220+ reiser4_extent *ext;
45221+ reiser4_block_nr block;
45222+ jnode *node;
45223+ int i;
45224+
45225+ coord = &uf_coord->coord;
45226+ ext_coord = &uf_coord->extension.extent;
45227+ ext = ext_by_ext_coord(uf_coord);
45228+
45229+ /* check correctness of position in the item */
45230+ assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
45231+ assert("vs-1311", coord->between == AFTER_UNIT);
45232+ assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
45233+
45234+ if (!can_append(key, coord)) {
45235+ /* hole extent has to be inserted */
45236+ result = append_hole(coord, uf_coord->lh, key);
45237+ uf_coord->valid = 0;
45238+ return result;
45239+ }
45240+
45241+ if (count == 0)
45242+ return 0;
45243+
45244+ assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
45245+
45246+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
45247+ count);
45248+ BUG_ON(result != 0);
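+	/*
+	 * charge quota for @count new blocks. A failure here would mean
+	 * quota exhaustion, which this code does not handle yet - hence
+	 * the BUG_ON above
+	 */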
45249+
45250+ switch (state_of_extent(ext)) {
45251+ case UNALLOCATED_EXTENT:
45252+ /*
45253+ * last extent unit of the file is unallocated one. Increase
45254+ * its width by @count
45255+ */
45256+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START,
45257+ extent_get_width(ext) + count);
45258+ znode_make_dirty(coord->node);
45259+
45260+ /* update coord extension */
45261+ ext_coord->width += count;
45262+ ON_DEBUG(extent_set_width
45263+ (&uf_coord->extension.extent.extent,
45264+ ext_coord->width));
45265+ break;
45266+
45267+ case HOLE_EXTENT:
45268+ case ALLOCATED_EXTENT:
45269+ /*
45270+ * last extent unit of the file is either hole or allocated
45271+ * one. Append one unallocated extent of width @count
45272+ */
45273+ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
45274+ init_new_extent(&idata, &new_ext, 1);
45275+ result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
45276+ uf_coord->valid = 0;
45277+ if (result)
45278+ return result;
45279+ break;
45280+
45281+ default:
45282+ return RETERR(-EIO);
45283+ }
45284+
45285+ /*
45286+ * make sure that we hold long term locked twig node containing all
45287+ * jnodes we are about to capture
45288+ */
45289+ check_jnodes(uf_coord->lh->node, key, count);
45290+
45291+ /*
45292+ * assign fake block numbers to all jnodes. FIXME: make sure whether
45293+ * twig node containing inserted extent item is locked
45294+ */
45295+ block = fake_blocknr_unformatted(count);
45296+ for (i = 0; i < count; i ++, block ++) {
45297+ node = jnodes[i];
45298+ spin_lock_jnode(node);
45299+ JF_SET(node, JNODE_CREATED);
45300+ jnode_set_block(node, &block);
45301+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45302+ BUG_ON(result != 0);
45303+ jnode_make_dirty_locked(node);
45304+ spin_unlock_jnode(node);
45305+ }
45306+ return count;
45307+}
45308+
45309+/**
45310+ * insert_first_hole - insert hole extent into tree
45311+ * @coord: coord where the new item is to be inserted
45312+ * @lh: lock handle of the node @coord is set to
45313+ * @key: key of the first byte of data to be written
45314+ *
45315+ * Inserts a hole extent covering the file from offset 0 up to @key.
45316+ */
45317+static int insert_first_hole(coord_t *coord, lock_handle *lh,
45318+ const reiser4_key *key)
45319+{
45320+ reiser4_extent new_ext;
45321+ reiser4_item_data idata;
45322+ reiser4_key item_key;
45323+ reiser4_block_nr hole_width;
45324+
45325+ /* @coord must be set for inserting of new item */
45326+ assert("vs-711", coord_is_between_items(coord));
45327+
45328+ item_key = *key;
45329+ set_key_offset(&item_key, 0ull);
45330+
45331+ hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
45332+ current_blocksize_bits);
45333+ assert("vs-710", hole_width > 0);
45334+
45335+ /* compose body of hole extent and insert item into tree */
45336+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
45337+ init_new_extent(&idata, &new_ext, 1);
45338+ return insert_extent_by_coord(coord, &idata, &item_key, lh);
45339+}
45340+
45341+
45342+/**
45343+ * insert_first_extent - insert first file item
45344+ * @uf_coord: coord to start insertion from
45345+ * @key: key of the first byte of data to be written
45346+ * @jnodes: array of jnodes
45347+ * @count: number of jnodes in the array
45348+ * @inode: inode of file
45349+ *
45350+ * There are no items of file @inode in the tree yet. Insert an unallocated
45351+ * extent of width @count, or a hole extent if the write does not start at
45352+ * the beginning of the file. Assign fake block numbers to jnodes covered by
45353+ * the inserted unallocated extent. Returns number of jnodes or error code.
45354+ */
45355+static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45356+ jnode **jnodes, int count,
45357+ struct inode *inode)
45358+{
45359+ int result;
45360+ int i;
45361+ reiser4_extent new_ext;
45362+ reiser4_item_data idata;
45363+ reiser4_block_nr block;
45364+ struct unix_file_info *uf_info;
45365+ jnode *node;
45366+
45367+ /* first extent insertion starts at leaf level */
45368+ assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
45369+ assert("vs-711", coord_is_between_items(&uf_coord->coord));
45370+
45371+ if (get_key_offset(key) != 0) {
45372+ result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
45373+ uf_coord->valid = 0;
45374+ uf_info = unix_file_inode_data(inode);
45375+
45376+ /*
45377+ * first item insertion is only possible when writing to empty
45378+ * file or performing tail conversion
45379+ */
45380+ assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
45381+ (reiser4_inode_get_flag(inode,
45382+ REISER4_PART_MIXED) &&
45383+ reiser4_inode_get_flag(inode,
45384+ REISER4_PART_IN_CONV))));
45385+ /* if file was empty - update its state */
45386+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
45387+ uf_info->container = UF_CONTAINER_EXTENTS;
45388+ return result;
45389+ }
45390+
45391+ if (count == 0)
45392+ return 0;
45393+
45394+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
45395+ BUG_ON(result != 0);
45396+
45397+ /*
45398+ * prepare for tree modification: compose body of item and item data
45399+ * structure needed for insertion
45400+ */
45401+ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
45402+ init_new_extent(&idata, &new_ext, 1);
45403+
45404+ /* insert extent item into the tree */
45405+ result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
45406+ uf_coord->lh);
45407+ if (result)
45408+ return result;
45409+
45410+ /*
45411+ * make sure that we hold long term locked twig node containing all
45412+ * jnodes we are about to capture
45413+ */
45414+ check_jnodes(uf_coord->lh->node, key, count);
45415+ /*
45416+ * assign fake block numbers to all jnodes, capture and mark them dirty
45417+ */
45418+ block = fake_blocknr_unformatted(count);
45419+ for (i = 0; i < count; i ++, block ++) {
45420+ node = jnodes[i];
45421+ spin_lock_jnode(node);
45422+ JF_SET(node, JNODE_CREATED);
45423+ jnode_set_block(node, &block);
45424+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45425+ BUG_ON(result != 0);
45426+ jnode_make_dirty_locked(node);
45427+ spin_unlock_jnode(node);
45428+ }
45429+
45430+ /*
45431+ * invalidate coordinate, research must be performed to continue
45432+ * because write will continue on twig level
45433+ */
45434+ uf_coord->valid = 0;
45435+ return count;
45436+}
45437+
45438+/**
45439+ * plug_hole - replace part of a hole extent with an unallocated extent
45440+ * @uf_coord: coord of the hole unit and the position within it
45441+ * @key: key of the block to be plugged
45442+ * @how: set to a code (1-6) identifying how the hole was plugged
45443+ *
45444+ * Creates an unallocated extent of width 1 within a hole. In the worst case
45445+ * two additional extent units can be created.
45447+ */
45448+static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
45449+{
45450+ struct replace_handle rh;
45451+ reiser4_extent *ext;
45452+ reiser4_block_nr width, pos_in_unit;
45453+ coord_t *coord;
45454+ struct extent_coord_extension *ext_coord;
45455+ int return_inserted_position;
45456+
45457+ check_uf_coord(uf_coord, key);
45458+
45459+ rh.coord = coord_by_uf_coord(uf_coord);
45460+ rh.lh = uf_coord->lh;
45461+ rh.flags = 0;
45462+
45463+ coord = coord_by_uf_coord(uf_coord);
45464+ ext_coord = ext_coord_by_uf_coord(uf_coord);
45465+ ext = ext_by_ext_coord(uf_coord);
45466+
45467+ width = ext_coord->width;
45468+ pos_in_unit = ext_coord->pos_in_unit;
45469+
45470+ *how = 0;
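+	/*
+	 * the cases below set @how as follows: 1 - hole unit of width 1
+	 * converted in place; 2 - first block merged into the unallocated
+	 * extent on the left; 3 - first block of a wider hole replaced;
+	 * 4 - last block merged into the unallocated extent on the right;
+	 * 5 - last block of a wider hole replaced; 6 - hole split in three
+	 */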
45471+ if (width == 1) {
45472+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1);
45473+ znode_make_dirty(coord->node);
45474+ /* update uf_coord */
45475+ ON_DEBUG(ext_coord->extent = *ext);
45476+ *how = 1;
45477+ return 0;
45478+ } else if (pos_in_unit == 0) {
45479+ /* we deal with first element of extent */
45480+ if (coord->unit_pos) {
45481+ /* there is an extent to the left */
45482+ if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
45483+ /*
45484+ * left neighboring unit is an unallocated
45485+ * extent. Increase its width and decrease
45486+ * width of hole
45487+ */
45488+ extent_set_width(ext - 1,
45489+ extent_get_width(ext - 1) + 1);
45490+ extent_set_width(ext, width - 1);
45491+ znode_make_dirty(coord->node);
45492+
45493+ /* update coord extension */
45494+ coord->unit_pos--;
45495+ ext_coord->width = extent_get_width(ext - 1);
45496+ ext_coord->pos_in_unit = ext_coord->width - 1;
45497+ ext_coord->ext_offset -= sizeof(reiser4_extent);
45498+ ON_DEBUG(ext_coord->extent =
45499+ *extent_by_coord(coord));
45500+ *how = 2;
45501+ return 0;
45502+ }
45503+ }
45504+ /* extent for replace */
45505+ reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
45506+ /* extent to be inserted */
45507+ reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START,
45508+ width - 1);
45509+ rh.nr_new_extents = 1;
45510+
45511+ /* have reiser4_replace_extent to return with @coord and
45512+ @uf_coord->lh set to unit which was replaced */
45513+ return_inserted_position = 0;
45514+ *how = 3;
45515+ } else if (pos_in_unit == width - 1) {
45516+ /* we deal with last element of extent */
45517+ if (coord->unit_pos < nr_units_extent(coord) - 1) {
45518+ /* there is an extent unit to the right */
45519+ if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
45520+ /*
45521+ * right neighboring unit is an unallocated
45522+ * extent. Increase its width and decrease
45523+ * width of hole
45524+ */
45525+ extent_set_width(ext + 1,
45526+ extent_get_width(ext + 1) + 1);
45527+ extent_set_width(ext, width - 1);
45528+ znode_make_dirty(coord->node);
45529+
45530+ /* update coord extension */
45531+ coord->unit_pos++;
45532+ ext_coord->width = extent_get_width(ext + 1);
45533+ ext_coord->pos_in_unit = 0;
45534+ ext_coord->ext_offset += sizeof(reiser4_extent);
45535+ ON_DEBUG(ext_coord->extent =
45536+ *extent_by_coord(coord));
45537+ *how = 4;
45538+ return 0;
45539+ }
45540+ }
45541+ /* extent for replace */
45542+ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
45543+ /* extent to be inserted */
45544+ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
45545+ 1);
45546+ rh.nr_new_extents = 1;
45547+
45548+ /* have reiser4_replace_extent to return with @coord and
45549+ @uf_coord->lh set to unit which was inserted */
45550+ return_inserted_position = 1;
45551+ *how = 5;
45552+ } else {
45553+ /* extent for replace */
45554+ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START,
45555+ pos_in_unit);
45556+ /* extents to be inserted */
45557+ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
45558+ 1);
45559+ reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
45560+ width - pos_in_unit - 1);
45561+ rh.nr_new_extents = 2;
45562+
45563+ /* have reiser4_replace_extent to return with @coord and
45564+ @uf_coord->lh set to first of units which were inserted */
45565+ return_inserted_position = 1;
45566+ *how = 6;
45567+ }
45568+ unit_key_by_coord(coord, &rh.paste_key);
45569+ set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
45570+ extent_get_width(&rh.overwrite) * current_blocksize);
45571+
45572+ uf_coord->valid = 0;
45573+ return reiser4_replace_extent(&rh, return_inserted_position);
45574+}
45575+
45576+/**
45577+ * overwrite_one_block - assign a block number to one jnode
45578+ * @uf_coord: coord of the extent unit addressing @node
45579+ * @key: key of the block @node corresponds to
45580+ * @node: jnode to be given a block number
45581+ * @hole_plugged: optional flag, set to 1 if a hole extent was converted
45582+ *
45583+ * If @node corresponds to a hole extent, create an unallocated extent for it
45584+ * and assign a fake block number. Otherwise assign the allocated extent's block number to the jnode.
45585+ */
45586+static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
45587+ jnode *node, int *hole_plugged)
45588+{
45589+ int result;
45590+ struct extent_coord_extension *ext_coord;
45591+ reiser4_extent *ext;
45592+ reiser4_block_nr block;
45593+ int how;
45594+
45595+ assert("vs-1312", uf_coord->coord.between == AT_UNIT);
45596+
45597+ result = 0;
45598+ ext_coord = ext_coord_by_uf_coord(uf_coord);
45599+ ext = ext_by_ext_coord(uf_coord);
45600+ assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
45601+
45602+ switch (state_of_extent(ext)) {
45603+ case ALLOCATED_EXTENT:
45604+ block = extent_get_start(ext) + ext_coord->pos_in_unit;
45605+ break;
45606+
45607+ case HOLE_EXTENT:
45608+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1);
45609+ BUG_ON(result != 0);
45610+ result = plug_hole(uf_coord, key, &how);
45611+ if (result)
45612+ return result;
45613+ block = fake_blocknr_unformatted(1);
45614+ if (hole_plugged)
45615+ *hole_plugged = 1;
45616+ JF_SET(node, JNODE_CREATED);
45617+ break;
45618+
45619+ default:
45620+ return RETERR(-EIO);
45621+ }
45622+
45623+ jnode_set_block(node, &block);
45624+ return 0;
45625+}
45626+
45627+/**
45628+ * move_coord - move coordinate forward
45629+ * @uf_coord:
45630+ *
45631+ * Move coordinate one data block pointer forward. Return 1 if coord is set to
45632+ * the last one already or is invalid.
45633+ */
45634+static int move_coord(uf_coord_t *uf_coord)
45635+{
45636+ struct extent_coord_extension *ext_coord;
45637+
45638+ if (uf_coord->valid == 0)
45639+ return 1;
45640+ ext_coord = &uf_coord->extension.extent;
45641+ ext_coord->pos_in_unit ++;
45642+ if (ext_coord->pos_in_unit < ext_coord->width)
45643+ /* coordinate moved within the unit */
45644+ return 0;
45645+
45646+ /* end of unit is reached. Try to move to next unit */
45647+ ext_coord->pos_in_unit = 0;
45648+ uf_coord->coord.unit_pos ++;
45649+ if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
45650+ /* coordinate moved to next unit */
45651+ ext_coord->ext_offset += sizeof(reiser4_extent);
45652+ ext_coord->width =
45653+ extent_get_width(ext_by_offset
45654+ (uf_coord->coord.node,
45655+ ext_coord->ext_offset));
45656+ ON_DEBUG(ext_coord->extent =
45657+ *ext_by_offset(uf_coord->coord.node,
45658+ ext_coord->ext_offset));
45659+ return 0;
45660+ }
45661+ /* end of item is reached */
45662+ uf_coord->valid = 0;
45663+ return 1;
45664+}
45665+
45666+/**
45667+ * overwrite_extent - assign block numbers to jnodes of existing extents
45668+ * @uf_coord: coord of the unit addressing the first jnode; @key: its key
45669+ * @jnodes: array of @count jnodes; @plugged_hole: set if a hole was plugged
45670+ * Returns number of handled jnodes.
45671+ */
45672+static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45673+ jnode **jnodes, int count, int *plugged_hole)
45674+{
45675+ int result;
45676+ reiser4_key k;
45677+ int i;
45678+ jnode *node;
45679+
45680+ k = *key;
45681+ for (i = 0; i < count; i ++) {
45682+ node = jnodes[i];
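+		/*
+		 * block number 0 means this jnode has no disk location
+		 * assigned yet, so the extent unit under @uf_coord has to
+		 * be adjusted first
+		 */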
45683+ if (*jnode_get_block(node) == 0) {
45684+ result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
45685+ if (result)
45686+ return result;
45687+ }
45688+ /*
45689+ * make sure that we hold long term locked twig node containing
45690+ * all jnodes we are about to capture
45691+ */
45692+ check_jnodes(uf_coord->lh->node, &k, 1);
45693+ /*
45694+ * assign fake block numbers to all jnodes, capture and mark
45695+ * them dirty
45696+ */
45697+ spin_lock_jnode(node);
45698+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45699+ BUG_ON(result != 0);
45700+ jnode_make_dirty_locked(node);
45701+ spin_unlock_jnode(node);
45702+
45703+ if (uf_coord->valid == 0)
45704+ return i + 1;
45705+
45706+ check_uf_coord(uf_coord, &k);
45707+
45708+ if (move_coord(uf_coord)) {
45709+ /*
45710+ * failed to move to the next node pointer. Either end
45711+			 * of file or end of twig node is reached. In the latter
45712+			 * case we might go to the right neighbor.
45713+ */
45714+ uf_coord->valid = 0;
45715+ return i + 1;
45716+ }
45717+ set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
45718+ }
45719+
45720+ return count;
45721+}
45722+
45723+/**
45724+ * reiser4_update_extent
45725+ * @inode: inode the extent belongs to
45726+ * @node: jnode to be mapped into the extent tree
45727+ * @pos: offset in the file @node corresponds to
45728+ * @plugged_hole: optional flag, set if a hole extent was plugged
45729+ *
45730+ */
45731+int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos,
45732+ int *plugged_hole)
45733+{
45734+ int result;
45735+ znode *loaded;
45736+ uf_coord_t uf_coord;
45737+ coord_t *coord;
45738+ lock_handle lh;
45739+ reiser4_key key;
45740+
45741+ assert("", reiser4_lock_counters()->d_refs == 0);
45742+
45743+ key_by_inode_and_offset_common(inode, pos, &key);
45744+
45745+ init_uf_coord(&uf_coord, &lh);
45746+ coord = &uf_coord.coord;
45747+ result = find_file_item_nohint(coord, &lh, &key,
45748+ ZNODE_WRITE_LOCK, inode);
45749+ if (IS_CBKERR(result)) {
45750+ assert("", reiser4_lock_counters()->d_refs == 0);
45751+ return result;
45752+ }
45753+
45754+ result = zload(coord->node);
45755+ BUG_ON(result != 0);
45756+ loaded = coord->node;
45757+
45758+ if (coord->between == AFTER_UNIT) {
45759+ /*
45760+ * append existing extent item with unallocated extent of width
45761+ * nr_jnodes
45762+ */
45763+ init_coord_extension_extent(&uf_coord,
45764+ get_key_offset(&key));
45765+ result = append_last_extent(&uf_coord, &key,
45766+ &node, 1);
45767+ } else if (coord->between == AT_UNIT) {
45768+ /*
45769+ * overwrite
45770+ * not optimal yet. Will be optimized if new write will show
45771+ * performance win.
45772+ */
45773+ init_coord_extension_extent(&uf_coord,
45774+ get_key_offset(&key));
45775+ result = overwrite_extent(&uf_coord, &key,
45776+ &node, 1, plugged_hole);
45777+ } else {
45778+ /*
45779+ * there are no items of this file in the tree yet. Create
45780+ * first item of the file inserting one unallocated extent of
45781+ * width nr_jnodes
45782+ */
45783+ result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
45784+ }
45785+ assert("", result == 1 || result < 0);
45786+ zrelse(loaded);
45787+ done_lh(&lh);
45788+ assert("", reiser4_lock_counters()->d_refs == 0);
45789+ return (result == 1) ? 0 : result;
45790+}
45791+
45792+/**
45793+ * update_extents
45794+ * @file: file being written; @inode: its inode
45795+ * @jnodes: array of jnodes to be mapped into the extent tree
45796+ * @count: number of jnodes in the array (0 means expanding truncate)
45797+ * @pos: offset in the file to start at
45798+ *
45799+ */
45800+static int update_extents(struct file *file, struct inode *inode,
45801+ jnode **jnodes, int count, loff_t pos)
45802+{
45803+ struct hint hint;
45804+ reiser4_key key;
45805+ int result;
45806+ znode *loaded;
45807+
45808+ result = load_file_hint(file, &hint);
45809+ BUG_ON(result != 0);
45810+
45811+ if (count != 0)
45812+ /*
45813+ * count == 0 is special case: expanding truncate
45814+ */
45815+ pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
45816+ key_by_inode_and_offset_common(inode, pos, &key);
45817+
45818+ assert("", reiser4_lock_counters()->d_refs == 0);
45819+
45820+ do {
45821+ result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
45822+ if (IS_CBKERR(result)) {
45823+ assert("", reiser4_lock_counters()->d_refs == 0);
45824+ return result;
45825+ }
45826+
45827+ result = zload(hint.ext_coord.coord.node);
45828+ BUG_ON(result != 0);
45829+ loaded = hint.ext_coord.coord.node;
45830+
45831+ if (hint.ext_coord.coord.between == AFTER_UNIT) {
45832+ /*
45833+ * append existing extent item with unallocated extent
45834+ * of width nr_jnodes
45835+ */
45836+ if (hint.ext_coord.valid == 0)
45837+ /* NOTE: get statistics on this */
45838+ init_coord_extension_extent(&hint.ext_coord,
45839+ get_key_offset(&key));
45840+ result = append_last_extent(&hint.ext_coord, &key,
45841+ jnodes, count);
45842+ } else if (hint.ext_coord.coord.between == AT_UNIT) {
45843+ /*
45844+ * overwrite
45845+ * not optimal yet. Will be optimized if new write will
45846+ * show performance win.
45847+ */
45848+ if (hint.ext_coord.valid == 0)
45849+ /* NOTE: get statistics on this */
45850+ init_coord_extension_extent(&hint.ext_coord,
45851+ get_key_offset(&key));
45852+ result = overwrite_extent(&hint.ext_coord, &key,
45853+ jnodes, count, NULL);
45854+ } else {
45855+ /*
45856+ * there are no items of this file in the tree
45857+ * yet. Create first item of the file inserting one
45858+			 * unallocated extent of width nr_jnodes
45859+ */
45860+ result = insert_first_extent(&hint.ext_coord, &key,
45861+ jnodes, count, inode);
45862+ }
45863+ zrelse(loaded);
45864+ if (result < 0) {
45865+ done_lh(hint.ext_coord.lh);
45866+ break;
45867+ }
45868+
45869+ jnodes += result;
45870+ count -= result;
45871+ set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
45872+
45873+ /* seal and unlock znode */
45874+ if (hint.ext_coord.valid)
45875+ reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK);
45876+ else
45877+ reiser4_unset_hint(&hint);
45878+
45879+ } while (count > 0);
45880+
45881+ save_file_hint(file, &hint);
45882+ assert("", reiser4_lock_counters()->d_refs == 0);
45883+ return result;
45884+}
45885+
45886+/**
45887+ * write_extent_reserve_space - reserve space for extent write operation
45888+ * @inode:
45889+ *
45890+ * Estimates and reserves space which may be required for writing
45891+ * WRITE_GRANULARITY pages of file.
45892+ */
45893+static int write_extent_reserve_space(struct inode *inode)
45894+{
45895+ __u64 count;
45896+ reiser4_tree *tree;
45897+
45898+ /*
45899+ * to write WRITE_GRANULARITY pages to a file by extents we have to
45900+ * reserve disk space for:
45901+
45902+	 * 1. find_file_item may have to insert an empty node into the tree (an
45903+	 * empty leaf node between two extent items). This requires 1 block plus
45904+	 * the number of blocks necessary to insert an internal item at the
45905+	 * twig level.
45906+
45907+	 * 2. for each written page, 1 block may be needed, plus the number of
45908+	 * blocks necessary to perform an insertion of, or a paste into, an
45909+	 * extent item.
45910+
45911+ * 3. stat data update
45912+ */
45913+ tree = reiser4_tree_by_inode(inode);
45914+ count = estimate_one_insert_item(tree) +
45915+ WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
45916+ estimate_one_insert_item(tree);
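+	/*
+	 * i.e. one insert-item estimate for the possible empty leaf, a
+	 * per-page term for each of WRITE_GRANULARITY pages, and one more
+	 * insert-item estimate for the stat data update
+	 */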
45917+ grab_space_enable();
45918+ return reiser4_grab_space(count, 0 /* flags */);
45919+}
45920+
45921+/*
45922+ * filemap_copy_from_user no longer exists in generic code, because it
45923+ * is deadlock-prone (copying from user while holding the page lock is bad).
45924+ * As a temporary fix for reiser4, just define it here.
45925+ */
45926+static inline size_t
45927+filemap_copy_from_user(struct page *page, unsigned long offset,
45928+ const char __user *buf, unsigned bytes)
45929+{
45930+ char *kaddr;
45931+ int left;
45932+
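+	/*
+	 * fast path: copy with the page mapped atomically. Page faults are
+	 * disabled under kmap_atomic, so the copy may come up short if the
+	 * user buffer is not resident; fall back to the sleeping variant then
+	 */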
45933+ kaddr = kmap_atomic(page, KM_USER0);
45934+ left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
45935+ kunmap_atomic(kaddr, KM_USER0);
45936+
45937+ if (left != 0) {
45938+ /* Do it the slow way */
45939+ kaddr = kmap(page);
45940+ left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
45941+ kunmap(page);
45942+ }
45943+ return bytes - left;
45944+}
45945+
45946+/**
45947+ * reiser4_write_extent - write method of extent item plugin
45948+ * @file: file to write to
45949+ * @buf: address of user-space buffer
45950+ * @count: number of bytes to write
45951+ * @pos: position in file to write to
45952+ *
45953+ */
45954+ssize_t reiser4_write_extent(struct file *file, struct inode * inode,
45955+ const char __user *buf, size_t count, loff_t *pos)
45956+{
45957+ int have_to_update_extent;
45958+ int nr_pages, nr_dirty;
45959+ struct page *page;
45960+ jnode *jnodes[WRITE_GRANULARITY + 1];
45961+ unsigned long index;
45962+ unsigned long end;
45963+ int i;
45964+ int to_page, page_off;
45965+ size_t left, written;
45966+ int result = 0;
45967+
45968+ if (write_extent_reserve_space(inode))
45969+ return RETERR(-ENOSPC);
45970+
45971+ if (count == 0) {
45972+ /* truncate case */
45973+ update_extents(file, inode, jnodes, 0, *pos);
45974+ return 0;
45975+ }
45976+
45977+ BUG_ON(get_current_context()->trans->atom != NULL);
45978+
45979+ left = count;
45980+ index = *pos >> PAGE_CACHE_SHIFT;
45981+ /* calculate number of pages which are to be written */
45982+ end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
45983+ nr_pages = end - index + 1;
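+	/*
+	 * e.g. with 4 KiB pages, *pos == 1000 and count == 10000: index == 0,
+	 * end == 2, so nr_pages == 3 partially or fully written pages
+	 */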
45984+ nr_dirty = 0;
45985+ assert("", nr_pages <= WRITE_GRANULARITY + 1);
45986+
45987+ /* get pages and jnodes */
45988+ for (i = 0; i < nr_pages; i ++) {
45989+ page = find_or_create_page(inode->i_mapping, index + i,
45990+ reiser4_ctx_gfp_mask_get());
45991+ if (page == NULL) {
45992+ nr_pages = i;
45993+ result = RETERR(-ENOMEM);
45994+ goto out;
45995+ }
45996+
45997+ jnodes[i] = jnode_of_page(page);
45998+ if (IS_ERR(jnodes[i])) {
45999+ unlock_page(page);
46000+ page_cache_release(page);
46001+ nr_pages = i;
46002+ result = RETERR(-ENOMEM);
46003+ goto out;
46004+ }
46005+ /* prevent jnode and page from disconnecting */
46006+ JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
46007+ unlock_page(page);
46008+ }
46009+
46010+ BUG_ON(get_current_context()->trans->atom != NULL);
46011+
46012+ have_to_update_extent = 0;
46013+
46014+ page_off = (*pos & (PAGE_CACHE_SIZE - 1));
46015+ for (i = 0; i < nr_pages; i ++) {
46016+ to_page = PAGE_CACHE_SIZE - page_off;
46017+ if (to_page > left)
46018+ to_page = left;
46019+ page = jnode_page(jnodes[i]);
46020+ if (page_offset(page) < inode->i_size &&
46021+ !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
46022+ /*
46023+			 * the above is not optimal for a partial write to the
46024+			 * last page of the file when the file size is not at a
46025+			 * page boundary
46026+ */
46027+ lock_page(page);
46028+ if (!PageUptodate(page)) {
46029+ result = readpage_unix_file(NULL, page);
46030+ BUG_ON(result != 0);
46031+ /* wait for read completion */
46032+ lock_page(page);
46033+ BUG_ON(!PageUptodate(page));
46034+ } else
46035+ result = 0;
46036+ unlock_page(page);
46037+ }
46038+
46039+ BUG_ON(get_current_context()->trans->atom != NULL);
46040+ fault_in_pages_readable(buf, to_page);
46041+ BUG_ON(get_current_context()->trans->atom != NULL);
46042+
46043+ lock_page(page);
46044+ if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE)
46045+ simple_prepare_write(file, page, page_off,
46046+ page_off + to_page);
46047+
46048+ written = filemap_copy_from_user(page, page_off, buf, to_page);
46049+ if (unlikely(written != to_page)) {
46050+ unlock_page(page);
46051+ result = RETERR(-EFAULT);
46052+ break;
46053+ }
46054+
46055+ flush_dcache_page(page);
46056+ reiser4_set_page_dirty_internal(page);
46057+ unlock_page(page);
46058+ nr_dirty++;
46059+
46060+ mark_page_accessed(page);
46061+ SetPageUptodate(page);
46062+
46063+ if (jnodes[i]->blocknr == 0)
46064+ have_to_update_extent ++;
46065+
46066+ page_off = 0;
46067+ buf += to_page;
46068+ left -= to_page;
46069+ BUG_ON(get_current_context()->trans->atom != NULL);
46070+ }
46071+
46072+ if (have_to_update_extent) {
46073+ update_extents(file, inode, jnodes, nr_dirty, *pos);
46074+ } else {
46075+ for (i = 0; i < nr_dirty; i ++) {
46076+ int ret;
46077+ spin_lock_jnode(jnodes[i]);
46078+ ret = reiser4_try_capture(jnodes[i],
46079+ ZNODE_WRITE_LOCK, 0);
46080+ BUG_ON(ret != 0);
46081+ jnode_make_dirty_locked(jnodes[i]);
46082+ spin_unlock_jnode(jnodes[i]);
46083+ }
46084+ }
46085+out:
46086+ for (i = 0; i < nr_pages; i ++) {
46087+ page_cache_release(jnode_page(jnodes[i]));
46088+ JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
46089+ jput(jnodes[i]);
46090+ }
46091+
46092+	/* the only errors handled so far are ENOMEM and
46093+	   EFAULT on copy_from_user */
46094+
46095+ return (count - left) ? (count - left) : result;
46096+}
46097+
46098+int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
46099+ struct page *page)
46100+{
46101+ jnode *j;
46102+ struct address_space *mapping;
46103+ unsigned long index;
46104+ oid_t oid;
46105+ reiser4_block_nr block;
46106+
46107+ mapping = page->mapping;
46108+ oid = get_inode_oid(mapping->host);
46109+ index = page->index;
46110+
46111+ switch (state_of_extent(ext)) {
46112+ case HOLE_EXTENT:
46113+ /*
46114+		 * it is possible to have a hole page with a jnode, if the page was
46115+ * eflushed previously.
46116+ */
46117+ j = jfind(mapping, index);
46118+ if (j == NULL) {
46119+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
46120+ SetPageUptodate(page);
46121+ unlock_page(page);
46122+ return 0;
46123+ }
46124+ spin_lock_jnode(j);
46125+ if (!jnode_page(j)) {
46126+ jnode_attach_page(j, page);
46127+ } else {
46128+ BUG_ON(jnode_page(j) != page);
46129+ assert("vs-1504", jnode_page(j) == page);
46130+ }
46131+ block = *jnode_get_io_block(j);
46132+ spin_unlock_jnode(j);
46133+ if (block == 0) {
46134+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
46135+ SetPageUptodate(page);
46136+ unlock_page(page);
46137+ jput(j);
46138+ return 0;
46139+ }
46140+ break;
46141+
46142+ case ALLOCATED_EXTENT:
46143+ j = jnode_of_page(page);
46144+ if (IS_ERR(j))
46145+ return PTR_ERR(j);
46146+ if (*jnode_get_block(j) == 0) {
46147+ reiser4_block_nr blocknr;
46148+
46149+ blocknr = extent_get_start(ext) + pos;
46150+ jnode_set_block(j, &blocknr);
46151+ } else
46152+ assert("vs-1403",
46153+ j->blocknr == extent_get_start(ext) + pos);
46154+ break;
46155+
46156+ case UNALLOCATED_EXTENT:
46157+ j = jfind(mapping, index);
46158+ assert("nikita-2688", j);
46159+ assert("vs-1426", jnode_page(j) == NULL);
46160+
46161+ spin_lock_jnode(j);
46162+ jnode_attach_page(j, page);
46163+ spin_unlock_jnode(j);
46164+ break;
46165+
46166+ default:
46167+ warning("vs-957", "wrong extent\n");
46168+ return RETERR(-EIO);
46169+ }
46170+
46171+ BUG_ON(j == 0);
46172+ reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get());
46173+ jput(j);
46174+ return 0;
46175+}
46176+
46177+/* Implements plugin->u.item.s.file.read operation for extent items. */
46178+int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint)
46179+{
46180+ int result;
46181+ struct page *page;
46182+ unsigned long cur_page, next_page;
46183+ unsigned long page_off, count;
46184+ struct address_space *mapping;
46185+ loff_t file_off;
46186+ uf_coord_t *uf_coord;
46187+ coord_t *coord;
46188+ struct extent_coord_extension *ext_coord;
46189+ unsigned long nr_pages;
46190+ char *kaddr;
46191+
46192+ assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
46193+ assert("vs-572", flow->user == 1);
46194+ assert("vs-1351", flow->length > 0);
46195+
46196+ uf_coord = &hint->ext_coord;
46197+
46198+ check_uf_coord(uf_coord, NULL);
46199+ assert("vs-33", uf_coord->lh == &hint->lh);
46200+
46201+ coord = &uf_coord->coord;
46202+ assert("vs-1119", znode_is_rlocked(coord->node));
46203+ assert("vs-1120", znode_is_loaded(coord->node));
46204+ assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
46205+
46206+ mapping = file->f_dentry->d_inode->i_mapping;
46207+ ext_coord = &uf_coord->extension.extent;
46208+
46209+ /* offset in a file to start read from */
46210+ file_off = get_key_offset(&flow->key);
46211+ /* offset within the page to start read from */
46212+ page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
46213+ /* bytes which can be read from the page which contains file_off */
46214+ count = PAGE_CACHE_SIZE - page_off;
46215+
46216+ /* index of page containing offset read is to start from */
46217+ cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
46218+ next_page = cur_page;
46219+ /* number of pages flow spans over */
46220+ nr_pages =
46221+ ((file_off + flow->length + PAGE_CACHE_SIZE -
46222+ 1) >> PAGE_CACHE_SHIFT) - cur_page;
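+	/*
+	 * e.g. with 4 KiB pages, file_off == 1000 and flow->length == 10000:
+	 * the flow covers bytes 1000..10999, i.e. pages 0..2, so nr_pages == 3
+	 */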
46223+
46224+	/* we start with the twig node read locked. However, we do not want to
46225+	   keep that lock while readahead is working. So, set a seal and
46226+	   release the twig node. */
46227+ reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK);
46228+ /* &hint->lh is done-ed */
46229+
46230+ do {
46231+ reiser4_txn_restart_current();
46232+ page = read_mapping_page(mapping, cur_page, file);
46233+ if (IS_ERR(page))
46234+ return PTR_ERR(page);
46235+ lock_page(page);
46236+ if (!PageUptodate(page)) {
46237+ unlock_page(page);
46238+ page_cache_release(page);
46239+ warning("jmacd-97178", "extent_read: page is not up to date");
46240+ return RETERR(-EIO);
46241+ }
46242+ mark_page_accessed(page);
46243+ unlock_page(page);
46244+
46245+ /* If users can be writing to this page using arbitrary virtual
46246+ addresses, take care about potential aliasing before reading
46247+ the page on the kernel side.
46248+ */
46249+ if (mapping_writably_mapped(mapping))
46250+ flush_dcache_page(page);
46251+
46252+ assert("nikita-3034", reiser4_schedulable());
46253+
46254+ /* number of bytes which are to be read from the page */
46255+ if (count > flow->length)
46256+ count = flow->length;
46257+
46258+ result = fault_in_pages_writeable(flow->data, count);
46259+ if (result) {
46260+ page_cache_release(page);
46261+ return RETERR(-EFAULT);
46262+ }
46263+
46264+ kaddr = kmap_atomic(page, KM_USER0);
46265+ result = __copy_to_user_inatomic(flow->data,
46266+ kaddr + page_off, count);
46267+ kunmap_atomic(kaddr, KM_USER0);
46268+ if (result != 0) {
46269+ kaddr = kmap(page);
46270+ result = __copy_to_user(flow->data, kaddr + page_off, count);
46271+ kunmap(page);
46272+ if (unlikely(result))
46273+ return RETERR(-EFAULT);
46274+ }
46275+
46276+ page_cache_release(page);
46277+
46278+ /* increase key (flow->key), update user area pointer (flow->data) */
46279+ move_flow_forward(flow, count);
46280+
46281+ page_off = 0;
46282+ cur_page ++;
46283+ count = PAGE_CACHE_SIZE;
46284+ nr_pages--;
46285+ } while (flow->length);
46286+
46287+ return 0;
46288+}
46289+
46290+/*
46291+ plugin->s.file.readpage
46292+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
46293+ or
46294+   filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_extent
46295+
46296+   At the beginning: coord->node is read locked, zloaded, page is
46297+   locked, coord is set to an existing unit inside of an extent item (coord need not match page->index)
46298+*/
46299+int reiser4_readpage_extent(void *vp, struct page *page)
46300+{
46301+ uf_coord_t *uf_coord = vp;
46302+ ON_DEBUG(coord_t * coord = &uf_coord->coord);
46303+ ON_DEBUG(reiser4_key key);
46304+
46305+ assert("vs-1040", PageLocked(page));
46306+ assert("vs-1050", !PageUptodate(page));
46307+ assert("vs-1039", page->mapping && page->mapping->host);
46308+
46309+ assert("vs-1044", znode_is_loaded(coord->node));
46310+ assert("vs-758", item_is_extent(coord));
46311+ assert("vs-1046", coord_is_existing_unit(coord));
46312+ assert("vs-1045", znode_is_rlocked(coord->node));
46313+ assert("vs-1047",
46314+ page->mapping->host->i_ino ==
46315+ get_key_objectid(item_key_by_coord(coord, &key)));
46316+ check_uf_coord(uf_coord, NULL);
46317+
46318+ return reiser4_do_readpage_extent(
46319+ ext_by_ext_coord(uf_coord),
46320+ uf_coord->extension.extent.pos_in_unit, page);
46321+}
46322+
46323+/**
46324+ * get_block_address_extent
46325+ * @coord: coord of an existing extent unit
46326+ * @block: file-relative block number to translate
46327+ * @result: resulting disk block number (0 if the extent is not allocated)
46328+ *
46329+ *
46330+ */
46331+int get_block_address_extent(const coord_t *coord, sector_t block,
46332+ sector_t *result)
46333+{
46334+ reiser4_extent *ext;
46335+
46336+ if (!coord_is_existing_unit(coord))
46337+ return RETERR(-EINVAL);
46338+
46339+ ext = extent_by_coord(coord);
46340+
46341+ if (state_of_extent(ext) != ALLOCATED_EXTENT)
46342+ /* FIXME: bad things may happen if it is unallocated extent */
46343+ *result = 0;
46344+ else {
46345+ reiser4_key key;
46346+
46347+ unit_key_by_coord(coord, &key);
46348+ assert("vs-1645",
46349+ block >= get_key_offset(&key) >> current_blocksize_bits);
46350+ assert("vs-1646",
46351+ block <
46352+ (get_key_offset(&key) >> current_blocksize_bits) +
46353+ extent_get_width(ext));
46354+ *result =
46355+ extent_get_start(ext) + (block -
46356+ (get_key_offset(&key) >>
46357+ current_blocksize_bits));
46358+ }
46359+ return 0;
46360+}
46361+
46362+/*
46363+ plugin->u.item.s.file.append_key
46364+   key of the first byte which is next after the last byte addressed by this extent
46365+*/
46366+reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
46367+{
46368+ item_key_by_coord(coord, key);
46369+ set_key_offset(key,
46370+ get_key_offset(key) + reiser4_extent_size(coord,
46371+ nr_units_extent
46372+ (coord)));
46373+
46374+ assert("vs-610", get_key_offset(key)
46375+ && (get_key_offset(key) & (current_blocksize - 1)) == 0);
46376+ return key;
46377+}
46378+
46379+/* plugin->u.item.s.file.init_coord_extension */
46380+void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
46381+{
46382+ coord_t *coord;
46383+ struct extent_coord_extension *ext_coord;
46384+ reiser4_key key;
46385+ loff_t offset;
46386+
46387+ assert("vs-1295", uf_coord->valid == 0);
46388+
46389+ coord = &uf_coord->coord;
46390+ assert("vs-1288", coord_is_iplug_set(coord));
46391+ assert("vs-1327", znode_is_loaded(coord->node));
46392+
46393+ if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
46394+ return;
46395+
46396+ ext_coord = &uf_coord->extension.extent;
46397+ ext_coord->nr_units = nr_units_extent(coord);
46398+ ext_coord->ext_offset =
46399+ (char *)extent_by_coord(coord) - zdata(coord->node);
46400+ ext_coord->width = extent_get_width(extent_by_coord(coord));
46401+ ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
46402+ uf_coord->valid = 1;
46403+
46404+ /* pos_in_unit is the only uninitialized field in extended coord */
46405+ if (coord->between == AFTER_UNIT) {
46406+ assert("vs-1330",
46407+ coord->unit_pos == nr_units_extent(coord) - 1);
46408+
46409+ ext_coord->pos_in_unit = ext_coord->width - 1;
46410+ } else {
46411+ /* AT_UNIT */
46412+ unit_key_by_coord(coord, &key);
46413+ offset = get_key_offset(&key);
46414+
46415+ assert("vs-1328", offset <= lookuped);
46416+ assert("vs-1329",
46417+ lookuped <
46418+ offset + ext_coord->width * current_blocksize);
46419+ ext_coord->pos_in_unit =
46420+ ((lookuped - offset) >> current_blocksize_bits);
46421+ }
46422+}
46423+
46424+/*
46425+ * Local variables:
46426+ * c-indentation-style: "K&R"
46427+ * mode-name: "LC"
46428+ * c-basic-offset: 8
46429+ * tab-width: 8
46430+ * fill-column: 79
46431+ * scroll-step: 1
46432+ * End:
46433+ */
46434diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.24/fs/reiser4/plugin/item/extent_flush_ops.c
46435--- linux-2.6.24.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 03:00:00.000000000 +0300
46436+++ linux-2.6.24/fs/reiser4/plugin/item/extent_flush_ops.c 2008-01-25 11:39:07.016228297 +0300
46437@@ -0,0 +1,1028 @@
46438+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46439+
46440+#include "item.h"
46441+#include "../../tree.h"
46442+#include "../../jnode.h"
46443+#include "../../super.h"
46444+#include "../../flush.h"
46445+#include "../../carry.h"
46446+#include "../object.h"
46447+
46448+#include <linux/pagemap.h>
46449+
46450+static reiser4_block_nr extent_unit_start(const coord_t * item);
46451+
46452+/* Return either first or last extent (depending on @side) of the item
46453+ @coord is set to. Set @pos_in_unit either to first or to last block
46454+ of extent. */
46455+static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
46456+ reiser4_block_nr * pos_in_unit)
46457+{
46458+ reiser4_extent *ext;
46459+
46460+ if (side == LEFT_SIDE) {
46461+ /* get first extent of item */
46462+ ext = extent_item(coord);
46463+ *pos_in_unit = 0;
46464+ } else {
46465+ /* get last extent of item and last position within it */
46466+ assert("vs-363", side == RIGHT_SIDE);
46467+ ext = extent_item(coord) + coord_last_unit_pos(coord);
46468+ *pos_in_unit = extent_get_width(ext) - 1;
46469+ }
46470+
46471+ return ext;
46472+}
46473+
46474+/* item_plugin->f.utmost_child */
46475+/* Return the child. Coord is set to extent item. Find jnode corresponding
46476+ either to first or to last unformatted node pointed by the item */
46477+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
46478+{
46479+ reiser4_extent *ext;
46480+ reiser4_block_nr pos_in_unit;
46481+
46482+ ext = extent_utmost_ext(coord, side, &pos_in_unit);
46483+
46484+ switch (state_of_extent(ext)) {
46485+ case HOLE_EXTENT:
46486+ *childp = NULL;
46487+ return 0;
46488+ case ALLOCATED_EXTENT:
46489+ case UNALLOCATED_EXTENT:
46490+ break;
46491+ default:
46492+ /* this should never happen */
46493+ assert("vs-1417", 0);
46494+ }
46495+
46496+ {
46497+ reiser4_key key;
46498+ reiser4_tree *tree;
46499+ unsigned long index;
46500+
46501+ if (side == LEFT_SIDE) {
46502+ /* get key of first byte addressed by the extent */
46503+ item_key_by_coord(coord, &key);
46504+ } else {
46505+ /* get key of byte which next after last byte addressed by the extent */
46506+ append_key_extent(coord, &key);
46507+ }
46508+
46509+ assert("vs-544",
46510+ (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
46511+ /* index of first or last (depending on @side) page addressed
46512+ by the extent */
46513+ index =
46514+ (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
46515+ if (side == RIGHT_SIDE)
46516+ index--;
46517+
46518+ tree = coord->node->zjnode.tree;
46519+ *childp = jlookup(tree, get_key_objectid(&key), index);
46520+ }
46521+
46522+ return 0;
46523+}
46524+
46525+/* item_plugin->f.utmost_child_real_block */
46526+/* Return the child's block, if allocated. */
46527+int
46528+utmost_child_real_block_extent(const coord_t * coord, sideof side,
46529+ reiser4_block_nr * block)
46530+{
46531+ reiser4_extent *ext;
46532+
46533+ ext = extent_by_coord(coord);
46534+
46535+ switch (state_of_extent(ext)) {
46536+ case ALLOCATED_EXTENT:
46537+ *block = extent_get_start(ext);
46538+ if (side == RIGHT_SIDE)
46539+ *block += extent_get_width(ext) - 1;
46540+ break;
46541+ case HOLE_EXTENT:
46542+ case UNALLOCATED_EXTENT:
46543+ *block = 0;
46544+ break;
46545+ default:
46546+ /* this should never happen */
46547+ assert("vs-1418", 0);
46548+ }
46549+
46550+ return 0;
46551+}
46552+
46553+/* item_plugin->f.scan */
46554+/* Performs leftward scanning starting from an unformatted node and its parent coordinate.
46555+ This scan continues, advancing the parent coordinate, until either it encounters a
46556+ formatted child or it finishes scanning this node.
46557+
46558+ If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm
46559+   not sure this last property (same atom) is enforced, but it should be the case since
46560+ one atom must write the parent and the others must read the parent, thus fusing?). In
46561+ any case, the code below asserts this case for unallocated extents. Unallocated
46562+ extents are thus optimized because we can skip to the endpoint when scanning.
46563+
46564+   Control then returns to the caller of reiser4_scan_extent, which handles these
46565+   terminating conditions, e.g., by loading the next twig.
46566+*/
46567+int reiser4_scan_extent(flush_scan * scan)
46568+{
46569+ coord_t coord;
46570+ jnode *neighbor;
46571+ unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
46572+ reiser4_block_nr unit_start;
46573+ __u64 oid;
46574+ reiser4_key key;
46575+ int ret = 0, allocated, incr;
46576+ reiser4_tree *tree;
46577+
46578+ if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
46579+ scan->stop = 1;
46580+ return 0; /* Race with truncate, this node is already
46581+ * truncated. */
46582+ }
46583+
46584+ coord_dup(&coord, &scan->parent_coord);
46585+
46586+ assert("jmacd-1404", !reiser4_scan_finished(scan));
46587+ assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
46588+ assert("jmacd-1406", jnode_is_unformatted(scan->node));
46589+
46590+ /* The scan_index variable corresponds to the current page index of the
46591+ unformatted block scan position. */
46592+ scan_index = index_jnode(scan->node);
46593+
46594+ assert("jmacd-7889", item_is_extent(&coord));
46595+
46596+ repeat:
46597+ /* objectid of file */
46598+ oid = get_key_objectid(item_key_by_coord(&coord, &key));
46599+
46600+ allocated = !extent_is_unallocated(&coord);
46601+ /* Get the values of this extent unit: */
46602+ unit_index = extent_unit_index(&coord);
46603+ unit_width = extent_unit_width(&coord);
46604+ unit_start = extent_unit_start(&coord);
46605+
46606+ assert("jmacd-7187", unit_width > 0);
46607+ assert("jmacd-7188", scan_index >= unit_index);
46608+ assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
46609+
46610+ /* Depending on the scan direction, we set different maximum values for scan_index
46611+ (scan_max) and the number of nodes that would be passed if the scan goes the
46612+ entire way (scan_dist). Incr is an integer reflecting the incremental
46613+ direction of scan_index. */
46614+ if (reiser4_scanning_left(scan)) {
46615+ scan_max = unit_index;
46616+ scan_dist = scan_index - unit_index;
46617+ incr = -1;
46618+ } else {
46619+ scan_max = unit_index + unit_width - 1;
46620+		scan_dist = scan_max - scan_index;
46621+ incr = +1;
46622+ }
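+	/*
+	 * e.g. scanning left with unit_index == 100, unit_width == 8 and
+	 * scan_index == 105: scan_max == 100, scan_dist == 5, and the loop
+	 * below visits indices 105 down to 100
+	 */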
46623+
46624+ tree = coord.node->zjnode.tree;
46625+
46626+ /* If the extent is allocated we have to check each of its blocks. If the extent
46627+ is unallocated we can skip to the scan_max. */
46628+ if (allocated) {
46629+ do {
46630+ neighbor = jlookup(tree, oid, scan_index);
46631+ if (neighbor == NULL)
46632+ goto stop_same_parent;
46633+
46634+ if (scan->node != neighbor
46635+ && !reiser4_scan_goto(scan, neighbor)) {
46636+ /* @neighbor was jput() by reiser4_scan_goto */
46637+ goto stop_same_parent;
46638+ }
46639+
46640+ ret = scan_set_current(scan, neighbor, 1, &coord);
46641+ if (ret != 0) {
46642+ goto exit;
46643+ }
46644+
46645+ /* reference to @neighbor is stored in @scan, no need
46646+ to jput(). */
46647+ scan_index += incr;
46648+
46649+ } while (incr + scan_max != scan_index);
46650+
46651+ } else {
46652+ /* Optimized case for unallocated extents, skip to the end. */
46653+ neighbor = jlookup(tree, oid, scan_max /*index */ );
46654+ if (neighbor == NULL) {
46655+ /* Race with truncate */
46656+ scan->stop = 1;
46657+ ret = 0;
46658+ goto exit;
46659+ }
46660+
46661+ assert("zam-1043",
46662+ reiser4_blocknr_is_fake(jnode_get_block(neighbor)));
46663+
46664+ ret = scan_set_current(scan, neighbor, scan_dist, &coord);
46665+ if (ret != 0) {
46666+ goto exit;
46667+ }
46668+ }
46669+
46670+ if (coord_sideof_unit(&coord, scan->direction) == 0
46671+ && item_is_extent(&coord)) {
46672+ /* Continue as long as there are more extent units. */
46673+
46674+ scan_index =
46675+ extent_unit_index(&coord) +
46676+ (reiser4_scanning_left(scan) ?
46677+ extent_unit_width(&coord) - 1 : 0);
46678+ goto repeat;
46679+ }
46680+
46681+ if (0) {
46682+ stop_same_parent:
46683+
46684+ /* If we are scanning left and we stop in the middle of an allocated
46685+		   extent, we know the preceder immediately. */
46686+ /* middle of extent is (scan_index - unit_index) != 0. */
46687+ if (reiser4_scanning_left(scan) &&
46688+ (scan_index - unit_index) != 0) {
46689+ /* FIXME(B): Someone should step-through and verify that this preceder
46690+ calculation is indeed correct. */
46691+ /* @unit_start is starting block (number) of extent
46692+ unit. Flush stopped at the @scan_index block from
46693+ the beginning of the file, which is (scan_index -
46694+ unit_index) block within extent.
46695+ */
46696+ if (unit_start) {
46697+ /* skip preceder update when we are at hole */
46698+ scan->preceder_blk =
46699+ unit_start + scan_index - unit_index;
46700+ check_preceder(scan->preceder_blk);
46701+ }
46702+ }
46703+
46704+ /* In this case, we leave coord set to the parent of scan->node. */
46705+ scan->stop = 1;
46706+
46707+ } else {
46708+ /* In this case, we are still scanning, coord is set to the next item which is
46709+ either off-the-end of the node or not an extent. */
46710+ assert("jmacd-8912", scan->stop == 0);
46711+ assert("jmacd-7812",
46712+ (coord_is_after_sideof_unit(&coord, scan->direction)
46713+ || !item_is_extent(&coord)));
46714+ }
46715+
46716+ ret = 0;
46717+ exit:
46718+ return ret;
46719+}
46720+
46721+/* ask block allocator for some blocks */
46722+static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
46723+ reiser4_block_nr wanted_count,
46724+ reiser4_block_nr *first_allocated,
46725+ reiser4_block_nr *allocated,
46726+ block_stage_t block_stage)
46727+{
46728+ *allocated = wanted_count;
46729+ preceder->max_dist = 0; /* scan whole disk, if needed */
46730+
46731+ /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
46732+ preceder->block_stage = block_stage;
46733+
46734+ /* FIXME: we do not handle errors here now */
46735+ check_me("vs-420",
46736+ reiser4_alloc_blocks(preceder, first_allocated, allocated,
46737+ BA_PERMANENT) == 0);
46738+ /* update flush_pos's preceder to last allocated block number */
46739+ preceder->blk = *first_allocated + *allocated - 1;
46740+}
46741+
46742+/* when at flush time an unallocated extent is to be replaced with an allocated one, it may happen that one unallocated
46743+   extent will have to be replaced with a set of allocated extents. In this case insert_into_item will be called, which
46744+   may have to add new nodes into the tree. Space for that is taken from the inviolable reserve (5%). */
46745+static reiser4_block_nr reserve_replace(void)
46746+{
46747+ reiser4_block_nr grabbed, needed;
46748+
46749+ grabbed = get_current_context()->grabbed_blocks;
46750+ needed = estimate_one_insert_into_item(current_tree);
46751+ check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
46752+ return grabbed;
46753+}
46754+
46755+static void free_replace_reserved(reiser4_block_nr grabbed)
46756+{
46757+ reiser4_context *ctx;
46758+
46759+ ctx = get_current_context();
46760+ grabbed2free(ctx, get_super_private(ctx->super),
46761+ ctx->grabbed_blocks - grabbed);
46762+}
46763+
46764+/* Block offset of first block addressed by unit */
46765+__u64 extent_unit_index(const coord_t * item)
46766+{
46767+ reiser4_key key;
46768+
46769+ assert("vs-648", coord_is_existing_unit(item));
46770+ unit_key_by_coord(item, &key);
46771+ return get_key_offset(&key) >> current_blocksize_bits;
46772+}
46773+
46774+/* AUDIT shouldn't return value be of reiser4_block_nr type?
46775+ Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */
46776+__u64 extent_unit_width(const coord_t * item)
46777+{
46778+ assert("vs-649", coord_is_existing_unit(item));
46779+ return width_by_coord(item);
46780+}
46781+
46782+/* Starting block location of this unit */
46783+static reiser4_block_nr extent_unit_start(const coord_t * item)
46784+{
46785+ return extent_get_start(extent_by_coord(item));
46786+}
46787+
46788+/**
46789+ * split_allocated_extent -
46790+ * @coord:
46791+ * @pos_in_unit:
46792+ *
46793+ * replace allocated extent with two allocated extents
46794+ */
46795+static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
46796+{
46797+ int result;
46798+ struct replace_handle *h;
46799+ reiser4_extent *ext;
46800+ reiser4_block_nr grabbed;
46801+
46802+ ext = extent_by_coord(coord);
46803+ assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
46804+ assert("vs-1411", extent_get_width(ext) > pos_in_unit);
46805+
46806+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
46807+ if (h == NULL)
46808+ return RETERR(-ENOMEM);
46809+ h->coord = coord;
46810+ h->lh = znode_lh(coord->node);
46811+ h->pkey = &h->key;
46812+ unit_key_by_coord(coord, h->pkey);
46813+ set_key_offset(h->pkey,
46814+ (get_key_offset(h->pkey) +
46815+ pos_in_unit * current_blocksize));
46816+ reiser4_set_extent(&h->overwrite, extent_get_start(ext),
46817+ pos_in_unit);
46818+ reiser4_set_extent(&h->new_extents[0],
46819+ extent_get_start(ext) + pos_in_unit,
46820+ extent_get_width(ext) - pos_in_unit);
46821+ h->nr_new_extents = 1;
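+	/*
+	 * e.g. an allocated extent (start 1000, width 10) split at
+	 * pos_in_unit 3 becomes (1000, 3), overwritten in place, plus a
+	 * new unit (1003, 7) pasted after it
+	 */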
46822+ h->flags = COPI_DONT_SHIFT_LEFT;
46823+ h->paste_key = h->key;
46824+
46825+ /* reserve space for extent unit paste, @grabbed is reserved before */
46826+ grabbed = reserve_replace();
46827+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
46828+ extent */);
46829+ /* restore reserved */
46830+ free_replace_reserved(grabbed);
46831+ kfree(h);
46832+ return result;
46833+}
46834+
46835+/* replace extent @ext by extent @replace. Try to merge @replace with previous extent of the item (if there is
46836+   one). Return 1 if it succeeded, 0 otherwise */
46837+static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
46838+ reiser4_extent *replace)
46839+{
46840+ assert("vs-1415", extent_by_coord(coord) == ext);
46841+
46842+ if (coord->unit_pos == 0
46843+ || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
46844+		/* the left neighbor either does not exist or is not an allocated extent */
46845+ return 0;
46846+ if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
46847+ extent_get_start(replace))
46848+ return 0;
46849+
46850+ /* we can glue, widen previous unit */
46851+ extent_set_width(ext - 1,
46852+ extent_get_width(ext - 1) + extent_get_width(replace));
46853+
46854+ if (extent_get_width(ext) != extent_get_width(replace)) {
46855+ /* make current extent narrower */
46856+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
46857+ extent_set_start(ext,
46858+ extent_get_start(ext) +
46859+ extent_get_width(replace));
46860+ extent_set_width(ext,
46861+ extent_get_width(ext) -
46862+ extent_get_width(replace));
46863+ } else {
46864+ /* current extent completely glued with its left neighbor, remove it */
46865+ coord_t from, to;
46866+
46867+ coord_dup(&from, coord);
46868+ from.unit_pos = nr_units_extent(coord) - 1;
46869+ coord_dup(&to, &from);
46870+
46871+		/* currently cutting from an extent item can only cut from the beginning or from the end. Move the
46872+		   place which got freed by the unit removal to the end of the item */
46873+ memmove(ext, ext + 1,
46874+ (from.unit_pos -
46875+ coord->unit_pos) * sizeof(reiser4_extent));
46876+ /* wipe part of item which is going to be cut, so that node_check will not be confused */
46877+ cut_node_content(&from, &to, NULL, NULL, NULL);
46878+ }
46879+ znode_make_dirty(coord->node);
46880+ /* move coord back */
46881+ coord->unit_pos--;
46882+ return 1;
46883+}
46884+
46885+/**
46886+ * conv_extent - replace extent with two extents
46887+ * @coord: coordinate of extent to be replaced
46888+ * @replace: extent to overwrite the one @coord is set to
46889+ *
46890+ * Overwrites extent @coord is set to and paste one extent unit after
46891+ * overwritten one if @replace is shorter than initial extent
46892+ */
46893+static int conv_extent(coord_t *coord, reiser4_extent *replace)
46894+{
46895+ int result;
46896+ struct replace_handle *h;
46897+ reiser4_extent *ext;
46898+ reiser4_block_nr start, width, new_width;
46899+ reiser4_block_nr grabbed;
46900+ extent_state state;
46901+
46902+ ext = extent_by_coord(coord);
46903+ state = state_of_extent(ext);
46904+ start = extent_get_start(ext);
46905+ width = extent_get_width(ext);
46906+ new_width = extent_get_width(replace);
46907+
46908+ assert("vs-1458", (state == UNALLOCATED_EXTENT ||
46909+ state == ALLOCATED_EXTENT));
46910+ assert("vs-1459", width >= new_width);
46911+
46912+ if (try_to_merge_with_left(coord, ext, replace)) {
46913+ /* merged @replace with left neighbor. Current unit is either
46914+ removed or narrowed */
46915+ return 0;
46916+ }
46917+
46918+ if (width == new_width) {
46919+ /* replace current extent with @replace */
46920+ *ext = *replace;
46921+ znode_make_dirty(coord->node);
46922+ return 0;
46923+ }
46924+
46925+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
46926+ if (h == NULL)
46927+ return RETERR(-ENOMEM);
46928+ h->coord = coord;
46929+ h->lh = znode_lh(coord->node);
46930+ h->pkey = &h->key;
46931+ unit_key_by_coord(coord, h->pkey);
46932+ set_key_offset(h->pkey,
46933+ (get_key_offset(h->pkey) + new_width * current_blocksize));
46934+ h->overwrite = *replace;
46935+
46936+ /* replace @ext with @replace and padding extent */
46937+ reiser4_set_extent(&h->new_extents[0],
46938+ (state == ALLOCATED_EXTENT) ?
46939+ (start + new_width) :
46940+ UNALLOCATED_EXTENT_START,
46941+ width - new_width);
46942+ h->nr_new_extents = 1;
46943+ h->flags = COPI_DONT_SHIFT_LEFT;
46944+ h->paste_key = h->key;
46945+
46946+ /* reserve space for the extent unit paste; @grabbed saves what was reserved before */
46947+ grabbed = reserve_replace();
46948+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
46949+ extent */);
46950+
46951+ /* restore reserved */
46952+ free_replace_reserved(grabbed);
46953+ kfree(h);
46954+ return result;
46955+}
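
/*
 * Standalone sketch (not part of the patch) of the arithmetic conv_extent()
 * performs when @replace is narrower than the original unit: the original
 * [start, +width) becomes @replace followed by a padding extent covering the
 * remaining width. Simplified types; allocated-extent case only.
 */
#include <stdio.h>
#include <stdint.h>

struct ext { uint64_t start, width; };

/* split orig into repl + pad; assumes repl.width <= orig.width */
static void split_replace(struct ext orig, struct ext repl, struct ext *pad)
{
	pad->start = orig.start + repl.width;	/* allocated padding */
	pad->width = orig.width - repl.width;
}

int main(void)
{
	struct ext orig = { 100, 8 }, repl = { 500, 3 }, pad;

	split_replace(orig, repl, &pad);
	printf("replace=[%llu,+%llu) pad=[%llu,+%llu)\n",
	       (unsigned long long)repl.start, (unsigned long long)repl.width,
	       (unsigned long long)pad.start, (unsigned long long)pad.width);
	return 0;
}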
46956+
46957+/**
46958+ * assign_real_blocknrs
46959+ * @flush_pos: flush position
46960+ * @oid: objectid of the file the jnodes belong to
46961+ * @index: index of the first jnode in the range
46962+ * @count: number of jnodes to assign block numbers to
46963+ * @first: start of the allocated block range
46964+ *
46965+ * Assigns consecutive block numbers starting at @first to @count jnodes,
46966+ * beginning with the jnode of index @index. Jnodes are looked up with jlookup.
46967+ */
46968+static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
46969+ unsigned long index, reiser4_block_nr count,
46970+ reiser4_block_nr first)
46971+{
46972+ unsigned long i;
46973+ reiser4_tree *tree;
46974+ txn_atom *atom;
46975+ int nr;
46976+
46977+ atom = atom_locked_by_fq(flush_pos->fq);
46978+ assert("vs-1468", atom);
46979+ BUG_ON(atom == NULL);
46980+
46981+ nr = 0;
46982+ tree = current_tree;
46983+ for (i = 0; i < count; ++i, ++index) {
46984+ jnode *node;
46985+
46986+ node = jlookup(tree, oid, index);
46987+ assert("", node != NULL);
46988+ BUG_ON(node == NULL);
46989+
46990+ spin_lock_jnode(node);
46991+ assert("", !jnode_is_flushprepped(node));
46992+ assert("vs-1475", node->atom == atom);
46993+ assert("vs-1476", atomic_read(&node->x_count) > 0);
46994+
46995+ JF_CLR(node, JNODE_FLUSH_RESERVED);
46996+ jnode_set_block(node, &first);
46997+ unformatted_make_reloc(node, flush_pos->fq);
46998+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
46999+ FQ_LIST, 0));
47000+ spin_unlock_jnode(node);
47001+ first++;
47002+
47003+ atomic_dec(&node->x_count);
47004+ nr++;
47005+ }
47006+
47007+ spin_unlock_atom(atom);
47008+ return;
47009+}
47010+
47011+/**
47012+ * make_node_ovrwr - assign node to overwrite set
47013+ * @jnodes: overwrite set list head
47014+ * @node: jnode to belong to overwrite set
47015+ *
47016+ * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes
47017+ * which is an accumulator for nodes before they get to overwrite set list of
47018+ * atom.
47019+ */
47020+static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
47021+{
47022+ spin_lock_jnode(node);
47023+
47024+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
47025+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
47026+
47027+ JF_SET(node, JNODE_OVRWR);
47028+ list_move_tail(&node->capture_link, jnodes);
47029+ ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
47030+
47031+ spin_unlock_jnode(node);
47032+}
47033+
47034+/**
47035+ * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
47036+ * @flush_pos: flush position
47037+ * @oid: objectid of file jnodes belong to
47038+ * @index: starting index
47039+ * @width: extent width
47040+ *
47041+ * Puts the nodes of one extent (file objectid @oid, extent width @width) into
47042+ * the atom's overwrite set, starting from the node with index @index. If the
47043+ * end of the slum is detected (a node is not found, is flushprepped already or
47044+ * belongs to a different atom), iteration stops and the flush position's state
47044+ * is set to POS_INVALID.
47045+ */
47046+static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
47047+ unsigned long index, reiser4_block_nr width)
47048+{
47049+ unsigned long i;
47050+ reiser4_tree *tree;
47051+ jnode *node;
47052+ txn_atom *atom;
47053+ LIST_HEAD(jnodes);
47054+
47055+ tree = current_tree;
47056+
47057+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
47058+ assert("vs-1478", atom);
47059+
47060+ for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
47061+ node = jlookup(tree, oid, index);
47062+ if (!node) {
47063+ flush_pos->state = POS_INVALID;
47064+ break;
47065+ }
47066+ if (jnode_check_flushprepped(node)) {
47067+ flush_pos->state = POS_INVALID;
47068+ atomic_dec(&node->x_count);
47069+ break;
47070+ }
47071+ if (node->atom != atom) {
47072+ flush_pos->state = POS_INVALID;
47073+ atomic_dec(&node->x_count);
47074+ break;
47075+ }
47076+ make_node_ovrwr(&jnodes, node);
47077+ atomic_dec(&node->x_count);
47078+ }
47079+
47080+ list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
47081+ spin_unlock_atom(atom);
47082+}
47083+
47084+/**
47085+ * allocated_extent_slum_size
47086+ * @flush_pos: flush position
47087+ * @oid: objectid of the file the jnodes belong to
47088+ * @index: index of the first jnode to look at
47089+ * @count: maximal number of jnodes to look at
47090+ *
47091+ * Returns the number of consecutive jnodes starting at @index which exist,
47092+ * are not flushprepped yet and are captured by the same atom, i.e. the length
47092+ * of the leading part of the slum covered by this allocated extent.
47092+ */
47093+static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
47094+ unsigned long index, unsigned long count)
47095+{
47096+ unsigned long i;
47097+ reiser4_tree *tree;
47098+ txn_atom *atom;
47099+ int nr;
47100+
47101+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
47102+ assert("vs-1468", atom);
47103+
47104+ nr = 0;
47105+ tree = current_tree;
47106+ for (i = 0; i < count; ++i, ++index) {
47107+ jnode *node;
47108+
47109+ node = jlookup(tree, oid, index);
47110+ if (!node)
47111+ break;
47112+
47113+ if (jnode_check_flushprepped(node)) {
47114+ atomic_dec(&node->x_count);
47115+ break;
47116+ }
47117+
47118+ if (node->atom != atom) {
47119+ /*
47120+ * this is possible on overwrite: extent_write may
47121+ * capture several unformatted nodes without capturing
47122+ * any formatted nodes.
47123+ */
47124+ atomic_dec(&node->x_count);
47125+ break;
47126+ }
47127+
47128+ assert("vs-1476", atomic_read(&node->x_count) > 1);
47129+ atomic_dec(&node->x_count);
47130+ nr++;
47131+ }
47132+
47133+ spin_unlock_atom(atom);
47134+ return nr;
47135+}
47136+
47137+/**
47138+ * reiser4_alloc_extent
47139+ * @flush_pos: flush position
47140+ *
47141+ * This is called by handle_pos_on_twig to process the extent unit
47142+ * flush_pos->coord is set to. It prepares a sequence of not yet flushprepped
47143+ * nodes (a slum) for flushing, assuming that the slum starts at position
47144+ * flush_pos->pos_in_unit within the extent. The slum goes to the relocate
47145+ * set if flush_pos->leaf_relocate is set to 1 and to the overwrite set
47146+ * otherwise.
47147+ */
47148+int reiser4_alloc_extent(flush_pos_t *flush_pos)
47149+{
47150+ coord_t *coord;
47151+ reiser4_extent *ext;
47152+ reiser4_extent replace_ext;
47153+ oid_t oid;
47154+ reiser4_block_nr protected;
47155+ reiser4_block_nr start;
47156+ __u64 index;
47157+ __u64 width;
47158+ extent_state state;
47159+ int result;
47160+ reiser4_block_nr first_allocated;
47161+ __u64 allocated;
47162+ reiser4_key key;
47163+ block_stage_t block_stage;
47164+
47165+ assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
47166+ assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
47167+ && item_is_extent(&flush_pos->coord));
47168+
47169+ coord = &flush_pos->coord;
47170+
47171+ ext = extent_by_coord(coord);
47172+ state = state_of_extent(ext);
47173+ if (state == HOLE_EXTENT) {
47174+ flush_pos->state = POS_INVALID;
47175+ return 0;
47176+ }
47177+
47178+ item_key_by_coord(coord, &key);
47179+ oid = get_key_objectid(&key);
47180+ index = extent_unit_index(coord) + flush_pos->pos_in_unit;
47181+ start = extent_get_start(ext);
47182+ width = extent_get_width(ext);
47183+
47184+ assert("vs-1457", width > flush_pos->pos_in_unit);
47185+
47186+ if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
47187+ /* relocate */
47188+ if (flush_pos->pos_in_unit) {
47189+ /* split extent unit into two */
47190+ result =
47191+ split_allocated_extent(coord,
47192+ flush_pos->pos_in_unit);
47193+ flush_pos->pos_in_unit = 0;
47194+ return result;
47195+ }
47196+
47197+ /* limit number of nodes to allocate */
47198+ if (flush_pos->nr_to_write < width)
47199+ width = flush_pos->nr_to_write;
47200+
47201+ if (state == ALLOCATED_EXTENT) {
47202+ /*
47203+ * none of the protected nodes is flushprepped yet, therefore
47204+ * they are counted as flush_reserved
47205+ */
47206+ block_stage = BLOCK_FLUSH_RESERVED;
47207+ protected = allocated_extent_slum_size(flush_pos, oid,
47208+ index, width);
47209+ if (protected == 0) {
47210+ flush_pos->state = POS_INVALID;
47211+ flush_pos->pos_in_unit = 0;
47212+ return 0;
47213+ }
47214+ } else {
47215+ block_stage = BLOCK_UNALLOCATED;
47216+ protected = width;
47217+ }
47218+
47219+ /*
47220+ * look at previous unit if possible. If it is allocated, make
47221+ * preceder more precise
47222+ */
47223+ if (coord->unit_pos &&
47224+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
47225+ reiser4_pos_hint(flush_pos)->blk =
47226+ extent_get_start(ext - 1) +
47227+ extent_get_width(ext - 1);
47228+
47229+ /* allocate new block numbers for protected nodes */
47230+ extent_allocate_blocks(reiser4_pos_hint(flush_pos),
47231+ protected,
47232+ &first_allocated, &allocated,
47233+ block_stage);
47234+
47235+ if (state == ALLOCATED_EXTENT)
47236+ /*
47237+ * on relocating - free nodes which are going to be
47238+ * relocated
47239+ */
47240+ reiser4_dealloc_blocks(&start, &allocated,
47241+ BLOCK_ALLOCATED, BA_DEFER);
47242+
47243+ /* assign new block numbers to protected nodes */
47244+ assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
47245+
47246+ /* prepare extent which will replace current one */
47247+ reiser4_set_extent(&replace_ext, first_allocated, allocated);
47248+
47249+ /* adjust extent item */
47250+ result = conv_extent(coord, &replace_ext);
47251+ if (result != 0 && result != -ENOMEM) {
47252+ warning("vs-1461",
47253+ "Failed to allocate extent. Should not happen\n");
47254+ return result;
47255+ }
47256+
47257+ /*
47258+ * break flush: we prepared for flushing as many blocks as we
47259+ * were asked for
47260+ */
47261+ if (flush_pos->nr_to_write == allocated)
47262+ flush_pos->state = POS_INVALID;
47263+ } else {
47264+ /* overwrite */
47265+ mark_jnodes_overwrite(flush_pos, oid, index, width);
47266+ }
47267+ flush_pos->pos_in_unit = 0;
47268+ return 0;
47269+}
47270+
47271+/* returns 0 if @key is glueable to the item @coord is set to, 1 if a new item has to be inserted */
47272+static int must_insert(const coord_t *coord, const reiser4_key *key)
47273+{
47274+ reiser4_key last;
47275+
47276+ if (item_id_by_coord(coord) == EXTENT_POINTER_ID
47277+ && keyeq(append_key_extent(coord, &last), key))
47278+ return 0;
47279+ return 1;
47280+}
47281+
47282+/* copy extent @copy_ext to the end of @node. This may require inserting a new
47283+ item after the last one, appending to the last item, or widening the last
47283+ unit of the last item */
47284+static int put_unit_to_end(znode *node, const reiser4_key *key,
47285+ reiser4_extent *copy_ext)
47286+{
47287+ int result;
47288+ coord_t coord;
47289+ cop_insert_flag flags;
47290+ reiser4_extent *last_ext;
47291+ reiser4_item_data data;
47292+
47293+ /* set coord after last unit in an item */
47294+ coord_init_last_unit(&coord, node);
47295+ coord.between = AFTER_UNIT;
47296+
47297+ flags =
47298+ COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
47299+ if (must_insert(&coord, key)) {
47300+ result =
47301+ insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
47302+ key, NULL /*lh */ , flags);
47303+
47304+ } else {
47305+ /* try to glue with last unit */
47306+ last_ext = extent_by_coord(&coord);
47307+ if (state_of_extent(last_ext) &&
47308+ extent_get_start(last_ext) + extent_get_width(last_ext) ==
47309+ extent_get_start(copy_ext)) {
47310+ /* widen last unit of node */
47311+ extent_set_width(last_ext,
47312+ extent_get_width(last_ext) +
47313+ extent_get_width(copy_ext));
47314+ znode_make_dirty(node);
47315+ return 0;
47316+ }
47317+
47318+ /* FIXME: put an assertion here that the last unit in @node and the new unit cannot be merged */
47319+ result =
47320+ insert_into_item(&coord, NULL /*lh */ , key,
47321+ init_new_extent(&data, copy_ext, 1),
47322+ flags);
47323+ }
47324+
47325+ assert("vs-438", result == 0 || result == -E_NODE_FULL);
47326+ return result;
47327+}
47328+
47329+/* @coord is set to extent unit */
47330+squeeze_result squalloc_extent(znode *left, const coord_t *coord,
47331+ flush_pos_t *flush_pos,
47332+ reiser4_key *stop_key)
47333+{
47334+ reiser4_extent *ext;
47335+ __u64 index;
47336+ __u64 width;
47337+ reiser4_block_nr start;
47338+ extent_state state;
47339+ oid_t oid;
47340+ reiser4_block_nr first_allocated;
47341+ __u64 allocated;
47342+ __u64 protected;
47343+ reiser4_extent copy_extent;
47344+ reiser4_key key;
47345+ int result;
47346+ block_stage_t block_stage;
47347+
47348+ assert("vs-1457", flush_pos->pos_in_unit == 0);
47349+ assert("vs-1467", coord_is_leftmost_unit(coord));
47350+ assert("vs-1467", item_is_extent(coord));
47351+
47352+ ext = extent_by_coord(coord);
47353+ index = extent_unit_index(coord);
47354+ start = extent_get_start(ext);
47355+ width = extent_get_width(ext);
47356+ state = state_of_extent(ext);
47357+ unit_key_by_coord(coord, &key);
47358+ oid = get_key_objectid(&key);
47359+
47360+ if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
47361+ (state == UNALLOCATED_EXTENT)) {
47362+ /* relocate */
47363+ if (state == ALLOCATED_EXTENT) {
47364+ /* none of the protected nodes is flushprepped yet, therefore
47365+ * they are counted as flush_reserved */
47366+ block_stage = BLOCK_FLUSH_RESERVED;
47367+ protected = allocated_extent_slum_size(flush_pos, oid,
47368+ index, width);
47369+ if (protected == 0) {
47370+ flush_pos->state = POS_INVALID;
47371+ flush_pos->pos_in_unit = 0;
47372+ return 0;
47373+ }
47374+ } else {
47375+ block_stage = BLOCK_UNALLOCATED;
47376+ protected = width;
47377+ }
47378+
47379+ /*
47380+ * look at previous unit if possible. If it is allocated, make
47381+ * preceder more precise
47382+ */
47383+ if (coord->unit_pos &&
47384+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
47385+ reiser4_pos_hint(flush_pos)->blk =
47386+ extent_get_start(ext - 1) +
47387+ extent_get_width(ext - 1);
47388+
47389+ /* allocate new block numbers for protected nodes */
47390+ extent_allocate_blocks(reiser4_pos_hint(flush_pos),
47391+ protected,
47392+ &first_allocated, &allocated,
47393+ block_stage);
47394+
47395+ /* prepare extent which will be copied to left */
47396+ reiser4_set_extent(&copy_extent, first_allocated, allocated);
47397+
47398+ result = put_unit_to_end(left, &key, &copy_extent);
47399+ if (result == -E_NODE_FULL) {
47400+ int target_block_stage;
47401+
47402+ /* free blocks which were just allocated */
47403+ target_block_stage =
47404+ (state ==
47405+ ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
47406+ BLOCK_UNALLOCATED;
47407+ reiser4_dealloc_blocks(&first_allocated, &allocated,
47408+ target_block_stage,
47409+ BA_PERMANENT);
47410+
47411+ /* rewind the preceder. */
47412+ flush_pos->preceder.blk = first_allocated;
47413+ check_preceder(flush_pos->preceder.blk);
47414+
47415+ return SQUEEZE_TARGET_FULL;
47416+ }
47417+
47418+ if (state == ALLOCATED_EXTENT) {
47419+ /* free nodes which were relocated */
47420+ reiser4_dealloc_blocks(&start, &allocated,
47421+ BLOCK_ALLOCATED, BA_DEFER);
47422+ }
47423+
47424+ /* assign new block numbers to protected nodes */
47425+ assign_real_blocknrs(flush_pos, oid, index, allocated,
47426+ first_allocated);
47427+
47428+ set_key_offset(&key,
47429+ get_key_offset(&key) +
47430+ (allocated << current_blocksize_bits));
47431+ } else {
47432+ /*
47433+ * overwrite: try to copy the unit as is to the left neighbor and
47434+ * put the leading run of not yet flushprepped nodes into the
47434+ * overwrite set
47435+ */
47436+ reiser4_set_extent(&copy_extent, start, width);
47437+ result = put_unit_to_end(left, &key, &copy_extent);
47438+ if (result == -E_NODE_FULL)
47439+ return SQUEEZE_TARGET_FULL;
47440+
47441+ if (state != HOLE_EXTENT)
47442+ mark_jnodes_overwrite(flush_pos, oid, index, width);
47443+ set_key_offset(&key,
47444+ get_key_offset(&key) +
47445+ (width << current_blocksize_bits));
47446+ }
47447+ *stop_key = key;
47448+ return SQUEEZE_CONTINUE;
47449+}
47450+
47451+int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
47452+{
47453+ return key_by_inode_and_offset_common(inode, off, key);
47454+}
47455+
47456+/*
47457+ * Local variables:
47458+ * c-indentation-style: "K&R"
47459+ * mode-name: "LC"
47460+ * c-basic-offset: 8
47461+ * tab-width: 8
47462+ * fill-column: 79
47463+ * scroll-step: 1
47464+ * End:
47465+ */
47466diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/extent.h linux-2.6.24/fs/reiser4/plugin/item/extent.h
47467--- linux-2.6.24.orig/fs/reiser4/plugin/item/extent.h 1970-01-01 03:00:00.000000000 +0300
47468+++ linux-2.6.24/fs/reiser4/plugin/item/extent.h 2008-01-25 11:40:16.698169785 +0300
47469@@ -0,0 +1,231 @@
47470+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47471+
47472+#ifndef __REISER4_EXTENT_H__
47473+#define __REISER4_EXTENT_H__
47474+
47475+/* on disk extent */
47476+typedef struct {
47477+ reiser4_dblock_nr start;
47478+ reiser4_dblock_nr width;
47479+} reiser4_extent;
47480+
47481+struct extent_stat {
47482+ int unallocated_units;
47483+ int unallocated_blocks;
47484+ int allocated_units;
47485+ int allocated_blocks;
47486+ int hole_units;
47487+ int hole_blocks;
47488+};
47489+
47490+/* extents in an extent item can be holes, unallocated extents or allocated
47491+ extents */
47492+typedef enum {
47493+ HOLE_EXTENT,
47494+ UNALLOCATED_EXTENT,
47495+ ALLOCATED_EXTENT
47496+} extent_state;
47497+
47498+#define HOLE_EXTENT_START 0
47499+#define UNALLOCATED_EXTENT_START 1
47500+#define UNALLOCATED_EXTENT_START2 2
47501+
47502+struct extent_coord_extension {
47503+ reiser4_block_nr pos_in_unit;
47504+ reiser4_block_nr width; /* width of current unit */
47505+ pos_in_node_t nr_units; /* number of units */
47506+ int ext_offset; /* offset from the beginning of zdata() */
47507+ unsigned long expected_page;
47508+#if REISER4_DEBUG
47509+ reiser4_extent extent;
47510+#endif
47511+};
47512+
47513+/* helpers to set/get fields of an on-disk extent */
47514+static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
47515+{
47516+ return le64_to_cpu(ext->start);
47517+}
47518+
47519+static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
47520+{
47521+ return le64_to_cpu(ext->width);
47522+}
47523+
47524+extern __u64 reiser4_current_block_count(void);
47525+
47526+static inline void
47527+extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
47528+{
47529+ cassert(sizeof(ext->start) == 8);
47530+ assert("nikita-2510",
47531+ ergo(start > 1, start < reiser4_current_block_count()));
47532+ put_unaligned(cpu_to_le64(start), &ext->start);
47533+}
47534+
47535+static inline void
47536+extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
47537+{
47538+ cassert(sizeof(ext->width) == 8);
47539+ assert("", width > 0);
47540+ put_unaligned(cpu_to_le64(width), &ext->width);
47541+ assert("nikita-2511",
47542+ ergo(extent_get_start(ext) > 1,
47543+ extent_get_start(ext) + width <=
47544+ reiser4_current_block_count()));
47545+}
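
/*
 * Standalone sketch (not part of the patch): on-disk encoding of a
 * reiser4_extent and the start-based state encoding suggested by the defines
 * above (0 = hole, 1 or 2 = unallocated, anything else = allocated). Uses
 * the userspace <endian.h> helpers instead of the kernel's le64 ones.
 */
#include <stdio.h>
#include <stdint.h>
#include <endian.h>

struct disk_extent { uint64_t start, width; };	/* both little-endian */

enum ext_state { EXT_HOLE, EXT_UNALLOCATED, EXT_ALLOCATED };

static enum ext_state ext_state(const struct disk_extent *e)
{
	uint64_t start = le64toh(e->start);

	if (start == 0)
		return EXT_HOLE;
	if (start == 1 || start == 2)
		return EXT_UNALLOCATED;
	return EXT_ALLOCATED;
}

int main(void)
{
	struct disk_extent e = { htole64(4096), htole64(8) };

	printf("state=%d width=%llu\n", ext_state(&e),
	       (unsigned long long)le64toh(e.width));
	return 0;
}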
47546+
47547+#define extent_item(coord) \
47548+({ \
47549+ assert("nikita-3143", item_is_extent(coord)); \
47550+ ((reiser4_extent *)item_body_by_coord (coord)); \
47551+})
47552+
47553+#define extent_by_coord(coord) \
47554+({ \
47555+ assert("nikita-3144", item_is_extent(coord)); \
47556+ (extent_item (coord) + (coord)->unit_pos); \
47557+})
47558+
47559+#define width_by_coord(coord) \
47560+({ \
47561+ assert("nikita-3145", item_is_extent(coord)); \
47562+ extent_get_width (extent_by_coord(coord)); \
47563+})
47564+
47565+struct carry_cut_data;
47566+struct carry_kill_data;
47567+
47568+/* plugin->u.item.b.* */
47569+reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
47570+int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
47571+ const reiser4_item_data *);
47572+int mergeable_extent(const coord_t * p1, const coord_t * p2);
47573+pos_in_node_t nr_units_extent(const coord_t *);
47574+lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
47575+void init_coord_extent(coord_t *);
47576+int init_extent(coord_t *, reiser4_item_data *);
47577+int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
47578+int can_shift_extent(unsigned free_space,
47579+ coord_t * source, znode * target, shift_direction,
47580+ unsigned *size, unsigned want);
47581+void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
47582+ unsigned count, shift_direction where_is_free_space,
47583+ unsigned free_space);
47584+int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
47585+ struct carry_kill_data *);
47586+int create_hook_extent(const coord_t * coord, void *arg);
47587+int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47588+ struct carry_cut_data *, reiser4_key * smallest_removed,
47589+ reiser4_key * new_first);
47590+int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47591+ struct carry_kill_data *, reiser4_key * smallest_removed,
47592+ reiser4_key * new_first);
47593+reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
47594+reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
47595+void print_extent(const char *, coord_t *);
47596+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
47597+int utmost_child_real_block_extent(const coord_t * coord, sideof side,
47598+ reiser4_block_nr * block);
47599+void item_stat_extent(const coord_t * coord, void *vp);
47600+int reiser4_check_extent(const coord_t * coord, const char **error);
47601+
47602+/* plugin->u.item.s.file.* */
47603+ssize_t reiser4_write_extent(struct file *, struct inode * inode,
47604+ const char __user *, size_t, loff_t *);
47605+int reiser4_read_extent(struct file *, flow_t *, hint_t *);
47606+int reiser4_readpage_extent(void *, struct page *);
47607+int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*);
47608+reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
47609+void init_coord_extension_extent(uf_coord_t *, loff_t offset);
47610+int get_block_address_extent(const coord_t *, sector_t block,
47611+ sector_t * result);
47612+
47613+/* these are used in flush.c
47614+ FIXME-VS: should they be somewhere in item_plugin? */
47615+int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
47616+int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
47617+ reiser4_key * stop_key);
47618+
47619+int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */
47620+__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
47621+__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
47622+
47623+/* plugin->u.item.f. */
47624+int reiser4_scan_extent(flush_scan * scan);
47625+extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
47626+
47627+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47628+ int nr_extents);
47629+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr);
47630+extent_state state_of_extent(reiser4_extent * ext);
47631+void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start,
47632+ reiser4_block_nr width);
47633+int reiser4_update_extent(struct inode *, jnode *, loff_t pos,
47634+ int *plugged_hole);
47635+
47636+#include "../../coord.h"
47637+#include "../../lock.h"
47638+#include "../../tap.h"
47639+
47640+struct replace_handle {
47641+ /* these are to be set before calling reiser4_replace_extent */
47642+ coord_t *coord;
47643+ lock_handle *lh;
47644+ reiser4_key key;
47645+ reiser4_key *pkey;
47646+ reiser4_extent overwrite;
47647+ reiser4_extent new_extents[2];
47648+ int nr_new_extents;
47649+ unsigned flags;
47650+
47651+ /* these are used by reiser4_replace_extent */
47652+ reiser4_item_data item;
47653+ coord_t coord_after;
47654+ lock_handle lh_after;
47655+ tap_t watch;
47656+ reiser4_key paste_key;
47657+#if REISER4_DEBUG
47658+ reiser4_extent orig_ext;
47659+ reiser4_key tmp;
47660+#endif
47661+};
47662+
47663+/* this structure is kmalloc-ed before calling make_extent to avoid excessive
47664+ stack consumption on the plug_hole->reiser4_replace_extent path */
47665+struct make_extent_handle {
47666+ uf_coord_t *uf_coord;
47667+ reiser4_block_nr blocknr;
47668+ int created;
47669+ struct inode *inode;
47670+ union {
47671+ struct {
47672+ } append;
47673+ struct replace_handle replace;
47674+ } u;
47675+};
47676+
47677+int reiser4_replace_extent(struct replace_handle *,
47678+ int return_inserted_position);
47679+lock_handle *znode_lh(znode *);
47680+
47681+/* the reiser4 repacker support */
47682+struct repacker_cursor;
47683+extern int process_extent_backward_for_repacking(tap_t *,
47684+ struct repacker_cursor *);
47685+extern int mark_extent_for_repacking(tap_t *, int);
47686+
47687+#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
47688+#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
47689+
47690+/* __REISER4_EXTENT_H__ */
47691+#endif
47692+/*
47693+ Local variables:
47694+ c-indentation-style: "K&R"
47695+ mode-name: "LC"
47696+ c-basic-offset: 8
47697+ tab-width: 8
47698+ fill-column: 120
47699+ End:
47700+*/
47701diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.24/fs/reiser4/plugin/item/extent_item_ops.c
47702--- linux-2.6.24.orig/fs/reiser4/plugin/item/extent_item_ops.c 1970-01-01 03:00:00.000000000 +0300
47703+++ linux-2.6.24/fs/reiser4/plugin/item/extent_item_ops.c 2008-01-25 11:39:07.016228297 +0300
47704@@ -0,0 +1,889 @@
47705+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47706+
47707+#include "item.h"
47708+#include "../../inode.h"
47709+#include "../../tree_walk.h" /* check_sibling_list() */
47710+#include "../../page_cache.h"
47711+#include "../../carry.h"
47712+
47713+#include <linux/quotaops.h>
47714+
47715+/* item_plugin->b.max_key_inside */
47716+reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
47717+{
47718+ item_key_by_coord(coord, key);
47719+ set_key_offset(key, get_key_offset(reiser4_max_key()));
47720+ return key;
47721+}
47722+
47723+/* item_plugin->b.can_contain_key
47724+ this checks whether @key of @data matches the position set by @coord */
47725+int
47726+can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
47727+ const reiser4_item_data * data)
47728+{
47729+ reiser4_key item_key;
47730+
47731+ if (item_plugin_by_coord(coord) != data->iplug)
47732+ return 0;
47733+
47734+ item_key_by_coord(coord, &item_key);
47735+ if (get_key_locality(key) != get_key_locality(&item_key) ||
47736+ get_key_objectid(key) != get_key_objectid(&item_key) ||
47737+ get_key_ordering(key) != get_key_ordering(&item_key))
47738+ return 0;
47739+
47740+ return 1;
47741+}
47742+
47743+/* item_plugin->b.mergeable
47744+ first item is of extent type */
47745+/* Audited by: green(2002.06.13) */
47746+int mergeable_extent(const coord_t * p1, const coord_t * p2)
47747+{
47748+ reiser4_key key1, key2;
47749+
47750+ assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
47751+ /* FIXME-VS: Which is it? Assert or return 0 */
47752+ if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
47753+ return 0;
47754+ }
47755+
47756+ item_key_by_coord(p1, &key1);
47757+ item_key_by_coord(p2, &key2);
47758+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
47759+ get_key_objectid(&key1) != get_key_objectid(&key2) ||
47760+ get_key_ordering(&key1) != get_key_ordering(&key2) ||
47761+ get_key_type(&key1) != get_key_type(&key2))
47762+ return 0;
47763+ if (get_key_offset(&key1) +
47764+ reiser4_extent_size(p1, nr_units_extent(p1)) !=
47765+ get_key_offset(&key2))
47766+ return 0;
47767+ return 1;
47768+}
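
/*
 * Standalone sketch (not part of the patch) of the final check in
 * mergeable_extent(): two extent items of the same object merge only if the
 * second one starts at the file byte where the first one ends.
 */
#include <stdio.h>
#include <stdint.h>

/* item1 covers [off1, off1 + bytes1) of the file */
static int contiguous(uint64_t off1, uint64_t bytes1, uint64_t off2)
{
	return off1 + bytes1 == off2;
}

int main(void)
{
	/* 3 blocks of 4096 bytes starting at file offset 0 */
	printf("%d\n", contiguous(0, 3 * 4096, 12288));	/* 1: mergeable */
	printf("%d\n", contiguous(0, 3 * 4096, 16384));	/* 0: gap */
	return 0;
}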
47769+
47770+/* item_plugin->b.nr_units */
47771+pos_in_node_t nr_units_extent(const coord_t * coord)
47772+{
47773+ /* length of an extent item has to be a multiple of extent size */
47774+ assert("vs-1424",
47775+ (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
47776+ return item_length_by_coord(coord) / sizeof(reiser4_extent);
47777+}
47778+
47779+/* item_plugin->b.lookup */
47780+lookup_result
47781+lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
47782+ coord_t * coord)
47783+{ /* znode and item_pos are
47784+ set to an extent item to
47785+ look through */
47786+ reiser4_key item_key;
47787+ reiser4_block_nr lookuped, offset;
47788+ unsigned i, nr_units;
47789+ reiser4_extent *ext;
47790+ unsigned blocksize;
47791+ unsigned char blocksize_bits;
47792+
47793+ item_key_by_coord(coord, &item_key);
47794+ offset = get_key_offset(&item_key);
47795+
47796+ /* key we are looking for must be greater than key of item @coord */
47797+ assert("vs-414", keygt(key, &item_key));
47798+
47799+ assert("umka-99945",
47800+ !keygt(key, max_key_inside_extent(coord, &item_key)));
47801+
47802+ ext = extent_item(coord);
47803+ assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
47804+
47805+ blocksize = current_blocksize;
47806+ blocksize_bits = current_blocksize_bits;
47807+
47808+ /* offset we are looking for */
47809+ lookuped = get_key_offset(key);
47810+
47811+ nr_units = nr_units_extent(coord);
47812+ /* go through all extents until the one that addresses the given offset */
47813+ for (i = 0; i < nr_units; i++, ext++) {
47814+ offset += (extent_get_width(ext) << blocksize_bits);
47815+ if (offset > lookuped) {
47816+ /* desired byte is somewhere in this extent */
47817+ coord->unit_pos = i;
47818+ coord->between = AT_UNIT;
47819+ return CBK_COORD_FOUND;
47820+ }
47821+ }
47822+
47823+ /* set coord after last unit */
47824+ coord->unit_pos = nr_units - 1;
47825+ coord->between = AFTER_UNIT;
47826+ return CBK_COORD_FOUND;
47827+}
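
/*
 * Standalone sketch (not part of the patch) of the unit walk in
 * lookup_extent() above: accumulate each unit's byte length until the target
 * offset falls inside a unit. Simplified types; blkbits plays the role of
 * current_blocksize_bits.
 */
#include <stdio.h>
#include <stdint.h>

struct ext { uint64_t start, width; };

static int find_unit(const struct ext *u, unsigned n, unsigned blkbits,
		     uint64_t item_off, uint64_t target)
{
	unsigned i;
	uint64_t off = item_off;

	for (i = 0; i < n; i++) {
		off += u[i].width << blkbits;
		if (off > target)
			return i;	/* target byte lives in unit i */
	}
	return n - 1;			/* past the end: after last unit */
}

int main(void)
{
	struct ext units[] = { {100, 3}, {0, 2}, {200, 1} };	/* middle is a hole */

	printf("unit=%d\n", find_unit(units, 3, 12, 0, 3 * 4096 + 10));
	return 0;
}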
47828+
47829+/* item_plugin->b.paste
47830+ the item @coord is set to has been appended with @data->length bytes of
47831+ free space. data->data contains the data to be pasted into the item at
47832+ position @coord->in_item.unit_pos; it must fit into that free space.
47833+ @coord must be set between units.
47834+*/
47835+int
47836+paste_extent(coord_t * coord, reiser4_item_data * data,
47837+ carry_plugin_info * info UNUSED_ARG)
47838+{
47839+ unsigned old_nr_units;
47840+ reiser4_extent *ext;
47841+ int item_length;
47842+
47843+ ext = extent_item(coord);
47844+ item_length = item_length_by_coord(coord);
47845+ old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
47846+
47847+ /* this is also used to copy extent into newly created item, so
47848+ old_nr_units could be 0 */
47849+ assert("vs-260", item_length >= data->length);
47850+
47851+ /* make sure that coord is set properly */
47852+ assert("vs-35",
47853+ ((!coord_is_existing_unit(coord))
47854+ || (!old_nr_units && !coord->unit_pos)));
47855+
47856+ /* first unit to be moved */
47857+ switch (coord->between) {
47858+ case AFTER_UNIT:
47859+ coord->unit_pos++;
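+		/* fall through */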
47860+ case BEFORE_UNIT:
47861+ coord->between = AT_UNIT;
47862+ break;
47863+ case AT_UNIT:
47864+ assert("vs-331", !old_nr_units && !coord->unit_pos);
47865+ break;
47866+ default:
47867+ impossible("vs-330", "coord is set improperly");
47868+ }
47869+
47870+ /* prepare space for new units */
47871+ memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
47872+ ext + coord->unit_pos,
47873+ (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
47874+
47875+ /* copy new data from kernel space */
47876+ assert("vs-556", data->user == 0);
47877+ memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
47878+
47879+ /* after paste @coord is set to first of pasted units */
47880+ assert("vs-332", coord_is_existing_unit(coord));
47881+ assert("vs-333",
47882+ !memcmp(data->data, extent_by_coord(coord),
47883+ (unsigned)data->length));
47884+ return 0;
47885+}
47886+
47887+/* item_plugin->b.can_shift */
47888+int
47889+can_shift_extent(unsigned free_space, coord_t * source,
47890+ znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
47891+ unsigned *size, unsigned want)
47892+{
47893+ *size = item_length_by_coord(source);
47894+ if (*size > free_space)
47895+ /* never split a unit of extent item */
47896+ *size = free_space - free_space % sizeof(reiser4_extent);
47897+
47898+ /* we can shift *size bytes, calculate how many do we want to shift */
47899+ if (*size > want * sizeof(reiser4_extent))
47900+ *size = want * sizeof(reiser4_extent);
47901+
47902+ if (*size % sizeof(reiser4_extent) != 0)
47903+ impossible("vs-119", "Wrong extent size: %i %zd", *size,
47904+ sizeof(reiser4_extent));
47905+ return *size / sizeof(reiser4_extent);
47906+
47907+}
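
/*
 * Standalone sketch (not part of the patch) of the sizing logic in
 * can_shift_extent() above: only whole extent units are ever shifted. UNIT
 * stands in for sizeof(reiser4_extent), i.e. two 64-bit fields.
 */
#include <stdio.h>

#define UNIT 16u

static unsigned shiftable_bytes(unsigned item_len, unsigned free_space,
				unsigned want_units)
{
	unsigned size = item_len;

	if (size > free_space)
		size = free_space - free_space % UNIT;	/* never split a unit */
	if (size > want_units * UNIT)
		size = want_units * UNIT;
	return size;
}

int main(void)
{
	printf("%u\n", shiftable_bytes(80, 50, 2));	/* -> 32: two whole units */
	return 0;
}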
47908+
47909+/* item_plugin->b.copy_units */
47910+void
47911+copy_units_extent(coord_t * target, coord_t * source,
47912+ unsigned from, unsigned count,
47913+ shift_direction where_is_free_space, unsigned free_space)
47914+{
47915+ char *from_ext, *to_ext;
47916+
47917+ assert("vs-217", free_space == count * sizeof(reiser4_extent));
47918+
47919+ from_ext = item_body_by_coord(source);
47920+ to_ext = item_body_by_coord(target);
47921+
47922+ if (where_is_free_space == SHIFT_LEFT) {
47923+ assert("vs-215", from == 0);
47924+
47925+ /* At this moment, item length was already updated in the item
47926+ header by shifting code, hence nr_units_extent() will
47927+ return the "new" number of units, i.e. the one we obtain
47928+ after copying units.
47929+ */
47930+ to_ext +=
47931+ (nr_units_extent(target) - count) * sizeof(reiser4_extent);
47932+ } else {
47933+ reiser4_key key;
47934+ coord_t coord;
47935+
47936+ assert("vs-216",
47937+ from + count == coord_last_unit_pos(source) + 1);
47938+
47939+ from_ext += item_length_by_coord(source) - free_space;
47940+
47941+ /* new units are inserted before the first unit of the item,
47942+ therefore we have to update the item key */
47943+ coord = *source;
47944+ coord.unit_pos = from;
47945+ unit_key_extent(&coord, &key);
47946+
47947+ node_plugin_by_node(target->node)->update_item_key(target, &key,
47948+ NULL /*info */);
47949+ }
47950+
47951+ memcpy(to_ext, from_ext, free_space);
47952+}
47953+
47954+/* item_plugin->b.create_hook
47955+ @arg is the coord of the leaf-level child for which the right delimiting key has to be updated */
47956+int create_hook_extent(const coord_t * coord, void *arg)
47957+{
47958+ coord_t *child_coord;
47959+ znode *node;
47960+ reiser4_key key;
47961+ reiser4_tree *tree;
47962+
47963+ if (!arg)
47964+ return 0;
47965+
47966+ child_coord = arg;
47967+ tree = znode_get_tree(coord->node);
47968+
47969+ assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
47970+
47971+ write_lock_tree(tree);
47972+ write_lock_dk(tree);
47973+ /* find the leaf-level node for which the right delimiting key has to
47974+ be updated */
47975+ if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
47976+ assert("vs-411", znode_is_left_connected(child_coord->node));
47977+ node = child_coord->node->left;
47978+ } else {
47979+ assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
47980+ node = child_coord->node;
47981+ assert("nikita-3314", node != NULL);
47982+ }
47983+
47984+ if (node != NULL) {
47985+ znode_set_rd_key(node, item_key_by_coord(coord, &key));
47986+
47987+ assert("nikita-3282", check_sibling_list(node));
47988+ /* break sibling links */
47989+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
47990+ ON_DEBUG(node->right->left_version =
47991+ atomic_inc_return(&delim_key_version);
47992+ node->right_version =
47993+ atomic_inc_return(&delim_key_version););
47994+
47995+ node->right->left = NULL;
47996+ node->right = NULL;
47997+ }
47998+ }
47999+ write_unlock_dk(tree);
48000+ write_unlock_tree(tree);
48001+ return 0;
48002+}
48003+
48004+#define ITEM_TAIL_KILLED 0
48005+#define ITEM_HEAD_KILLED 1
48006+#define ITEM_KILLED 2
48007+
48008+/* item_plugin->b.kill_hook
48009+ this is called when @count units starting from the @from-th one are going to be removed
48010+ */
48011+int
48012+kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
48013+ struct carry_kill_data *kdata)
48014+{
48015+ reiser4_extent *ext;
48016+ reiser4_block_nr start, length;
48017+ const reiser4_key *pfrom_key, *pto_key;
48018+ struct inode *inode;
48019+ reiser4_tree *tree;
48020+ pgoff_t from_off, to_off, offset, skip;
48021+ int retval;
48022+
48023+ /* these all point into the memory kmalloc-ed by kill_node_content */
48024+ reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
48025+ coord_t *dup, *next;
48026+
48027+ assert("zam-811", znode_is_write_locked(coord->node));
48028+ assert("nikita-3315", kdata != NULL);
48029+ assert("vs-34", kdata->buf != NULL);
48030+
48031+ /* map structures to kdata->buf */
48032+ min_item_key = (reiser4_key *) (kdata->buf);
48033+ max_item_key = min_item_key + 1;
48034+ from_key = max_item_key + 1;
48035+ to_key = from_key + 1;
48036+ key = to_key + 1;
48037+ dup = (coord_t *) (key + 1);
48038+ next = dup + 1;
48039+
48040+ item_key_by_coord(coord, min_item_key);
48041+ max_item_key_by_coord(coord, max_item_key);
48042+
48043+ if (kdata->params.from_key) {
48044+ pfrom_key = kdata->params.from_key;
48045+ pto_key = kdata->params.to_key;
48046+ } else {
48047+ assert("vs-1549", from == coord->unit_pos);
48048+ unit_key_by_coord(coord, from_key);
48049+ pfrom_key = from_key;
48050+
48051+ coord_dup(dup, coord);
48052+ dup->unit_pos = from + count - 1;
48053+ max_unit_key_by_coord(dup, to_key);
48054+ pto_key = to_key;
48055+ }
48056+
48057+ if (!keylt(pto_key, max_item_key)) {
48058+ if (!keygt(pfrom_key, min_item_key)) {
48059+ znode *left, *right;
48060+
48061+ /* item is to be removed completely */
48062+ assert("nikita-3316", kdata->left != NULL
48063+ && kdata->right != NULL);
48064+
48065+ left = kdata->left->node;
48066+ right = kdata->right->node;
48067+
48068+ tree = current_tree;
48069+ /* we have to do two things:
48070+ *
48071+ * 1. link left and right formatted neighbors of
48072+ * extent being removed, and
48073+ *
48074+ * 2. update their delimiting keys.
48075+ *
48076+ * atomicity of these operations is protected by
48077+ * taking dk-lock and tree-lock.
48078+ */
48079+ /* if neighbors of item being removed are znodes -
48080+ * link them */
48081+ write_lock_tree(tree);
48082+ write_lock_dk(tree);
48083+ link_left_and_right(left, right);
48084+ if (left) {
48085+ /* update right delimiting key of left
48086+ * neighbor of extent item */
48087+ /*coord_t next;
48088+ reiser4_key key; */
48089+
48090+ coord_dup(next, coord);
48091+
48092+ if (coord_next_item(next))
48093+ *key = *znode_get_rd_key(coord->node);
48094+ else
48095+ item_key_by_coord(next, key);
48096+ znode_set_rd_key(left, key);
48097+ }
48098+ write_unlock_dk(tree);
48099+ write_unlock_tree(tree);
48100+
48101+ from_off =
48102+ get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
48103+ to_off =
48104+ (get_key_offset(max_item_key) +
48105+ 1) >> PAGE_CACHE_SHIFT;
48106+ retval = ITEM_KILLED;
48107+ } else {
48108+ /* tail of item is to be removed */
48109+ from_off =
48110+ (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
48111+ 1) >> PAGE_CACHE_SHIFT;
48112+ to_off =
48113+ (get_key_offset(max_item_key) +
48114+ 1) >> PAGE_CACHE_SHIFT;
48115+ retval = ITEM_TAIL_KILLED;
48116+ }
48117+ } else {
48118+ /* head of item is to be removed */
48119+ assert("vs-1571", keyeq(pfrom_key, min_item_key));
48120+ assert("vs-1572",
48121+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
48122+ 0);
48123+ assert("vs-1573",
48124+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
48125+ 1)) == 0);
48126+
48127+ if (kdata->left->node) {
48128+ /* update right delimiting key of left neighbor of extent item */
48129+ /*reiser4_key key; */
48130+
48131+ *key = *pto_key;
48132+ set_key_offset(key, get_key_offset(pto_key) + 1);
48133+
48134+ write_lock_dk(current_tree);
48135+ znode_set_rd_key(kdata->left->node, key);
48136+ write_unlock_dk(current_tree);
48137+ }
48138+
48139+ from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
48140+ to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
48141+ retval = ITEM_HEAD_KILLED;
48142+ }
48143+
48144+ inode = kdata->inode;
48145+ assert("vs-1545", inode != NULL);
48146+ if (inode != NULL)
48147+ /* take care of pages and jnodes corresponding to part of item being killed */
48148+ reiser4_invalidate_pages(inode->i_mapping, from_off,
48149+ to_off - from_off,
48150+ kdata->params.truncate);
48151+
48152+ ext = extent_item(coord) + from;
48153+ offset =
48154+ (get_key_offset(min_item_key) +
48155+ reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
48156+
48157+ assert("vs-1551", from_off >= offset);
48158+ assert("vs-1552", from_off - offset <= extent_get_width(ext));
48159+ skip = from_off - offset;
48160+ offset = from_off;
48161+
48162+ while (offset < to_off) {
48163+ length = extent_get_width(ext) - skip;
48164+ if (state_of_extent(ext) == HOLE_EXTENT) {
48165+ skip = 0;
48166+ offset += length;
48167+ ext++;
48168+ continue;
48169+ }
48170+
48171+ if (offset + length > to_off) {
48172+ length = to_off - offset;
48173+ }
48174+
48175+ DQUOT_FREE_BLOCK_NODIRTY(inode, length);
48176+
48177+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
48178+ /* free fake-allocated space grabbed for jnodes of this unallocated extent */
48179+ fake_allocated2free(length, 0 /* unformatted */ );
48180+
48181+ skip = 0;
48182+ offset += length;
48183+ ext++;
48184+ continue;
48185+ }
48186+
48187+ assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
48188+
48189+ if (length != 0) {
48190+ start = extent_get_start(ext) + skip;
48191+
48192+ /* the BA_DEFER bit parameter is turned on because blocks which get
48193+ freed here are not yet safe to be reused */
48194+ reiser4_dealloc_blocks(&start, &length,
48195+ 0 /* not used */ ,
48196+ BA_DEFER
48197+ /* unformatted with defer */ );
48198+ }
48199+ skip = 0;
48200+ offset += length;
48201+ ext++;
48202+ }
48203+ return retval;
48204+}
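
/*
 * Standalone sketch (not part of the patch) of the page-range arithmetic in
 * kill_hook_extent() above: a head cut rounds its first byte up to a page
 * boundary, full and tail cuts round past-the-end down. SHIFT stands in for
 * PAGE_CACHE_SHIFT and is assumed to be 12 (4096-byte pages).
 */
#include <stdio.h>
#include <stdint.h>

#define SHIFT 12
#define SIZE  (1UL << SHIFT)

int main(void)
{
	uint64_t from_key = 6000, max_key = 20479;	/* byte range of the cut */
	uint64_t from_off = (from_key + SIZE - 1) >> SHIFT;	/* first whole page: 2 */
	uint64_t to_off = (max_key + 1) >> SHIFT;		/* past-the-end page: 5 */

	printf("invalidate pages [%llu, %llu)\n",
	       (unsigned long long)from_off, (unsigned long long)to_off);
	return 0;
}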
48205+
48206+/* item_plugin->b.kill_units */
48207+int
48208+kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48209+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
48210+ reiser4_key * new_first)
48211+{
48212+ reiser4_extent *ext;
48213+ reiser4_key item_key;
48214+ pos_in_node_t count;
48215+ reiser4_key from_key, to_key;
48216+ const reiser4_key *pfrom_key, *pto_key;
48217+ loff_t off;
48218+ int result;
48219+
48220+ assert("vs-1541",
48221+ ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
48222+ || (kdata->params.from_key != NULL
48223+ && kdata->params.to_key != NULL)));
48224+
48225+ if (kdata->params.from_key) {
48226+ pfrom_key = kdata->params.from_key;
48227+ pto_key = kdata->params.to_key;
48228+ } else {
48229+ coord_t dup;
48230+
48231+ /* calculate key range of kill */
48232+ assert("vs-1549", from == coord->unit_pos);
48233+ unit_key_by_coord(coord, &from_key);
48234+ pfrom_key = &from_key;
48235+
48236+ coord_dup(&dup, coord);
48237+ dup.unit_pos = to;
48238+ max_unit_key_by_coord(&dup, &to_key);
48239+ pto_key = &to_key;
48240+ }
48241+
48242+ item_key_by_coord(coord, &item_key);
48243+
48244+#if REISER4_DEBUG
48245+ {
48246+ reiser4_key max_item_key;
48247+
48248+ max_item_key_by_coord(coord, &max_item_key);
48249+
48250+ if (new_first) {
48251+ /* head of item is to be cut */
48252+ assert("vs-1542", keyeq(pfrom_key, &item_key));
48253+ assert("vs-1538", keylt(pto_key, &max_item_key));
48254+ } else {
48255+ /* tail of item is to be cut */
48256+ assert("vs-1540", keygt(pfrom_key, &item_key));
48257+ assert("vs-1543", !keylt(pto_key, &max_item_key));
48258+ }
48259+ }
48260+#endif
48261+
48262+ if (smallest_removed)
48263+ *smallest_removed = *pfrom_key;
48264+
48265+ if (new_first) {
48266+ /* the item head is cut, so the item key will change; the new key is calculated here */
48267+ assert("vs-1556",
48268+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
48269+ (PAGE_CACHE_SIZE - 1));
48270+ *new_first = *pto_key;
48271+ set_key_offset(new_first, get_key_offset(new_first) + 1);
48272+ }
48273+
48274+ count = to - from + 1;
48275+ result = kill_hook_extent(coord, from, count, kdata);
48276+ if (result == ITEM_TAIL_KILLED) {
48277+ assert("vs-1553",
48278+ get_key_offset(pfrom_key) >=
48279+ get_key_offset(&item_key) +
48280+ reiser4_extent_size(coord, from));
48281+ off =
48282+ get_key_offset(pfrom_key) -
48283+ (get_key_offset(&item_key) +
48284+ reiser4_extent_size(coord, from));
48285+ if (off) {
48286+ /* unit @from is to be cut partially. Its width decreases */
48287+ ext = extent_item(coord) + from;
48288+ extent_set_width(ext,
48289+ (off + PAGE_CACHE_SIZE -
48290+ 1) >> PAGE_CACHE_SHIFT);
48291+ count--;
48292+ }
48293+ } else {
48294+ __u64 max_to_offset;
48295+ __u64 rest;
48296+
48297+ assert("vs-1575", result == ITEM_HEAD_KILLED);
48298+ assert("", from == 0);
48299+ assert("",
48300+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
48301+ 1)) == 0);
48302+ assert("",
48303+ get_key_offset(pto_key) + 1 >
48304+ get_key_offset(&item_key) +
48305+ reiser4_extent_size(coord, to));
48306+ max_to_offset =
48307+ get_key_offset(&item_key) +
48308+ reiser4_extent_size(coord, to + 1) - 1;
48309+ assert("", get_key_offset(pto_key) <= max_to_offset);
48310+
48311+ rest =
48312+ (max_to_offset -
48313+ get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
48314+ if (rest) {
48315+ /* unit @to is to be cut partially */
48316+ ext = extent_item(coord) + to;
48317+
48318+ assert("", extent_get_width(ext) > rest);
48319+
48320+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
48321+ extent_set_start(ext,
48322+ extent_get_start(ext) +
48323+ (extent_get_width(ext) -
48324+ rest));
48325+
48326+ extent_set_width(ext, rest);
48327+ count--;
48328+ }
48329+ }
48330+ return count * sizeof(reiser4_extent);
48331+}
48332+
48333+/* item_plugin->b.cut_units
48334+ this is too similar to kill_units_extent */
48335+int
48336+cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48337+ struct carry_cut_data *cdata, reiser4_key * smallest_removed,
48338+ reiser4_key * new_first)
48339+{
48340+ reiser4_extent *ext;
48341+ reiser4_key item_key;
48342+ pos_in_node_t count;
48343+ reiser4_key from_key, to_key;
48344+ const reiser4_key *pfrom_key, *pto_key;
48345+ loff_t off;
48346+
48347+ assert("vs-1541",
48348+ ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
48349+ || (cdata->params.from_key != NULL
48350+ && cdata->params.to_key != NULL)));
48351+
48352+ if (cdata->params.from_key) {
48353+ pfrom_key = cdata->params.from_key;
48354+ pto_key = cdata->params.to_key;
48355+ } else {
48356+ coord_t dup;
48357+
48358+ /* calculate key range of cut */
48359+ coord_dup(&dup, coord);
48360+ dup.unit_pos = from;
48361+ unit_key_by_coord(&dup, &from_key);
48362+
48363+ dup.unit_pos = to;
48364+ max_unit_key_by_coord(&dup, &to_key);
48365+
48366+ pfrom_key = &from_key;
48367+ pto_key = &to_key;
48368+ }
48369+
48370+ assert("vs-1555",
48371+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
48372+ assert("vs-1556",
48373+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
48374+ (PAGE_CACHE_SIZE - 1));
48375+
48376+ item_key_by_coord(coord, &item_key);
48377+
48378+#if REISER4_DEBUG
48379+ {
48380+ reiser4_key max_item_key;
48381+
48382+ assert("vs-1584",
48383+ get_key_locality(pfrom_key) ==
48384+ get_key_locality(&item_key));
48385+ assert("vs-1585",
48386+ get_key_type(pfrom_key) == get_key_type(&item_key));
48387+ assert("vs-1586",
48388+ get_key_objectid(pfrom_key) ==
48389+ get_key_objectid(&item_key));
48390+ assert("vs-1587",
48391+ get_key_ordering(pfrom_key) ==
48392+ get_key_ordering(&item_key));
48393+
48394+ max_item_key_by_coord(coord, &max_item_key);
48395+
48396+ if (new_first != NULL) {
48397+ /* head of item is to be cut */
48398+ assert("vs-1542", keyeq(pfrom_key, &item_key));
48399+ assert("vs-1538", keylt(pto_key, &max_item_key));
48400+ } else {
48401+ /* tail of item is to be cut */
48402+ assert("vs-1540", keygt(pfrom_key, &item_key));
48403+ assert("vs-1543", keyeq(pto_key, &max_item_key));
48404+ }
48405+ }
48406+#endif
48407+
48408+ if (smallest_removed)
48409+ *smallest_removed = *pfrom_key;
48410+
48411+ if (new_first) {
48412+ /* the item head is cut, so the item key will change; the new key is calculated here */
48413+ *new_first = *pto_key;
48414+ set_key_offset(new_first, get_key_offset(new_first) + 1);
48415+ }
48416+
48417+ count = to - from + 1;
48418+
48419+ assert("vs-1553",
48420+ get_key_offset(pfrom_key) >=
48421+ get_key_offset(&item_key) + reiser4_extent_size(coord, from));
48422+ off =
48423+ get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
48424+ reiser4_extent_size(coord, from));
48425+ if (off) {
48426+ /* tail of unit @from is to be cut partially. Its width decreases */
48427+ assert("vs-1582", new_first == NULL);
48428+ ext = extent_item(coord) + from;
48429+ extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
48430+ count--;
48431+ }
48432+
48433+ assert("vs-1554",
48434+ get_key_offset(pto_key) <=
48435+ get_key_offset(&item_key) +
48436+ reiser4_extent_size(coord, to + 1) - 1);
48437+ off =
48438+ (get_key_offset(&item_key) +
48439+ reiser4_extent_size(coord, to + 1) - 1) -
48440+ get_key_offset(pto_key);
48441+ if (off) {
48442+ /* @to_key is smaller than the max key of unit @to, so unit @to will not be
48443+ removed. Its start is increased and its width decreased. */
48444+ assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
48445+ ext = extent_item(coord) + to;
48446+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
48447+ extent_set_start(ext,
48448+ extent_get_start(ext) +
48449+ (extent_get_width(ext) -
48450+ (off >> PAGE_CACHE_SHIFT)));
48451+
48452+ extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
48453+ count--;
48454+ }
48455+ return count * sizeof(reiser4_extent);
48456+}
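
/*
 * Standalone sketch (not part of the patch) of the partial cut of unit @to
 * above: when 'rest' blocks of an allocated unit survive at its tail, the
 * unit keeps its last 'rest' blocks.
 */
#include <stdio.h>
#include <stdint.h>

struct ext { uint64_t start, width; };

static void keep_tail(struct ext *e, uint64_t rest)
{
	e->start += e->width - rest;
	e->width = rest;
}

int main(void)
{
	struct ext e = { 500, 8 };

	keep_tail(&e, 3);	/* the cut covered the first 5 blocks */
	printf("start=%llu width=%llu\n",
	       (unsigned long long)e.start, (unsigned long long)e.width);
	return 0;
}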
48457+
48458+/* item_plugin->b.unit_key */
48459+reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
48460+{
48461+ assert("vs-300", coord_is_existing_unit(coord));
48462+
48463+ item_key_by_coord(coord, key);
48464+ set_key_offset(key,
48465+ (get_key_offset(key) +
48466+ reiser4_extent_size(coord, coord->unit_pos)));
48467+
48468+ return key;
48469+}
48470+
48471+/* item_plugin->b.max_unit_key */
48472+reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
48473+{
48474+ assert("vs-300", coord_is_existing_unit(coord));
48475+
48476+ item_key_by_coord(coord, key);
48477+ set_key_offset(key,
48478+ (get_key_offset(key) +
48479+ reiser4_extent_size(coord, coord->unit_pos + 1) - 1));
48480+ return key;
48481+}
48482+
48483+/* item_plugin->b.estimate
48484+ item_plugin->b.item_data_by_flow */
48485+
48486+#if REISER4_DEBUG
48487+
48488+/* item_plugin->b.check
48489+ used for debugging; every item should implement here the most complete
48490+ consistency check of the item that its author can construct
48492+*/
48493+int reiser4_check_extent(const coord_t * coord /* coord of item to check */,
48494+ const char **error /* where to store error message */)
48495+{
48496+ reiser4_extent *ext, *first;
48497+ unsigned i, j;
48498+ reiser4_block_nr start, width, blk_cnt;
48499+ unsigned num_units;
48500+ reiser4_tree *tree;
48501+ oid_t oid;
48502+ reiser4_key key;
48503+ coord_t scan;
48504+
48505+ assert("vs-933", REISER4_DEBUG);
48506+
48507+ if (znode_get_level(coord->node) != TWIG_LEVEL) {
48508+ *error = "Extent on the wrong level";
48509+ return -1;
48510+ }
48511+ if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
48512+ *error = "Wrong item size";
48513+ return -1;
48514+ }
48515+ ext = first = extent_item(coord);
48516+ blk_cnt = reiser4_block_count(reiser4_get_current_sb());
48517+ num_units = coord_num_units(coord);
48518+ tree = znode_get_tree(coord->node);
48519+ item_key_by_coord(coord, &key);
48520+ oid = get_key_objectid(&key);
48521+ coord_dup(&scan, coord);
48522+
48523+ for (i = 0; i < num_units; ++i, ++ext) {
48524+ __u64 index;
48525+
48526+ scan.unit_pos = i;
48527+ index = extent_unit_index(&scan);
48528+
48529+#if 0
48530+ /* check that all jnodes are present for the unallocated
48531+ * extent */
48532+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
48533+ for (j = 0; j < extent_get_width(ext); j++) {
48534+ jnode *node;
48535+
48536+ node = jlookup(tree, oid, index + j);
48537+ if (node == NULL) {
48538+ print_coord("scan", &scan, 0);
48539+ *error = "Jnode missing";
48540+ return -1;
48541+ }
48542+ jput(node);
48543+ }
48544+ }
48545+#endif
48546+
48547+ start = extent_get_start(ext);
48548+ if (start < 2)
48549+ continue;
48550+ /* extent is an allocated one */
48551+ width = extent_get_width(ext);
48552+ if (start >= blk_cnt) {
48553+ *error = "Start too large";
48554+ return -1;
48555+ }
48556+ if (start + width > blk_cnt) {
48557+ *error = "End too large";
48558+ return -1;
48559+ }
48560+ /* make sure that this extent does not overlap with other
48561+ allocated extents */
48562+ for (j = 0; j < i; j++) {
48563+ if (state_of_extent(first + j) != ALLOCATED_EXTENT)
48564+ continue;
48565+ if (!
48566+ ((extent_get_start(ext) >=
48567+ extent_get_start(first + j) +
48568+ extent_get_width(first + j))
48569+ || (extent_get_start(ext) +
48570+ extent_get_width(ext) <=
48571+ extent_get_start(first + j)))) {
48572+ *error = "Extent overlaps with others";
48573+ return -1;
48574+ }
48575+ }
48576+
48577+ }
48578+
48579+ return 0;
48580+}
48581+
48582+#endif /* REISER4_DEBUG */
48583+
48584+/*
48585+ Local variables:
48586+ c-indentation-style: "K&R"
48587+ mode-name: "LC"
48588+ c-basic-offset: 8
48589+ tab-width: 8
48590+ fill-column: 120
48591+ scroll-step: 1
48592+ End:
48593+*/
48594diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/internal.c linux-2.6.24/fs/reiser4/plugin/item/internal.c
48595--- linux-2.6.24.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 03:00:00.000000000 +0300
48596+++ linux-2.6.24/fs/reiser4/plugin/item/internal.c 2008-01-25 11:39:07.020229327 +0300
48597@@ -0,0 +1,396 @@
48598+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48599+
48600+/* Implementation of internal-item plugin methods. */
48601+
48602+#include "../../forward.h"
48603+#include "../../debug.h"
48604+#include "../../dformat.h"
48605+#include "../../key.h"
48606+#include "../../coord.h"
48607+#include "internal.h"
48608+#include "item.h"
48609+#include "../node/node.h"
48610+#include "../plugin.h"
48611+#include "../../jnode.h"
48612+#include "../../znode.h"
48613+#include "../../tree_walk.h"
48614+#include "../../tree_mod.h"
48615+#include "../../tree.h"
48616+#include "../../super.h"
48617+#include "../../block_alloc.h"
48618+
48619+/* see internal.h for explanation */
48620+
48621+/* plugin->u.item.b.mergeable */
48622+int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
48623+ const coord_t * p2 UNUSED_ARG /* second item */ )
48624+{
48625+ /* internal items are not mergeable */
48626+ return 0;
48627+}
48628+
48629+/* ->lookup() method for internal items */
48630+lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
48631+ lookup_bias bias UNUSED_ARG /* lookup bias */ ,
48632+ coord_t * coord /* coord of item */ )
48633+{
48634+ reiser4_key ukey;
48635+
48636+ switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
48637+ default:
48638+ impossible("", "keycmp()?!");
48639+ case LESS_THAN:
48640+ /* FIXME-VS: AFTER_ITEM used to be here. But with the new coord the
48641+ item plugin cannot be obtained from a coord set this way */
48642+ assert("vs-681", coord->unit_pos == 0);
48643+ coord->between = AFTER_UNIT;
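+		/* fall through */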
48644+ case EQUAL_TO:
48645+ return CBK_COORD_FOUND;
48646+ case GREATER_THAN:
48647+ return CBK_COORD_NOTFOUND;
48648+ }
48649+}
48650+
48651+/* return body of internal item at @coord */
48652+static internal_item_layout *internal_at(const coord_t * coord /* coord of
48653+ * item */ )
48654+{
48655+ assert("nikita-607", coord != NULL);
48656+ assert("nikita-1650",
48657+ item_plugin_by_coord(coord) ==
48658+ item_plugin_by_id(NODE_POINTER_ID));
48659+ return (internal_item_layout *) item_body_by_coord(coord);
48660+}
48661+
48662+void reiser4_update_internal(const coord_t * coord,
48663+ const reiser4_block_nr * blocknr)
48664+{
48665+ internal_item_layout *item = internal_at(coord);
48666+ assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
48667+
48668+ put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
48669+}
48670+
48671+/* return child block number stored in the internal item at @coord */
48672+static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
48673+{
48674+ assert("nikita-608", coord != NULL);
48675+ return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
48676+}
48677+
48678+/* get znode pointed to by internal @item */
48679+static znode *znode_at(const coord_t * item /* coord of item */ ,
48680+ znode * parent /* parent node */ )
48681+{
48682+ return child_znode(item, parent, 1, 0);
48683+}
48684+
48685+/* store pointer from internal item into "block". Implementation of
48686+ ->down_link() method */
48687+void down_link_internal(const coord_t * coord /* coord of item */ ,
48688+ const reiser4_key * key UNUSED_ARG /* key to get
48689+ * pointer for */ ,
48690+ reiser4_block_nr * block /* resulting block number */ )
48691+{
48692+ ON_DEBUG(reiser4_key item_key);
48693+
48694+ assert("nikita-609", coord != NULL);
48695+ assert("nikita-611", block != NULL);
48696+ assert("nikita-612", (key == NULL) ||
48697+ /* twig horrors */
48698+ (znode_get_level(coord->node) == TWIG_LEVEL)
48699+ || keyle(item_key_by_coord(coord, &item_key), key));
48700+
48701+ *block = pointer_at(coord);
48702+ assert("nikita-2960", reiser4_blocknr_is_sane(block));
48703+}
48704+
48705+/* Get the child's block number, or 0 if the block is unallocated. */
48706+int
48707+utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
48708+ reiser4_block_nr * block)
48709+{
48710+ assert("jmacd-2059", coord != NULL);
48711+
48712+ *block = pointer_at(coord);
48713+ assert("nikita-2961", reiser4_blocknr_is_sane(block));
48714+
48715+ if (reiser4_blocknr_is_fake(block)) {
48716+ *block = 0;
48717+ }
48718+
48719+ return 0;
48720+}
48721+
48722+/* Return the child. */
48723+int
48724+utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
48725+ jnode ** childp)
48726+{
48727+ reiser4_block_nr block = pointer_at(coord);
48728+ znode *child;
48729+
48730+ assert("jmacd-2059", childp != NULL);
48731+ assert("nikita-2962", reiser4_blocknr_is_sane(&block));
48732+
48733+ child = zlook(znode_get_tree(coord->node), &block);
48734+
48735+ if (IS_ERR(child)) {
48736+ return PTR_ERR(child);
48737+ }
48738+
48739+ *childp = ZJNODE(child);
48740+
48741+ return 0;
48742+}
48743+
48744+#if REISER4_DEBUG
48745+
48746+static void check_link(znode * left, znode * right)
48747+{
48748+ znode *scan;
48749+
48750+ for (scan = left; scan != right; scan = scan->right) {
48751+ if (ZF_ISSET(scan, JNODE_RIP))
48752+ break;
48753+ if (znode_is_right_connected(scan) && scan->right != NULL) {
48754+ if (ZF_ISSET(scan->right, JNODE_RIP))
48755+ break;
48756+ assert("nikita-3285",
48757+ znode_is_left_connected(scan->right));
48758+ assert("nikita-3265",
48759+ ergo(scan != left,
48760+ ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
48761+ assert("nikita-3284", scan->right->left == scan);
48762+ } else
48763+ break;
48764+ }
48765+}
48766+
48767+int check__internal(const coord_t * coord, const char **error)
48768+{
48769+ reiser4_block_nr blk;
48770+ znode *child;
48771+ coord_t cpy;
48772+
48773+ blk = pointer_at(coord);
48774+ if (!reiser4_blocknr_is_sane(&blk)) {
48775+ *error = "Invalid pointer";
48776+ return -1;
48777+ }
48778+ coord_dup(&cpy, coord);
48779+ child = znode_at(&cpy, cpy.node);
48780+ if (child != NULL) {
48781+ znode *left_child;
48782+ znode *right_child;
48783+
48784+ left_child = right_child = NULL;
48785+
48786+ assert("nikita-3256", znode_invariant(child));
48787+ if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
48788+ left_child = znode_at(&cpy, cpy.node);
48789+ if (left_child != NULL) {
48790+ read_lock_tree(znode_get_tree(child));
48791+ check_link(left_child, child);
48792+ read_unlock_tree(znode_get_tree(child));
48793+ zput(left_child);
48794+ }
48795+ }
48796+ coord_dup(&cpy, coord);
48797+ if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
48798+ right_child = znode_at(&cpy, cpy.node);
48799+ if (right_child != NULL) {
48800+ read_lock_tree(znode_get_tree(child));
48801+ check_link(child, right_child);
48802+ read_unlock_tree(znode_get_tree(child));
48803+ zput(right_child);
48804+ }
48805+ }
48806+ zput(child);
48807+ }
48808+ return 0;
48809+}
48810+
48811+#endif /* REISER4_DEBUG */
48812+
48813+/* return true only if this item really points to "block" */
48814+/* Audited by: green(2002.06.14) */
48815+int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
48816+ const reiser4_block_nr * block /* block number to
48817+ * check */ )
48818+{
48819+ assert("nikita-613", coord != NULL);
48820+ assert("nikita-614", block != NULL);
48821+
48822+ return pointer_at(coord) == *block;
48823+}
48824+
48825+/* hook called by the ->create_item() method of the node plugin after a new
48826+ internal item has been created.
48827+
48828+ This is the point where the pointer to the new node is inserted into the
48829+ tree. Initialize the parent pointer in the child znode, and insert the child into the sibling list and slum.
48830+
48831+*/
48832+int create_hook_internal(const coord_t * item /* coord of item */ ,
48833+ void *arg /* child's left neighbor, if any */ )
48834+{
48835+ znode *child;
48836+ __u64 child_ptr;
48837+
48838+ assert("nikita-1252", item != NULL);
48839+ assert("nikita-1253", item->node != NULL);
48840+ assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
48841+ assert("nikita-1450", item->unit_pos == 0);
48842+
48843+ /*
48844+ * While preparing for item insertion, build_child_ptr_data() points
48845+ * the data to be inserted at the jnode's block number, which is in
48846+ * CPU byte order. The node's create_item() simply copies that data,
48847+ * so we end up with a child pointer in CPU byte order. Convert the
48848+ * content of the internal item to little-endian byte order.
48849+ */
48850+ child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
48851+ reiser4_update_internal(item, &child_ptr);
48852+
48853+ child = znode_at(item, item->node);
48854+ if (child != NULL && !IS_ERR(child)) {
48855+ znode *left;
48856+ int result = 0;
48857+ reiser4_tree *tree;
48858+
48859+ left = arg;
48860+ tree = znode_get_tree(item->node);
48861+ write_lock_tree(tree);
48862+ write_lock_dk(tree);
48863+ assert("nikita-1400", (child->in_parent.node == NULL)
48864+ || (znode_above_root(child->in_parent.node)));
48865+ ++item->node->c_count;
48866+ coord_to_parent_coord(item, &child->in_parent);
48867+ sibling_list_insert_nolock(child, left);
48868+
48869+ assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
48870+ ZF_CLR(child, JNODE_ORPHAN);
48871+
48872+ if ((left != NULL) && !keyeq(znode_get_rd_key(left),
48873+ znode_get_rd_key(child))) {
48874+ znode_set_rd_key(child, znode_get_rd_key(left));
48875+ }
48876+ write_unlock_dk(tree);
48877+ write_unlock_tree(tree);
48878+ zput(child);
48879+ return result;
48880+ } else {
48881+ if (child == NULL)
48882+ child = ERR_PTR(-EIO);
48883+ return PTR_ERR(child);
48884+ }
48885+}
48886+
48887+/* hook called by ->cut_and_kill() method of node plugin just before internal
48888+ item is removed.
48889+
48890+ This is the point where an empty node is removed from the tree. Clear the
48891+ parent pointer in the child, and mark the node for pending deletion.
48892+
48893+ The node will actually be deleted later, in several steps:
48894+
48895+ . when the last lock on this node is released, the node is removed from
48896+ the sibling list and its lock is invalidated
48897+
48898+ . when the last reference to this node is dropped, the bitmap is updated
48899+ and the node is actually removed from memory.
48900+
48901+*/
48902+int kill_hook_internal(const coord_t * item /* coord of item */ ,
48903+ pos_in_node_t from UNUSED_ARG /* start unit */ ,
48904+ pos_in_node_t count UNUSED_ARG /* stop unit */ ,
48905+ struct carry_kill_data *p UNUSED_ARG)
48906+{
48907+ znode *child;
48908+
48909+ assert("nikita-1222", item != NULL);
48910+ assert("nikita-1224", from == 0);
48911+ assert("nikita-1225", count == 1);
48912+
48913+ child = znode_at(item, item->node);
48914+ if (IS_ERR(child))
48915+ return PTR_ERR(child);
48916+ else if (node_is_empty(child)) {
48917+ reiser4_tree *tree;
48918+
48919+ assert("nikita-1397", znode_is_write_locked(child));
48920+ assert("nikita-1398", child->c_count == 0);
48921+ assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
48922+
48923+ tree = znode_get_tree(item->node);
48924+ write_lock_tree(tree);
48925+ init_parent_coord(&child->in_parent, NULL);
48926+ --item->node->c_count;
48927+ write_unlock_tree(tree);
48928+ zput(child);
48929+ return 0;
48930+ } else {
48931+ warning("nikita-1223",
48932+ "Cowardly refuse to remove link to non-empty node");
48933+ zput(child);
48934+ return RETERR(-EIO);
48935+ }
48936+}
48937+
48938+/* hook called by the ->shift() node plugin method when an internal item has
48939+ just been moved from one node to another.
48940+
48941+ Update the parent pointer in the child and the c_counts in the old and new parents.
48942+
48943+*/
48944+int shift_hook_internal(const coord_t * item /* coord of item */ ,
48945+ unsigned from UNUSED_ARG /* start unit */ ,
48946+ unsigned count UNUSED_ARG /* stop unit */ ,
48947+ znode * old_node /* old parent */ )
48948+{
48949+ znode *child;
48950+ znode *new_node;
48951+ reiser4_tree *tree;
48952+
48953+ assert("nikita-1276", item != NULL);
48954+ assert("nikita-1277", from == 0);
48955+ assert("nikita-1278", count == 1);
48956+ assert("nikita-1451", item->unit_pos == 0);
48957+
48958+ new_node = item->node;
48959+ assert("nikita-2132", new_node != old_node);
48960+ tree = znode_get_tree(item->node);
48961+ child = child_znode(item, old_node, 1, 0);
48962+ if (child == NULL)
48963+ return 0;
48964+ if (!IS_ERR(child)) {
48965+ write_lock_tree(tree);
48966+ ++new_node->c_count;
48967+ assert("nikita-1395", znode_parent(child) == old_node);
48968+ assert("nikita-1396", old_node->c_count > 0);
48969+ coord_to_parent_coord(item, &child->in_parent);
48970+ assert("nikita-1781", znode_parent(child) == new_node);
48971+ assert("nikita-1782",
48972+ check_tree_pointer(item, child) == NS_FOUND);
48973+ --old_node->c_count;
48974+ write_unlock_tree(tree);
48975+ zput(child);
48976+ return 0;
48977+ } else
48978+ return PTR_ERR(child);
48979+}
48980+
48981+/* plugin->u.item.b.max_key_inside - not defined */
48982+
48983+/* plugin->u.item.b.nr_units - item.c:single_unit */
48984+
48985+/* Make Linus happy.
48986+ Local variables:
48987+ c-indentation-style: "K&R"
48988+ mode-name: "LC"
48989+ c-basic-offset: 8
48990+ tab-width: 8
48991+ fill-column: 120
48992+ End:
48993+*/
48994diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/internal.h linux-2.6.24/fs/reiser4/plugin/item/internal.h
48995--- linux-2.6.24.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 03:00:00.000000000 +0300
48996+++ linux-2.6.24/fs/reiser4/plugin/item/internal.h 2008-01-25 11:39:07.020229327 +0300
48997@@ -0,0 +1,57 @@
48998+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48999+/* Internal item contains down-link to the child of the internal/twig
49000+ node in a tree. It is internal items that are actually used during
49001+ tree traversal. */
49002+
49003+#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
49004+#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
49005+
49006+#include "../../forward.h"
49007+#include "../../dformat.h"
49008+
49009+/* on-disk layout of internal item */
49010+typedef struct internal_item_layout {
49011+ /* 0 */ reiser4_dblock_nr pointer;
49012+ /* 8 */
49013+} internal_item_layout;
49014+
49015+struct cut_list;
49016+
49017+int mergeable_internal(const coord_t * p1, const coord_t * p2);
49018+lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
49019+ coord_t * coord);
49020+/* store pointer from internal item into "block". Implementation of
49021+ ->down_link() method */
49022+extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
49023+ reiser4_block_nr * block);
49024+extern int has_pointer_to_internal(const coord_t * coord,
49025+ const reiser4_block_nr * block);
49026+extern int create_hook_internal(const coord_t * item, void *arg);
49027+extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
49028+ pos_in_node_t count, struct carry_kill_data *);
49029+extern int shift_hook_internal(const coord_t * item, unsigned from,
49030+ unsigned count, znode * old_node);
49031+extern void reiser4_print_internal(const char *prefix, coord_t * coord);
49032+
49033+extern int utmost_child_internal(const coord_t * coord, sideof side,
49034+ jnode ** child);
49035+int utmost_child_real_block_internal(const coord_t * coord, sideof side,
49036+ reiser4_block_nr * block);
49037+
49038+extern void reiser4_update_internal(const coord_t * coord,
49039+ const reiser4_block_nr * blocknr);
49040+/* FIXME: reiserfs has check_internal */
49041+extern int check__internal(const coord_t * coord, const char **error);
49042+
49043+/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
49044+#endif
49045+
49046+/* Make Linus happy.
49047+ Local variables:
49048+ c-indentation-style: "K&R"
49049+ mode-name: "LC"
49050+ c-basic-offset: 8
49051+ tab-width: 8
49052+ fill-column: 120
49053+ End:
49054+*/
49055diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/item.c linux-2.6.24/fs/reiser4/plugin/item/item.c
49056--- linux-2.6.24.orig/fs/reiser4/plugin/item/item.c 1970-01-01 03:00:00.000000000 +0300
49057+++ linux-2.6.24/fs/reiser4/plugin/item/item.c 2008-01-25 11:39:07.020229327 +0300
49058@@ -0,0 +1,719 @@
49059+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49060+
49061+/* definition of item plugins. */
49062+
49063+#include "../../forward.h"
49064+#include "../../debug.h"
49065+#include "../../key.h"
49066+#include "../../coord.h"
49067+#include "../plugin_header.h"
49068+#include "sde.h"
49069+#include "internal.h"
49070+#include "item.h"
49071+#include "static_stat.h"
49072+#include "../plugin.h"
49073+#include "../../znode.h"
49074+#include "../../tree.h"
49075+#include "../../context.h"
49076+#include "ctail.h"
49077+
49078+/* compute and cache the offset of the item body (slow path of item_body_by_coord()) */
49079+void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
49080+{
49081+ assert("nikita-324", coord != NULL);
49082+ assert("nikita-325", coord->node != NULL);
49083+ assert("nikita-326", znode_is_loaded(coord->node));
49084+ assert("nikita-3200", coord->offset == INVALID_OFFSET);
49085+
49086+ coord->offset =
49087+ node_plugin_by_node(coord->node)->item_by_coord(coord) -
49088+ zdata(coord->node);
49089+ ON_DEBUG(coord->body_v = coord->node->times_locked);
49090+}
49091+
49092+void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
49093+{
49094+ return zdata(coord->node) + coord->offset;
49095+}
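+
+/*
+ * Usage sketch (illustrative, not part of the original patch): callers go
+ * through item_body_by_coord() in item.h, which caches the body offset in
+ * the coord:
+ *
+ *     if (coord->offset == INVALID_OFFSET)
+ *             item_body_by_coord_hard((coord_t *) coord);
+ *     body = item_body_by_coord_easy(coord);
+ */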
49096+
49097+#if REISER4_DEBUG
49098+
49099+int item_body_is_valid(const coord_t * coord)
49100+{
49101+ return
49102+ coord->offset ==
49103+ node_plugin_by_node(coord->node)->item_by_coord(coord) -
49104+ zdata(coord->node);
49105+}
49106+
49107+#endif
49108+
49109+/* return length of item at @coord */
49110+pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
49111+{
49112+ int len;
49113+
49114+ assert("nikita-327", coord != NULL);
49115+ assert("nikita-328", coord->node != NULL);
49116+ assert("nikita-329", znode_is_loaded(coord->node));
49117+
49118+ len = node_plugin_by_node(coord->node)->length_by_coord(coord);
49119+ return len;
49120+}
49121+
49122+void obtain_item_plugin(const coord_t * coord)
49123+{
49124+ assert("nikita-330", coord != NULL);
49125+ assert("nikita-331", coord->node != NULL);
49126+ assert("nikita-332", znode_is_loaded(coord->node));
49127+
49128+ coord_set_iplug((coord_t *) coord,
49129+ node_plugin_by_node(coord->node)->
49130+ plugin_by_coord(coord));
49131+ assert("nikita-2479",
49132+ coord_iplug(coord) ==
49133+ node_plugin_by_node(coord->node)->plugin_by_coord(coord));
49134+}
49135+
49136+/* return id of item */
49137+/* Audited by: green(2002.06.15) */
49138+item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
49139+{
49140+ assert("vs-539", coord != NULL);
49141+ assert("vs-538", coord->node != NULL);
49142+ assert("vs-537", znode_is_loaded(coord->node));
49143+ assert("vs-536", item_plugin_by_coord(coord) != NULL);
49144+ assert("vs-540",
49145+ item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
49146+
49147+ return item_id_by_plugin(item_plugin_by_coord(coord));
49148+}
49149+
49150+/* return key of item at @coord */
49151+/* Audited by: green(2002.06.15) */
49152+reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
49153+ reiser4_key * key /* result */ )
49154+{
49155+ assert("nikita-338", coord != NULL);
49156+ assert("nikita-339", coord->node != NULL);
49157+ assert("nikita-340", znode_is_loaded(coord->node));
49158+
49159+ return node_plugin_by_node(coord->node)->key_at(coord, key);
49160+}
49161+
49162+/* this returns max key in the item */
49163+reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
49164+ reiser4_key * key /* result */ )
49165+{
49166+ coord_t last;
49167+
49168+ assert("nikita-338", coord != NULL);
49169+ assert("nikita-339", coord->node != NULL);
49170+ assert("nikita-340", znode_is_loaded(coord->node));
49171+
49172+ /* make coord pointing to last item's unit */
49173+ coord_dup(&last, coord);
49174+ last.unit_pos = coord_num_units(&last) - 1;
49175+ assert("vs-1560", coord_is_existing_unit(&last));
49176+
49177+ max_unit_key_by_coord(&last, key);
49178+ return key;
49179+}
49180+
49181+/* return key of unit at @coord */
49182+reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
49183+ reiser4_key * key /* result */ )
49184+{
49185+ assert("nikita-772", coord != NULL);
49186+ assert("nikita-774", coord->node != NULL);
49187+ assert("nikita-775", znode_is_loaded(coord->node));
49188+
49189+ if (item_plugin_by_coord(coord)->b.unit_key != NULL)
49190+ return item_plugin_by_coord(coord)->b.unit_key(coord, key);
49191+ else
49192+ return item_key_by_coord(coord, key);
49193+}
49194+
49195+/* return the biggest key contained the unit @coord */
49196+reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
49197+ reiser4_key * key /* result */ )
49198+{
49199+ assert("nikita-772", coord != NULL);
49200+ assert("nikita-774", coord->node != NULL);
49201+ assert("nikita-775", znode_is_loaded(coord->node));
49202+
49203+ if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
49204+ return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
49205+ else
49206+ return unit_key_by_coord(coord, key);
49207+}
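+
+/*
+ * Fallback chain (illustrative, not part of the original patch): unit_key
+ * falls back to the item key, and max_unit_key falls back to unit_key, so
+ * items that define neither method behave as single-key items.
+ */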
49208+
49209+/* ->max_key_inside() method for items consisting of exactly one key (like
49210+ stat-data) */
49211+static reiser4_key *max_key_inside_single_key(const coord_t *
49212+ coord /* coord of item */ ,
49213+ reiser4_key *
49214+ result /* resulting key */ )
49215+{
49216+ assert("nikita-604", coord != NULL);
49217+
49218+ /* the key at @coord is the starting key of this item and has to
49219+ be already filled in */
49220+ return unit_key_by_coord(coord, result);
49221+}
49222+
49223+/* ->nr_units() method for items that always consist of exactly one unit */
49224+pos_in_node_t
49225+nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
49226+{
49227+ return 1;
49228+}
49229+
49230+static int
49231+paste_no_paste(coord_t * coord UNUSED_ARG,
49232+ reiser4_item_data * data UNUSED_ARG,
49233+ carry_plugin_info * info UNUSED_ARG)
49234+{
49235+ return 0;
49236+}
49237+
49238+/* default ->fast_paste() method */
49239+static int
49240+agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
49241+{
49242+ return 1;
49243+}
49244+
49245+int item_can_contain_key(const coord_t * item /* coord of item */ ,
49246+ const reiser4_key * key /* key to check */ ,
49247+ const reiser4_item_data * data /* parameters of item
49248+ * being created */ )
49249+{
49250+ item_plugin *iplug;
49251+ reiser4_key min_key_in_item;
49252+ reiser4_key max_key_in_item;
49253+
49254+ assert("nikita-1658", item != NULL);
49255+ assert("nikita-1659", key != NULL);
49256+
49257+ iplug = item_plugin_by_coord(item);
49258+ if (iplug->b.can_contain_key != NULL)
49259+ return iplug->b.can_contain_key(item, key, data);
49260+ else {
49261+ assert("nikita-1681", iplug->b.max_key_inside != NULL);
49262+ item_key_by_coord(item, &min_key_in_item);
49263+ iplug->b.max_key_inside(item, &max_key_in_item);
49264+
49265+ /* can contain key if
49266+ min_key_in_item <= key &&
49267+ key <= max_key_in_item
49268+ */
49269+ return keyle(&min_key_in_item, key)
49270+ && keyle(key, &max_key_in_item);
49271+ }
49272+}
49273+
49274+/* ->mergeable() method for non-mergeable items */
49275+static int
49276+not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
49277+{
49278+ return 0;
49279+}
49280+
49281+/* return 0 if @i1 and @i2 are not mergeable, non-zero otherwise */
49282+int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
49283+ const coord_t * i2 /* coord of second item */ )
49284+{
49285+ item_plugin *iplug;
49286+ reiser4_key k1;
49287+ reiser4_key k2;
49288+
49289+ assert("nikita-1336", i1 != NULL);
49290+ assert("nikita-1337", i2 != NULL);
49291+
49292+ iplug = item_plugin_by_coord(i1);
49293+ assert("nikita-1338", iplug != NULL);
49294+
49295+ /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
49296+ shifting code when nodes are in "suspended" state. */
49297+ assert("nikita-1663",
49298+ keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
49299+
49300+ if (iplug->b.mergeable != NULL) {
49301+ return iplug->b.mergeable(i1, i2);
49302+ } else if (iplug->b.max_key_inside != NULL) {
49303+ /* mergeable if ->max_key_inside() >= key of i2 */
49304+ return keyge(iplug->b.max_key_inside(i1, &k1),
49305+ item_key_by_coord(i2, &k2));
49309+ } else {
49310+ item_key_by_coord(i1, &k1);
49311+ item_key_by_coord(i2, &k2);
49312+
49313+ return
49314+ (get_key_locality(&k1) == get_key_locality(&k2)) &&
49315+ (get_key_objectid(&k1) == get_key_objectid(&k2))
49316+ && (iplug == item_plugin_by_coord(i2));
49317+ }
49318+}
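+
+/*
+ * Example (illustrative, not part of the original patch): stat-data items
+ * take the first branch and are never mergeable (their ->mergeable is
+ * not_mergeable()); simple directory entries define no ->mergeable method,
+ * so the ->max_key_inside() fallback applies; the last branch merely
+ * compares locality, objectid and plugin identity.
+ */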
49319+
49320+int item_is_extent(const coord_t * item)
49321+{
49322+ assert("vs-482", coord_is_existing_item(item));
49323+ return item_id_by_coord(item) == EXTENT_POINTER_ID;
49324+}
49325+
49326+int item_is_tail(const coord_t * item)
49327+{
49328+ assert("vs-482", coord_is_existing_item(item));
49329+ return item_id_by_coord(item) == FORMATTING_ID;
49330+}
49331+
49332+#if REISER4_DEBUG
49333+
49334+int item_is_statdata(const coord_t * item)
49335+{
49336+ assert("vs-516", coord_is_existing_item(item));
49337+ return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE);
49338+}
49339+
49340+int item_is_ctail(const coord_t * item)
49341+{
49342+ assert("edward-xx", coord_is_existing_item(item));
49343+ return item_id_by_coord(item) == CTAIL_ID;
49344+}
49345+
49346+#endif /* REISER4_DEBUG */
49347+
49348+static int change_item(struct inode *inode,
49349+ reiser4_plugin * plugin,
49350+ pset_member memb)
49351+{
49352+ /* cannot change constituent item (sd, or dir_item) */
49353+ return RETERR(-EINVAL);
49354+}
49355+
49356+static reiser4_plugin_ops item_plugin_ops = {
49357+ .init = NULL,
49358+ .load = NULL,
49359+ .save_len = NULL,
49360+ .save = NULL,
49361+ .change = change_item
49362+};
49363+
49364+item_plugin item_plugins[LAST_ITEM_ID] = {
49365+ [STATIC_STAT_DATA_ID] = {
49366+ .h = {
49367+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49368+ .id = STATIC_STAT_DATA_ID,
49369+ .groups = (1 << STAT_DATA_ITEM_TYPE),
49370+ .pops = &item_plugin_ops,
49371+ .label = "sd",
49372+ .desc = "stat-data",
49373+ .linkage = {NULL, NULL}
49374+ },
49375+ .b = {
49376+ .max_key_inside = max_key_inside_single_key,
49377+ .can_contain_key = NULL,
49378+ .mergeable = not_mergeable,
49379+ .nr_units = nr_units_single_unit,
49380+ .lookup = NULL,
49381+ .init = NULL,
49382+ .paste = paste_no_paste,
49383+ .fast_paste = NULL,
49384+ .can_shift = NULL,
49385+ .copy_units = NULL,
49386+ .create_hook = NULL,
49387+ .kill_hook = NULL,
49388+ .shift_hook = NULL,
49389+ .cut_units = NULL,
49390+ .kill_units = NULL,
49391+ .unit_key = NULL,
49392+ .max_unit_key = NULL,
49393+ .estimate = NULL,
49394+ .item_data_by_flow = NULL,
49395+#if REISER4_DEBUG
49396+ .check = NULL
49397+#endif
49398+ },
49399+ .f = {
49400+ .utmost_child = NULL,
49401+ .utmost_child_real_block = NULL,
49402+ .update = NULL,
49403+ .scan = NULL,
49404+ .convert = NULL
49405+ },
49406+ .s = {
49407+ .sd = {
49408+ .init_inode = init_inode_static_sd,
49409+ .save_len = save_len_static_sd,
49410+ .save = save_static_sd
49411+ }
49412+ }
49413+ },
49414+ [SIMPLE_DIR_ENTRY_ID] = {
49415+ .h = {
49416+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49417+ .id = SIMPLE_DIR_ENTRY_ID,
49418+ .groups = (1 << DIR_ENTRY_ITEM_TYPE),
49419+ .pops = &item_plugin_ops,
49420+ .label = "de",
49421+ .desc = "directory entry",
49422+ .linkage = {NULL, NULL}
49423+ },
49424+ .b = {
49425+ .max_key_inside = max_key_inside_single_key,
49426+ .can_contain_key = NULL,
49427+ .mergeable = NULL,
49428+ .nr_units = nr_units_single_unit,
49429+ .lookup = NULL,
49430+ .init = NULL,
49431+ .paste = NULL,
49432+ .fast_paste = NULL,
49433+ .can_shift = NULL,
49434+ .copy_units = NULL,
49435+ .create_hook = NULL,
49436+ .kill_hook = NULL,
49437+ .shift_hook = NULL,
49438+ .cut_units = NULL,
49439+ .kill_units = NULL,
49440+ .unit_key = NULL,
49441+ .max_unit_key = NULL,
49442+ .estimate = NULL,
49443+ .item_data_by_flow = NULL,
49444+#if REISER4_DEBUG
49445+ .check = NULL
49446+#endif
49447+ },
49448+ .f = {
49449+ .utmost_child = NULL,
49450+ .utmost_child_real_block = NULL,
49451+ .update = NULL,
49452+ .scan = NULL,
49453+ .convert = NULL
49454+ },
49455+ .s = {
49456+ .dir = {
49457+ .extract_key = extract_key_de,
49458+ .update_key = update_key_de,
49459+ .extract_name = extract_name_de,
49460+ .extract_file_type = extract_file_type_de,
49461+ .add_entry = add_entry_de,
49462+ .rem_entry = rem_entry_de,
49463+ .max_name_len = max_name_len_de
49464+ }
49465+ }
49466+ },
49467+ [COMPOUND_DIR_ID] = {
49468+ .h = {
49469+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49470+ .id = COMPOUND_DIR_ID,
49471+ .groups = (1 << DIR_ENTRY_ITEM_TYPE),
49472+ .pops = &item_plugin_ops,
49473+ .label = "cde",
49474+ .desc = "compressed directory entry",
49475+ .linkage = {NULL, NULL}
49476+ },
49477+ .b = {
49478+ .max_key_inside = max_key_inside_cde,
49479+ .can_contain_key = can_contain_key_cde,
49480+ .mergeable = mergeable_cde,
49481+ .nr_units = nr_units_cde,
49482+ .lookup = lookup_cde,
49483+ .init = init_cde,
49484+ .paste = paste_cde,
49485+ .fast_paste = agree_to_fast_op,
49486+ .can_shift = can_shift_cde,
49487+ .copy_units = copy_units_cde,
49488+ .create_hook = NULL,
49489+ .kill_hook = NULL,
49490+ .shift_hook = NULL,
49491+ .cut_units = cut_units_cde,
49492+ .kill_units = kill_units_cde,
49493+ .unit_key = unit_key_cde,
49494+ .max_unit_key = unit_key_cde,
49495+ .estimate = estimate_cde,
49496+ .item_data_by_flow = NULL,
49497+#if REISER4_DEBUG
49498+ .check = reiser4_check_cde
49499+#endif
49500+ },
49501+ .f = {
49502+ .utmost_child = NULL,
49503+ .utmost_child_real_block = NULL,
49504+ .update = NULL,
49505+ .scan = NULL,
49506+ .convert = NULL
49507+ },
49508+ .s = {
49509+ .dir = {
49510+ .extract_key = extract_key_cde,
49511+ .update_key = update_key_cde,
49512+ .extract_name = extract_name_cde,
49513+ .extract_file_type = extract_file_type_de,
49514+ .add_entry = add_entry_cde,
49515+ .rem_entry = rem_entry_cde,
49516+ .max_name_len = max_name_len_cde
49517+ }
49518+ }
49519+ },
49520+ [NODE_POINTER_ID] = {
49521+ .h = {
49522+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49523+ .id = NODE_POINTER_ID,
49524+ .groups = (1 << INTERNAL_ITEM_TYPE),
49525+ .pops = NULL,
49526+ .label = "internal",
49527+ .desc = "internal item",
49528+ .linkage = {NULL, NULL}
49529+ },
49530+ .b = {
49531+ .max_key_inside = NULL,
49532+ .can_contain_key = NULL,
49533+ .mergeable = mergeable_internal,
49534+ .nr_units = nr_units_single_unit,
49535+ .lookup = lookup_internal,
49536+ .init = NULL,
49537+ .paste = NULL,
49538+ .fast_paste = NULL,
49539+ .can_shift = NULL,
49540+ .copy_units = NULL,
49541+ .create_hook = create_hook_internal,
49542+ .kill_hook = kill_hook_internal,
49543+ .shift_hook = shift_hook_internal,
49544+ .cut_units = NULL,
49545+ .kill_units = NULL,
49546+ .unit_key = NULL,
49547+ .max_unit_key = NULL,
49548+ .estimate = NULL,
49549+ .item_data_by_flow = NULL,
49550+#if REISER4_DEBUG
49551+ .check = check__internal
49552+#endif
49553+ },
49554+ .f = {
49555+ .utmost_child = utmost_child_internal,
49556+ .utmost_child_real_block =
49557+ utmost_child_real_block_internal,
49558+ .update = reiser4_update_internal,
49559+ .scan = NULL,
49560+ .convert = NULL
49561+ },
49562+ .s = {
49563+ .internal = {
49564+ .down_link = down_link_internal,
49565+ .has_pointer_to = has_pointer_to_internal
49566+ }
49567+ }
49568+ },
49569+ [EXTENT_POINTER_ID] = {
49570+ .h = {
49571+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49572+ .id = EXTENT_POINTER_ID,
49573+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49574+ .pops = NULL,
49575+ .label = "extent",
49576+ .desc = "extent item",
49577+ .linkage = {NULL, NULL}
49578+ },
49579+ .b = {
49580+ .max_key_inside = max_key_inside_extent,
49581+ .can_contain_key = can_contain_key_extent,
49582+ .mergeable = mergeable_extent,
49583+ .nr_units = nr_units_extent,
49584+ .lookup = lookup_extent,
49585+ .init = NULL,
49586+ .paste = paste_extent,
49587+ .fast_paste = agree_to_fast_op,
49588+ .can_shift = can_shift_extent,
49589+ .create_hook = create_hook_extent,
49590+ .copy_units = copy_units_extent,
49591+ .kill_hook = kill_hook_extent,
49592+ .shift_hook = NULL,
49593+ .cut_units = cut_units_extent,
49594+ .kill_units = kill_units_extent,
49595+ .unit_key = unit_key_extent,
49596+ .max_unit_key = max_unit_key_extent,
49597+ .estimate = NULL,
49598+ .item_data_by_flow = NULL,
49599+#if REISER4_DEBUG
49600+ .check = reiser4_check_extent
49601+#endif
49602+ },
49603+ .f = {
49604+ .utmost_child = utmost_child_extent,
49605+ .utmost_child_real_block =
49606+ utmost_child_real_block_extent,
49607+ .update = NULL,
49608+ .scan = reiser4_scan_extent,
49609+ .convert = NULL,
49610+ .key_by_offset = key_by_offset_extent
49611+ },
49612+ .s = {
49613+ .file = {
49614+ .write = reiser4_write_extent,
49615+ .read = reiser4_read_extent,
49616+ .readpage = reiser4_readpage_extent,
49617+ .get_block = get_block_address_extent,
49618+ .append_key = append_key_extent,
49619+ .init_coord_extension =
49620+ init_coord_extension_extent
49621+ }
49622+ }
49623+ },
49624+ [FORMATTING_ID] = {
49625+ .h = {
49626+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49627+ .id = FORMATTING_ID,
49628+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49629+ .pops = NULL,
49630+ .label = "body",
49631+ .desc = "body (or tail?) item",
49632+ .linkage = {NULL, NULL}
49633+ },
49634+ .b = {
49635+ .max_key_inside = max_key_inside_tail,
49636+ .can_contain_key = can_contain_key_tail,
49637+ .mergeable = mergeable_tail,
49638+ .nr_units = nr_units_tail,
49639+ .lookup = lookup_tail,
49640+ .init = NULL,
49641+ .paste = paste_tail,
49642+ .fast_paste = agree_to_fast_op,
49643+ .can_shift = can_shift_tail,
49644+ .create_hook = NULL,
49645+ .copy_units = copy_units_tail,
49646+ .kill_hook = kill_hook_tail,
49647+ .shift_hook = NULL,
49648+ .cut_units = cut_units_tail,
49649+ .kill_units = kill_units_tail,
49650+ .unit_key = unit_key_tail,
49651+ .max_unit_key = unit_key_tail,
49652+ .estimate = NULL,
49653+ .item_data_by_flow = NULL,
49654+#if REISER4_DEBUG
49655+ .check = NULL
49656+#endif
49657+ },
49658+ .f = {
49659+ .utmost_child = NULL,
49660+ .utmost_child_real_block = NULL,
49661+ .update = NULL,
49662+ .scan = NULL,
49663+ .convert = NULL
49664+ },
49665+ .s = {
49666+ .file = {
49667+ .write = reiser4_write_tail,
49668+ .read = reiser4_read_tail,
49669+ .readpage = readpage_tail,
49670+ .get_block = get_block_address_tail,
49671+ .append_key = append_key_tail,
49672+ .init_coord_extension =
49673+ init_coord_extension_tail
49674+ }
49675+ }
49676+ },
49677+ [CTAIL_ID] = {
49678+ .h = {
49679+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49680+ .id = CTAIL_ID,
49681+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49682+ .pops = NULL,
49683+ .label = "ctail",
49684+ .desc = "cryptcompress tail item",
49685+ .linkage = {NULL, NULL}
49686+ },
49687+ .b = {
49688+ .max_key_inside = max_key_inside_tail,
49689+ .can_contain_key = can_contain_key_ctail,
49690+ .mergeable = mergeable_ctail,
49691+ .nr_units = nr_units_ctail,
49692+ .lookup = NULL,
49693+ .init = init_ctail,
49694+ .paste = paste_ctail,
49695+ .fast_paste = agree_to_fast_op,
49696+ .can_shift = can_shift_ctail,
49697+ .create_hook = create_hook_ctail,
49698+ .copy_units = copy_units_ctail,
49699+ .kill_hook = kill_hook_ctail,
49700+ .shift_hook = shift_hook_ctail,
49701+ .cut_units = cut_units_ctail,
49702+ .kill_units = kill_units_ctail,
49703+ .unit_key = unit_key_tail,
49704+ .max_unit_key = unit_key_tail,
49705+ .estimate = estimate_ctail,
49706+ .item_data_by_flow = NULL,
49707+#if REISER4_DEBUG
49708+ .check = check_ctail
49709+#endif
49710+ },
49711+ .f = {
49712+ .utmost_child = utmost_child_ctail,
49713+ /* FIXME-EDWARD: write this */
49714+ .utmost_child_real_block = NULL,
49715+ .update = NULL,
49716+ .scan = scan_ctail,
49717+ .convert = convert_ctail
49718+ },
49719+ .s = {
49720+ .file = {
49721+ .write = NULL,
49722+ .read = read_ctail,
49723+ .readpage = readpage_ctail,
49724+ .get_block = get_block_address_tail,
49725+ .append_key = append_key_ctail,
49726+ .init_coord_extension =
49727+ init_coord_extension_tail
49728+ }
49729+ }
49730+ },
49731+ [BLACK_BOX_ID] = {
49732+ .h = {
49733+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
49734+ .id = BLACK_BOX_ID,
49735+ .groups = (1 << OTHER_ITEM_TYPE),
49736+ .pops = NULL,
49737+ .label = "blackbox",
49738+ .desc = "black box item",
49739+ .linkage = {NULL, NULL}
49740+ },
49741+ .b = {
49742+ .max_key_inside = NULL,
49743+ .can_contain_key = NULL,
49744+ .mergeable = not_mergeable,
49745+ .nr_units = nr_units_single_unit,
49746+ /* no need for a ->lookup method */
49747+ .lookup = NULL,
49748+ .init = NULL,
49749+ .paste = NULL,
49750+ .fast_paste = NULL,
49751+ .can_shift = NULL,
49752+ .copy_units = NULL,
49753+ .create_hook = NULL,
49754+ .kill_hook = NULL,
49755+ .shift_hook = NULL,
49756+ .cut_units = NULL,
49757+ .kill_units = NULL,
49758+ .unit_key = NULL,
49759+ .max_unit_key = NULL,
49760+ .estimate = NULL,
49761+ .item_data_by_flow = NULL,
49762+#if REISER4_DEBUG
49763+ .check = NULL
49764+#endif
49765+ }
49766+ }
49767+};
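+
+/*
+ * Lookup sketch (illustrative, not part of the original patch): entries of
+ * the table above are indexed by item_id, so a plugin can be fetched by id:
+ *
+ *     item_plugin *iplug = item_plugin_by_id(NODE_POINTER_ID);
+ *
+ *     assert("demo-2", is_solid_item(iplug));
+ *
+ * is_solid_item() (item.h) holds here because internal items use the
+ * generic nr_units_single_unit() method.
+ */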
49768+
49769+/* Make Linus happy.
49770+ Local variables:
49771+ c-indentation-style: "K&R"
49772+ mode-name: "LC"
49773+ c-basic-offset: 8
49774+ tab-width: 8
49775+ fill-column: 120
49776+ End:
49777+*/
49778diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/item.h linux-2.6.24/fs/reiser4/plugin/item/item.h
49779--- linux-2.6.24.orig/fs/reiser4/plugin/item/item.h 1970-01-01 03:00:00.000000000 +0300
49780+++ linux-2.6.24/fs/reiser4/plugin/item/item.h 2008-01-25 11:40:16.698169785 +0300
49781@@ -0,0 +1,398 @@
49782+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49783+
49784+/* first read balance.c comments before reading this */
49785+
49786+/* An item_plugin implements all of the operations required for
49787+ balancing that are item specific. */
49788+
49789+/* an item plugin also implements other operations that are specific to that
49790+ item. These go into the item specific operations portion of the item
49791+ handler, and all of the item specific portions of the item handler are put
49792+ into a union. */
49793+
49794+#if !defined( __REISER4_ITEM_H__ )
49795+#define __REISER4_ITEM_H__
49796+
49797+#include "../../forward.h"
49798+#include "../plugin_header.h"
49799+#include "../../dformat.h"
49800+#include "../../seal.h"
49801+#include "../../plugin/file/file.h"
49802+
49803+#include <linux/fs.h> /* for struct file, struct inode */
49804+#include <linux/mm.h> /* for struct page */
49805+#include <linux/dcache.h> /* for struct dentry */
49806+
49807+typedef enum {
49808+ STAT_DATA_ITEM_TYPE,
49809+ DIR_ENTRY_ITEM_TYPE,
49810+ INTERNAL_ITEM_TYPE,
49811+ UNIX_FILE_METADATA_ITEM_TYPE,
49812+ OTHER_ITEM_TYPE
49813+} item_type_id;
49814+
49815+/* this is the part of each item plugin that all items are expected to
49816+ support or at least explicitly fail to support by setting the
49817+ pointer to null. */
49818+struct balance_ops {
49819+ /* operations called by balancing
49820+
49821+ It is interesting to consider that some of these item
49822+ operations could be given sources or targets that are not
49823+ really items in nodes. This could be ok/useful.
49824+
49825+ */
49826+ /* maximal key that can _possibly_ be occupied by this item
49827+
49828+ When inserting, the node ->lookup() method (called by
49829+ coord_by_key()) reaches an item after binary search;
49830+ the ->max_key_inside() item plugin method is then used to
49831+ determine whether the new data should be pasted into the
49832+ existing item (new_key<=max_key_inside()) or a new item has
49833+ to be created (new_key>max_key_inside()).
49834+
49835+ For items that occupy exactly one key (like stat-data)
49836+ this method should return this key. For items that can
49837+ grow indefinitely (extent, directory item) this should
49838+ return reiser4_max_key().
49839+
49840+ For example, for an extent with the key
49841+
49842+ (LOCALITY,4,OBJID,STARTING-OFFSET) and length BLK blocks,
49843+
49844+ ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff).
49845+ */
49846+ reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
49847+
49848+ /* true if item @coord can merge data at @key. */
49849+ int (*can_contain_key) (const coord_t *, const reiser4_key *,
49850+ const reiser4_item_data *);
49851+ /* mergeable() - check items for mergeability
49852+
49853+ Optional method. Returns true if two items can be merged.
49854+
49855+ */
49856+ int (*mergeable) (const coord_t *, const coord_t *);
49857+
49858+ /* number of atomic things in an item.
49859+ NOTE FOR CONTRIBUTORS: use a generic method
49860+ nr_units_single_unit() for solid (atomic) items, as
49861+ tree operations use it as a criterion of solidness
49862+ (see is_solid_item macro) */
49863+ pos_in_node_t(*nr_units) (const coord_t *);
49864+
49865+ /* search within the item for a unit, and return a pointer to
49866+ it. This can be used to calculate how many bytes to shrink
49867+ an item by: use pointer arithmetic and compare to the start
49868+ of the item body, provided the item's data are contiguous in
49869+ the node. If the item's data are not contiguous in the node,
49870+ all sorts of other things are probably going to break as
49871+ well. */
49872+ lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
49873+ /* method called by node_plugin->create_item() to initialise a new
49874+ item */
49875+ int (*init) (coord_t * target, coord_t * from,
49876+ reiser4_item_data * data);
49877+ /* method called (e.g., by reiser4_resize_item()) to place new data
49878+ into item when it grows */
49879+ int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
49880+ /* return true if a paste into @coord is allowed to skip
49881+ carry, i.e. if such a paste would not require any changes
49882+ at the parent level
49883+ */
49884+ int (*fast_paste) (const coord_t *);
49885+ /* how many units of @source, but not more than @want, can be
49886+ shifted into the @target node. If pend == append, we try to
49887+ append the last item of @target with the first units of
49888+ @source; if pend == prepend, we try to "prepend" the first
49889+ item in @target with the last units of @source. The @target
49890+ node has @free_space bytes of free space. The total size of
49891+ those units is returned via @size.
49892+
49893+ @target is not NULL if shifting into a mergeable item, and
49894+ NULL if a new item will be created during shifting.
49895+ */
49896+ int (*can_shift) (unsigned free_space, coord_t *,
49897+ znode *, shift_direction, unsigned *size,
49898+ unsigned want);
49899+
49900+ /* starting from the @from-th unit of item @source, append or
49901+ prepend @count units to @target. @target has already been
49902+ expanded by @free_space bytes, which must be exactly what is
49903+ needed for those units in @target. If @where_is_free_space
49904+ == SHIFT_LEFT, the free space is at the end of the @target
49905+ item; otherwise it is at the beginning of it. */
49906+ void (*copy_units) (coord_t *, coord_t *,
49907+ unsigned from, unsigned count,
49908+ shift_direction where_is_free_space,
49909+ unsigned free_space);
49910+
49911+ int (*create_hook) (const coord_t *, void *);
49912+ /* do whatever is necessary to do when @count units starting
49913+ from @from-th one are removed from the tree */
49914+ /* FIXME-VS: this is used to be here for, in particular,
49915+ extents and items of internal type to free blocks they point
49916+ to at the same time with removing items from a
49917+ tree. Problems start, however, when dealloc_block fails due
49918+ to some reason. Item gets removed, but blocks it pointed to
49919+ are not freed. It is not clear how to fix this for items of
49920+ internal type because a need to remove internal item may
49921+ appear in the middle of balancing, and there is no way to
49922+ undo changes made. OTOH, if space allocator involves
49923+ balancing to perform dealloc_block - this will probably
49924+ break balancing due to deadlock issues
49925+ */
49926+ int (*kill_hook) (const coord_t *, pos_in_node_t from,
49927+ pos_in_node_t count, struct carry_kill_data *);
49928+ int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
49929+ znode * _node);
49930+
49931+ /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key
49932+ including boundaries. When units are cut from item beginning - move space which gets freed to head of
49933+ item. When units are cut from item end - move freed space to item end. When units are cut from the middle of
49934+ item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in
49935+ @smallest_removed if it is not 0. Save new first item key in @new_first_key if it is not 0
49936+ */
49937+ int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
49938+ struct carry_cut_data *,
49939+ reiser4_key * smallest_removed,
49940+ reiser4_key * new_first_key);
49941+
49942+ /* like cut_units, except that these units are removed from the
49943+ tree, not only from a node */
49944+ int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
49945+ struct carry_kill_data *,
49946+ reiser4_key * smallest_removed,
49947+ reiser4_key * new_first);
49948+
49949+ /* return the key of the unit at @coord. If @coord is not set
49950+ to a certain unit, ERR_PTR(-ENOENT) is returned */
49952+ reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
49953+ reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
49954+ /* estimate how much space is needed to paste @data into the
49955+ item at @coord. If @coord == 0, estimate insertion;
49956+ otherwise estimate pasting
49957+ */
49958+ int (*estimate) (const coord_t *, const reiser4_item_data *);
49959+
49960+ /* converts flow @f to item data. @coord == 0 on insert */
49961+ int (*item_data_by_flow) (const coord_t *, const flow_t *,
49962+ reiser4_item_data *);
49963+
49964+ /*void (*show) (struct seq_file *, coord_t *); */
49965+
49966+#if REISER4_DEBUG
49967+ /* used for debugging, every item should have here the most
49968+ complete possible check of the consistency of the item that
49969+ the inventor can construct */
49970+ int (*check) (const coord_t *, const char **error);
49971+#endif
49972+
49973+};
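+
+/*
+ * Decision sketch (illustrative, not part of the original patch): during
+ * insertion the caller compares the new key against ->max_key_inside() of
+ * the item the lookup landed on:
+ *
+ *     reiser4_key max;
+ *
+ *     iplug->b.max_key_inside(coord, &max);
+ *     if (keyle(new_key, &max))
+ *             paste_into_existing_item();
+ *     else
+ *             create_new_item();
+ *
+ * paste_into_existing_item() and create_new_item() are hypothetical
+ * stand-ins for the carry operations actually used.
+ */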
49974+
49975+struct flush_ops {
49976+ /* return the right or left child of @coord, only if it is in memory */
49977+ int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
49978+
49979+ /* return whether the right or left child of @coord has a non-fake
49980+ block number. */
49981+ int (*utmost_child_real_block) (const coord_t *, sideof side,
49982+ reiser4_block_nr *);
49983+ /* relocate child at @coord to the @block */
49984+ void (*update) (const coord_t *, const reiser4_block_nr *);
49985+ /* count unformatted nodes per item for the leaf relocation policy, etc. */
49986+ int (*scan) (flush_scan * scan);
49987+ /* convert item by flush */
49988+ int (*convert) (flush_pos_t * pos);
49989+ /* backward mapping from jnode offset to a key. */
49990+ int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
49991+};
49992+
49993+/* operations specific to the directory item */
49994+struct dir_entry_iops {
49995+ /* extract stat-data key from directory entry at @coord and place it
49996+ into @key. */
49997+ int (*extract_key) (const coord_t *, reiser4_key * key);
49998+ /* update object key in item. */
49999+ int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
50000+ /* extract name from directory entry at @coord and return it */
50001+ char *(*extract_name) (const coord_t *, char *buf);
50002+ /* extract file type (DT_* stuff) from directory entry at @coord and
50003+ return it */
50004+ unsigned (*extract_file_type) (const coord_t *);
50005+ int (*add_entry) (struct inode * dir,
50006+ coord_t *, lock_handle *,
50007+ const struct dentry * name,
50008+ reiser4_dir_entry_desc * entry);
50009+ int (*rem_entry) (struct inode * dir, const struct qstr * name,
50010+ coord_t *, lock_handle *,
50011+ reiser4_dir_entry_desc * entry);
50012+ int (*max_name_len) (const struct inode * dir);
50013+};
50014+
50015+/* operations specific to the items that regular (unix) file metadata are built of */
50016+struct file_iops{
50017+ int (*write) (struct file *, struct inode *,
50018+ const char __user *, size_t, loff_t *pos);
50019+ int (*read) (struct file *, flow_t *, hint_t *);
50020+ int (*readpage) (void *, struct page *);
50021+ int (*get_block) (const coord_t *, sector_t, sector_t *);
50022+ /*
50023+ * key of the first byte that is not addressed by the item @coord is
50024+ * set to.
50025+ * For example, for extent item with the key
50026+ *
50027+ * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
50028+ *
50029+ * ->append_key is
50030+ *
50031+ * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
50032+ */
50033+ reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
50034+
50035+ void (*init_coord_extension) (uf_coord_t *, loff_t);
50036+};
50037+
50038+/* operations specific to items of stat data type */
50039+struct sd_iops {
50040+ int (*init_inode) (struct inode * inode, char *sd, int len);
50041+ int (*save_len) (struct inode * inode);
50042+ int (*save) (struct inode * inode, char **area);
50043+};
50044+
50045+/* operations specific to internal item */
50046+struct internal_iops{
50047+ /* all that tree traversal wants to know from an internal item
50048+ is where to go next. */
50049+ void (*down_link) (const coord_t * coord,
50050+ const reiser4_key * key, reiser4_block_nr * block);
50051+ /* check that given internal item contains given pointer. */
50052+ int (*has_pointer_to) (const coord_t * coord,
50053+ const reiser4_block_nr * block);
50054+};
50055+
50056+struct item_plugin {
50057+ /* generic fields */
50058+ plugin_header h;
50059+ /* methods common for all item types */
50060+ struct balance_ops b; /* balance operations */
50061+ struct flush_ops f; /* flush operates on items via these methods */
50062+
50063+ /* methods specific to particular type of item */
50064+ union {
50065+ struct dir_entry_iops dir;
50066+ struct file_iops file;
50067+ struct sd_iops sd;
50068+ struct internal_iops internal;
50069+ } s;
50070+};
50071+
50072+#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit)
50073+
50074+static inline item_id item_id_by_plugin(item_plugin * plugin)
50075+{
50076+ return plugin->h.id;
50077+}
50078+
50079+static inline char get_iplugid(item_plugin * iplug)
50080+{
50081+ assert("nikita-2838", iplug != NULL);
50082+ assert("nikita-2839", iplug->h.id < 0xff);
50083+ return (char)item_id_by_plugin(iplug);
50084+}
50085+
50086+extern unsigned long znode_times_locked(const znode * z);
50087+
50088+static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
50089+{
50090+ assert("nikita-2837", coord != NULL);
50091+ assert("nikita-2838", iplug != NULL);
50092+ coord->iplugid = get_iplugid(iplug);
50093+ ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
50094+}
50095+
50096+static inline item_plugin *coord_iplug(const coord_t * coord)
50097+{
50098+ assert("nikita-2833", coord != NULL);
50099+ assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
50100+ assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
50101+ return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
50102+ coord->iplugid);
50103+}
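+
+/*
+ * Note (illustrative, not part of the original patch): in debug builds
+ * coord_set_iplug() stamps the coord with znode_times_locked(); the assert
+ * in coord_iplug() then verifies that the cached plugin id is not reused
+ * after the node has been unlocked and relocked. item_body_by_coord()
+ * below applies the same check to the cached body offset via body_v.
+ */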
50104+
50105+extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
50106+ const reiser4_item_data *);
50107+extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
50108+extern int item_is_extent(const coord_t *);
50109+extern int item_is_tail(const coord_t *);
50110+extern int item_is_statdata(const coord_t * item);
50111+extern int item_is_ctail(const coord_t *);
50112+
50113+extern pos_in_node_t item_length_by_coord(const coord_t * coord);
50114+extern pos_in_node_t nr_units_single_unit(const coord_t * coord);
50115+extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
50116+extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
50117+extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
50118+extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
50119+extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
50120+ reiser4_key * key);
50121+extern void obtain_item_plugin(const coord_t * coord);
50122+
50123+#if defined(REISER4_DEBUG)
50124+extern int znode_is_loaded(const znode * node);
50125+#endif
50126+
50127+/* return plugin of item at @coord */
50128+static inline item_plugin *item_plugin_by_coord(const coord_t *
50129+ coord /* coord to query */ )
50130+{
50131+ assert("nikita-330", coord != NULL);
50132+ assert("nikita-331", coord->node != NULL);
50133+ assert("nikita-332", znode_is_loaded(coord->node));
50134+
50135+ if (unlikely(!coord_is_iplug_set(coord)))
50136+ obtain_item_plugin(coord);
50137+ return coord_iplug(coord);
50138+}
50139+
50140+/* this returns true if item is of internal type */
50141+static inline int item_is_internal(const coord_t * item)
50142+{
50143+ assert("vs-483", coord_is_existing_item(item));
50144+ return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE);
50145+}
50146+
50147+extern void item_body_by_coord_hard(coord_t * coord);
50148+extern void *item_body_by_coord_easy(const coord_t * coord);
50149+#if REISER4_DEBUG
50150+extern int item_body_is_valid(const coord_t * coord);
50151+#endif
50152+
50153+/* return pointer to item body */
50154+static inline void *item_body_by_coord(const coord_t *
50155+ coord /* coord to query */ )
50156+{
50157+ assert("nikita-324", coord != NULL);
50158+ assert("nikita-325", coord->node != NULL);
50159+ assert("nikita-326", znode_is_loaded(coord->node));
50160+
50161+ if (coord->offset == INVALID_OFFSET)
50162+ item_body_by_coord_hard((coord_t *) coord);
50163+ assert("nikita-3201", item_body_is_valid(coord));
50164+ assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
50165+ return item_body_by_coord_easy(coord);
50166+}
50167+
50168+/* __REISER4_ITEM_H__ */
50169+#endif
50170+/* Make Linus happy.
50171+ Local variables:
50172+ c-indentation-style: "K&R"
50173+ mode-name: "LC"
50174+ c-basic-offset: 8
50175+ tab-width: 8
50176+ fill-column: 120
50177+ scroll-step: 1
50178+ End:
50179+*/
50180diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/Makefile linux-2.6.24/fs/reiser4/plugin/item/Makefile
50181--- linux-2.6.24.orig/fs/reiser4/plugin/item/Makefile 1970-01-01 03:00:00.000000000 +0300
50182+++ linux-2.6.24/fs/reiser4/plugin/item/Makefile 2008-01-25 11:39:07.024230357 +0300
50183@@ -0,0 +1,18 @@
50184+obj-$(CONFIG_REISER4_FS) += item_plugins.o
50185+
50186+item_plugins-objs := \
50187+ item.o \
50188+ static_stat.o \
50189+ sde.o \
50190+ cde.o \
50191+ blackbox.o \
50192+ internal.o \
50193+ tail.o \
50194+ ctail.o \
50195+ extent.o \
50196+ extent_item_ops.o \
50197+ extent_file_ops.o \
50198+ extent_flush_ops.o
50199+
50200+
50201+
50202diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/sde.c linux-2.6.24/fs/reiser4/plugin/item/sde.c
50203--- linux-2.6.24.orig/fs/reiser4/plugin/item/sde.c 1970-01-01 03:00:00.000000000 +0300
50204+++ linux-2.6.24/fs/reiser4/plugin/item/sde.c 2008-01-25 11:39:07.024230357 +0300
50205@@ -0,0 +1,190 @@
50206+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50207+
50208+/* Directory entry implementation */
50209+#include "../../forward.h"
50210+#include "../../debug.h"
50211+#include "../../dformat.h"
50212+#include "../../kassign.h"
50213+#include "../../coord.h"
50214+#include "sde.h"
50215+#include "item.h"
50216+#include "../plugin.h"
50217+#include "../../znode.h"
50218+#include "../../carry.h"
50219+#include "../../tree.h"
50220+#include "../../inode.h"
50221+
50222+#include <linux/fs.h> /* for struct inode */
50223+#include <linux/dcache.h> /* for struct dentry */
50224+#include <linux/quotaops.h>
50225+
50226+/* ->extract_key() method of simple directory item plugin. */
50227+int extract_key_de(const coord_t * coord /* coord of item */ ,
50228+ reiser4_key * key /* resulting key */ )
50229+{
50230+ directory_entry_format *dent;
50231+
50232+ assert("nikita-1458", coord != NULL);
50233+ assert("nikita-1459", key != NULL);
50234+
50235+ dent = (directory_entry_format *) item_body_by_coord(coord);
50236+ assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
50237+ return extract_key_from_id(&dent->id, key);
50238+}
50239+
50240+int
50241+update_key_de(const coord_t * coord, const reiser4_key * key,
50242+ lock_handle * lh UNUSED_ARG)
50243+{
50244+ directory_entry_format *dent;
50245+ obj_key_id obj_id;
50246+ int result;
50247+
50248+ assert("nikita-2342", coord != NULL);
50249+ assert("nikita-2343", key != NULL);
50250+
50251+ dent = (directory_entry_format *) item_body_by_coord(coord);
50252+ result = build_obj_key_id(key, &obj_id);
50253+ if (result == 0) {
50254+ dent->id = obj_id;
50255+ znode_make_dirty(coord->node);
50256+ }
50257+ return result;
50258+}
50259+
50260+char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
50261+ char *buf)
50262+{
50263+ reiser4_key key;
50264+
50265+ unit_key_by_coord(coord, &key);
50266+ if (get_key_type(&key) != KEY_FILE_NAME_MINOR)
50267+ reiser4_print_address("oops", znode_get_block(coord->node));
50268+ if (!is_longname_key(&key)) {
50269+ if (is_dot_key(&key))
50270+ return (char *)".";
50271+ else
50272+ return extract_name_from_key(&key, buf);
50273+ } else
50274+ return (char *)dent->name;
50275+}
50276+
50277+/* ->extract_name() method of simple directory item plugin. */
50278+char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
50279+{
50280+ directory_entry_format *dent;
50281+
50282+ assert("nikita-1460", coord != NULL);
50283+
50284+ dent = (directory_entry_format *) item_body_by_coord(coord);
50285+ return extract_dent_name(coord, dent, buf);
50286+}
50287+
50288+/* ->extract_file_type() method of simple directory item plugin. */
50289+unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
50290+ * item */ )
50291+{
50292+ assert("nikita-1764", coord != NULL);
50293+ /* we don't store file type in the directory entry yet.
50294+
50295+ But see comments at kassign.h:obj_key_id
50296+ */
50297+ return DT_UNKNOWN;
50298+}
50299+
50300+int add_entry_de(struct inode *dir /* directory of item */ ,
50301+ coord_t * coord /* coord of item */ ,
50302+ lock_handle * lh /* insertion lock handle */ ,
50303+ const struct dentry *de /* name to add */ ,
50304+ reiser4_dir_entry_desc * entry /* parameters of new directory
50305+ * entry */ )
50306+{
50307+ reiser4_item_data data;
50308+ directory_entry_format *dent;
50309+ int result;
50310+ const char *name;
50311+ int len;
50312+ int longname;
50313+
50314+ name = de->d_name.name;
50315+ len = de->d_name.len;
50316+ assert("nikita-1163", strlen(name) == len);
50317+
50318+ longname = is_longname(name, len);
50319+
50320+ data.length = sizeof *dent;
50321+ if (longname)
50322+ data.length += len + 1;
50323+ data.data = NULL;
50324+ data.user = 0;
50325+ data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
50326+
50327+ /* NOTE-NIKITA quota plugin */
50328+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
50329+ return -EDQUOT;
50330+
50331+ result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
50332+ if (result != 0)
50333+ return result;
50334+
50335+ dent = (directory_entry_format *) item_body_by_coord(coord);
50336+ build_inode_key_id(entry->obj, &dent->id);
50337+ if (longname) {
50338+ memcpy(dent->name, name, len);
50339+ put_unaligned(0, &dent->name[len]);
50340+ }
50341+ return 0;
50342+}
50343+
50344+int rem_entry_de(struct inode *dir /* directory of item */ ,
50345+ const struct qstr *name UNUSED_ARG,
50346+ coord_t * coord /* coord of item */ ,
50347+ lock_handle * lh UNUSED_ARG /* lock handle for
50348+ * removal */ ,
50349+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
50350+ * directory entry
50351+ * being removed */ )
50352+{
50353+ coord_t shadow;
50354+ int result;
50355+ int length;
50356+
50357+ length = item_length_by_coord(coord);
50358+ if (inode_get_bytes(dir) < length) {
50359+ warning("nikita-2627", "Dir is broke: %llu: %llu",
50360+ (unsigned long long)get_inode_oid(dir),
50361+ inode_get_bytes(dir));
50362+
50363+ return RETERR(-EIO);
50364+ }
50365+
50366+ /* cut_node() is supposed to take pointers to _different_
50367+ coords, because it will modify them with no regard for
50368+ possible aliasing. To work around this, create a temporary copy
50369+ of @coord.
50370+ */
50371+ coord_dup(&shadow, coord);
50372+ result =
50373+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
50374+ if (result == 0) {
50375+ /* NOTE-NIKITA quota plugin */
50376+ DQUOT_FREE_SPACE_NODIRTY(dir, length);
50377+ }
50378+ return result;
50379+}
50380+
50381+int max_name_len_de(const struct inode *dir)
50382+{
50383+ return reiser4_tree_by_inode(dir)->nplug->max_item_size() -
50384+ sizeof(directory_entry_format) - 2;
50385+}
50386+
50387+/* Make Linus happy.
50388+ Local variables:
50389+ c-indentation-style: "K&R"
50390+ mode-name: "LC"
50391+ c-basic-offset: 8
50392+ tab-width: 8
50393+ fill-column: 120
50394+ End:
50395+*/
50396diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/sde.h linux-2.6.24/fs/reiser4/plugin/item/sde.h
50397--- linux-2.6.24.orig/fs/reiser4/plugin/item/sde.h 1970-01-01 03:00:00.000000000 +0300
50398+++ linux-2.6.24/fs/reiser4/plugin/item/sde.h 2008-01-25 11:39:07.024230357 +0300
50399@@ -0,0 +1,66 @@
50400+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50401+
50402+/* Directory entry. */
50403+
50404+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
50405+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
50406+
50407+#include "../../forward.h"
50408+#include "../../dformat.h"
50409+#include "../../kassign.h"
50410+#include "../../key.h"
50411+
50412+#include <linux/fs.h>
50413+#include <linux/dcache.h> /* for struct dentry */
50414+
50415+typedef struct directory_entry_format {
50416+ /* key of object stat-data. It is not necessary to store the whole
50417+ key here, because it is always a stat-data key, so the minor
50418+ packing locality and the offset can be omitted. But this
50419+ relies on a particular key allocation scheme for stat-data, so,
50420+ for extensibility's sake, the whole key can be stored here.
50421+
50422+ We store the key as an array of bytes, because we don't want
50423+ 8-byte alignment of dir entries.
50424+ */
50425+ obj_key_id id;
50426+ /* file name. Null terminated string. */
50427+ d8 name[0];
50428+} directory_entry_format;
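/*
 * Editor's note: an illustrative, userspace-only sketch (not part of the
 * original patch) of the size arithmetic implied by the layout above. An
 * entry is a fixed obj_key_id plus, for long names only, the name body and
 * its terminating NUL; short names are encoded entirely in the entry key
 * and add no bytes here. OBJ_KEY_ID_SIZE is a hypothetical stand-in for
 * sizeof(obj_key_id), mirroring the data.length computation in
 * add_entry_de().
 */
#include <stddef.h>
#include <string.h>

enum { OBJ_KEY_ID_SIZE = 16 };	/* assumed; see kassign.h:obj_key_id */

static size_t sde_on_disk_size(const char *name, int is_longname)
{
	size_t size = OBJ_KEY_ID_SIZE;		/* the 'id' field */

	if (is_longname)
		size += strlen(name) + 1;	/* 'name' body + NUL */
	return size;
}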
50429+
50430+void print_de(const char *prefix, coord_t * coord);
50431+int extract_key_de(const coord_t * coord, reiser4_key * key);
50432+int update_key_de(const coord_t * coord, const reiser4_key * key,
50433+ lock_handle * lh);
50434+char *extract_name_de(const coord_t * coord, char *buf);
50435+unsigned extract_file_type_de(const coord_t * coord);
50436+int add_entry_de(struct inode *dir, coord_t * coord,
50437+ lock_handle * lh, const struct dentry *name,
50438+ reiser4_dir_entry_desc * entry);
50439+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
50440+ lock_handle * lh, reiser4_dir_entry_desc * entry);
50441+int max_name_len_de(const struct inode *dir);
50442+
50443+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
50444+
50445+char *extract_dent_name(const coord_t * coord,
50446+ directory_entry_format * dent, char *buf);
50447+
50448+#if REISER4_LARGE_KEY
50449+#define DE_NAME_BUF_LEN (24)
50450+#else
50451+#define DE_NAME_BUF_LEN (16)
50452+#endif
50453+
50454+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
50455+#endif
50456+
50457+/* Make Linus happy.
50458+ Local variables:
50459+ c-indentation-style: "K&R"
50460+ mode-name: "LC"
50461+ c-basic-offset: 8
50462+ tab-width: 8
50463+ fill-column: 120
50464+ End:
50465+*/
50466diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/static_stat.c linux-2.6.24/fs/reiser4/plugin/item/static_stat.c
50467--- linux-2.6.24.orig/fs/reiser4/plugin/item/static_stat.c 1970-01-01 03:00:00.000000000 +0300
50468+++ linux-2.6.24/fs/reiser4/plugin/item/static_stat.c 2008-01-25 11:39:07.024230357 +0300
50469@@ -0,0 +1,1107 @@
50470+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50471+
50472+/* stat data manipulation. */
50473+
50474+#include "../../forward.h"
50475+#include "../../super.h"
50476+#include "../../vfs_ops.h"
50477+#include "../../inode.h"
50478+#include "../../debug.h"
50479+#include "../../dformat.h"
50480+#include "../object.h"
50481+#include "../plugin.h"
50482+#include "../plugin_header.h"
50483+#include "static_stat.h"
50484+#include "item.h"
50485+
50486+#include <linux/types.h>
50487+#include <linux/fs.h>
50488+
50489+/* see static_stat.h for explanation */
50490+
50491+/* helper function used while we are dumping/loading inode/plugin state
50492+ to/from the stat-data. */
50493+
50494+static void move_on(int *length /* space remaining in stat-data */ ,
50495+ char **area /* current coord in stat data */ ,
50496+ int size_of /* how many bytes to move forward */ )
50497+{
50498+ assert("nikita-615", length != NULL);
50499+ assert("nikita-616", area != NULL);
50500+
50501+ *length -= size_of;
50502+ *area += size_of;
50503+
50504+ assert("nikita-617", *length >= 0);
50505+}
50506+
50507+/* helper function used while loading inode/plugin state from stat-data.
50508+ Complain if there is less space in stat-data than was expected.
50509+ Can only happen on disk corruption. */
50510+static int not_enough_space(struct inode *inode /* object being processed */ ,
50511+ const char *where /* error message */ )
50512+{
50513+ assert("nikita-618", inode != NULL);
50514+
50515+ warning("nikita-619", "Not enough space in %llu while loading %s",
50516+ (unsigned long long)get_inode_oid(inode), where);
50517+
50518+ return RETERR(-EINVAL);
50519+}
50520+
50521+/* helper function used while loading inode/plugin state from
50522+ stat-data. Call it if invalid plugin id was found. */
50523+static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
50524+ struct inode *inode /* object being processed */ )
50525+{
50526+ warning("nikita-620", "Unknown plugin %i in %llu",
50527+ id, (unsigned long long)get_inode_oid(inode));
50528+
50529+ return RETERR(-EINVAL);
50530+}
50531+
50532+/* this is installed as ->init_inode() method of
50533+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
50534+ Copies data from on-disk stat-data format into inode.
50535+ Handles stat-data extensions. */
50536+/* was sd_load */
50537+int init_inode_static_sd(struct inode *inode /* object being processed */ ,
50538+ char *sd /* stat-data body */ ,
50539+ int len /* length of stat-data */ )
50540+{
50541+ int result;
50542+ int bit;
50543+ int chunk;
50544+ __u16 mask;
50545+ __u64 bigmask;
50546+ reiser4_stat_data_base *sd_base;
50547+ reiser4_inode *state;
50548+
50549+ assert("nikita-625", inode != NULL);
50550+ assert("nikita-626", sd != NULL);
50551+
50552+ result = 0;
50553+ sd_base = (reiser4_stat_data_base *) sd;
50554+ state = reiser4_inode_data(inode);
50555+ mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
50556+ bigmask = mask;
50557+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
50558+
50559+ move_on(&len, &sd, sizeof *sd_base);
50560+ for (bit = 0, chunk = 0;
50561+ mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
50562+ ++bit, mask >>= 1) {
50563+ if (((bit + 1) % 16) != 0) {
50564+ /* handle extension */
50565+ sd_ext_plugin *sdplug;
50566+
50567+ if (bit >= LAST_SD_EXTENSION) {
50568+ warning("vpf-1904",
50569+ "No such extension %i in inode %llu",
50570+ bit,
50571+ (unsigned long long)
50572+ get_inode_oid(inode));
50573+
50574+ result = RETERR(-EINVAL);
50575+ break;
50576+ }
50577+
50578+ sdplug = sd_ext_plugin_by_id(bit);
50579+ if (sdplug == NULL) {
50580+ warning("nikita-627",
50581+ "No such extension %i in inode %llu",
50582+ bit,
50583+ (unsigned long long)
50584+ get_inode_oid(inode));
50585+
50586+ result = RETERR(-EINVAL);
50587+ break;
50588+ }
50589+ if (mask & 1) {
50590+ assert("nikita-628", sdplug->present);
50591+ /* alignment is not supported in node layout
50592+ plugin yet.
50593+ result = align( inode, &len, &sd,
50594+ sdplug -> alignment );
50595+ if( result != 0 )
50596+ return result; */
50597+ result = sdplug->present(inode, &sd, &len);
50598+ } else if (sdplug->absent != NULL)
50599+ result = sdplug->absent(inode);
50600+ if (result)
50601+ break;
50602+ /* else, we are looking at the last bit in 16-bit
50603+ portion of bitmask */
50604+ } else if (mask & 1) {
50605+ /* next portion of bitmask */
50606+ if (len < (int)sizeof(d16)) {
50607+ warning("nikita-629",
50608+ "No space for bitmap in inode %llu",
50609+ (unsigned long long)
50610+ get_inode_oid(inode));
50611+
50612+ result = RETERR(-EINVAL);
50613+ break;
50614+ }
50615+ mask = le16_to_cpu(get_unaligned((d16 *)sd));
50616+ bigmask <<= 16;
50617+ bigmask |= mask;
50618+ move_on(&len, &sd, sizeof(d16));
50619+ ++chunk;
50620+ if (chunk == 3) {
50621+ if (!(mask & 0x8000)) {
50622+ /* clear last bit */
50623+ mask &= ~0x8000;
50624+ continue;
50625+ }
50626+ /* too much */
50627+ warning("nikita-630",
50628+ "Too many extensions in %llu",
50629+ (unsigned long long)
50630+ get_inode_oid(inode));
50631+
50632+ result = RETERR(-EINVAL);
50633+ break;
50634+ }
50635+ } else
50636+ /* bitmask exhausted */
50637+ break;
50638+ }
50639+ state->extmask = bigmask;
50640+ /* common initialisations */
50641+ if (len - (bit / 16 * sizeof(d16)) > 0) {
50642+ /* alignment in save_len_static_sd() is taken into account
50643+ -edward */
50644+ warning("nikita-631", "unused space in inode %llu",
50645+ (unsigned long long)get_inode_oid(inode));
50646+ }
50647+
50648+ return result;
50649+}
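/*
 * Editor's note: a minimal userspace sketch (not part of the original
 * patch) of the extension-mask chaining decoded by the loop above. The top
 * bit of each little-endian 16-bit word is a continuation flag: when set,
 * another word with further extension bits follows, for at most four words
 * in total (the base word plus three continuations, matching the chunk == 3
 * check above). decode_extmask() is a hypothetical name.
 */
#include <stdint.h>

static uint64_t decode_extmask(const uint8_t *sd)
{
	uint64_t bigmask = 0;
	uint16_t word;
	int chunk = 0;

	do {
		/* read one little-endian 16-bit mask word */
		word = (uint16_t)sd[0] | ((uint16_t)sd[1] << 8);
		sd += 2;
		/* accumulate exactly as init_inode_static_sd() does */
		bigmask = (bigmask << 16) | word;
	} while ((word & 0x8000) && ++chunk < 4);

	return bigmask;
}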
50650+
50651+/* estimates size of stat-data required to store inode.
50652+ Installed as ->save_len() method of
50653+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
50654+/* was sd_len */
50655+int save_len_static_sd(struct inode *inode /* object being processed */ )
50656+{
50657+ unsigned int result;
50658+ __u64 mask;
50659+ int bit;
50660+
50661+ assert("nikita-632", inode != NULL);
50662+
50663+ result = sizeof(reiser4_stat_data_base);
50664+ mask = reiser4_inode_data(inode)->extmask;
50665+ for (bit = 0; mask != 0; ++bit, mask >>= 1) {
50666+ if (mask & 1) {
50667+ sd_ext_plugin *sdplug;
50668+
50669+ sdplug = sd_ext_plugin_by_id(bit);
50670+ assert("nikita-633", sdplug != NULL);
50671+ /* no alignment support
50672+ result +=
50673+ round_up( result, sdplug -> alignment ) - result; */
50674+ result += sdplug->save_len(inode);
50675+ }
50676+ }
50677+ result += bit / 16 * sizeof(d16);
50678+ return result;
50679+}
50680+
50681+/* saves inode into stat-data.
50682+ Installed as ->save() method of
50683+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
50684+/* was sd_save */
50685+int save_static_sd(struct inode *inode /* object being processed */ ,
50686+ char **area /* where to save stat-data */ )
50687+{
50688+ int result;
50689+ __u64 emask;
50690+ int bit;
50691+ unsigned int len;
50692+ reiser4_stat_data_base *sd_base;
50693+
50694+ assert("nikita-634", inode != NULL);
50695+ assert("nikita-635", area != NULL);
50696+
50697+ result = 0;
50698+ emask = reiser4_inode_data(inode)->extmask;
50699+ sd_base = (reiser4_stat_data_base *) * area;
50700+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
50701+ /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/
50702+
50703+ *area += sizeof *sd_base;
50704+ len = 0xffffffffu;
50705+ for (bit = 0; emask != 0; ++bit, emask >>= 1) {
50706+ if (emask & 1) {
50707+ if ((bit + 1) % 16 != 0) {
50708+ sd_ext_plugin *sdplug;
50709+ sdplug = sd_ext_plugin_by_id(bit);
50710+ assert("nikita-636", sdplug != NULL);
50711+ /* no alignment support yet
50712+ align( inode, &len, area,
50713+ sdplug -> alignment ); */
50714+ result = sdplug->save(inode, area);
50715+ if (result)
50716+ break;
50717+ } else {
50718+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
50719+ (d16 *)(*area));
50720+ /*cputod16((unsigned)(emask & 0xffff),
50721+ (d16 *) * area);*/
50722+ *area += sizeof(d16);
50723+ }
50724+ }
50725+ }
50726+ return result;
50727+}
50728+
50729+/* stat-data extension handling functions. */
50730+
50731+static int present_lw_sd(struct inode *inode /* object being processed */ ,
50732+ char **area /* position in stat-data */ ,
50733+ int *len /* remaining length */ )
50734+{
50735+ if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
50736+ reiser4_light_weight_stat *sd_lw;
50737+
50738+ sd_lw = (reiser4_light_weight_stat *) * area;
50739+
50740+ inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
50741+ inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
50742+ inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
50743+ if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
50744+ inode->i_mode &= ~S_IFIFO;
50745+ warning("", "partially converted file is encountered");
50746+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
50747+ }
50748+ move_on(len, area, sizeof *sd_lw);
50749+ return 0;
50750+ } else
50751+ return not_enough_space(inode, "lw sd");
50752+}
50753+
50754+static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being
50755+ * processed */ )
50756+{
50757+ return sizeof(reiser4_light_weight_stat);
50758+}
50759+
50760+static int save_lw_sd(struct inode *inode /* object being processed */ ,
50761+ char **area /* position in stat-data */ )
50762+{
50763+ reiser4_light_weight_stat *sd;
50764+ mode_t delta;
50765+
50766+ assert("nikita-2705", inode != NULL);
50767+ assert("nikita-2706", area != NULL);
50768+ assert("nikita-2707", *area != NULL);
50769+
50770+ sd = (reiser4_light_weight_stat *) * area;
50771+
50772+ delta = (reiser4_inode_get_flag(inode,
50773+ REISER4_PART_MIXED) ? S_IFIFO : 0);
50774+ put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
50775+ put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
50776+ put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
50777+ *area += sizeof *sd;
50778+ return 0;
50779+}
50780+
50781+static int present_unix_sd(struct inode *inode /* object being processed */ ,
50782+ char **area /* position in stat-data */ ,
50783+ int *len /* remaining length */ )
50784+{
50785+ assert("nikita-637", inode != NULL);
50786+ assert("nikita-638", area != NULL);
50787+ assert("nikita-639", *area != NULL);
50788+ assert("nikita-640", len != NULL);
50789+ assert("nikita-641", *len > 0);
50790+
50791+ if (*len >= (int)sizeof(reiser4_unix_stat)) {
50792+ reiser4_unix_stat *sd;
50793+
50794+ sd = (reiser4_unix_stat *) * area;
50795+
50796+ inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
50797+ inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
50798+ inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
50799+ inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
50800+ inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
50801+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
50802+ inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
50803+ else
50804+ inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
50805+ move_on(len, area, sizeof *sd);
50806+ return 0;
50807+ } else
50808+ return not_enough_space(inode, "unix sd");
50809+}
50810+
50811+static int absent_unix_sd(struct inode *inode /* object being processed */ )
50812+{
50813+ inode->i_uid = get_super_private(inode->i_sb)->default_uid;
50814+ inode->i_gid = get_super_private(inode->i_sb)->default_gid;
50815+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
50816+ inode_set_bytes(inode, inode->i_size);
50817+ /* mark inode as lightweight, so that caller (lookup_common) will
50818+ complete initialisation by copying [ug]id from a parent. */
50819+ reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
50820+ return 0;
50821+}
50822+
50823+/* Audited by: green(2002.06.14) */
50824+static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being
50825+ * processed */ )
50826+{
50827+ return sizeof(reiser4_unix_stat);
50828+}
50829+
50830+static int save_unix_sd(struct inode *inode /* object being processed */ ,
50831+ char **area /* position in stat-data */ )
50832+{
50833+ reiser4_unix_stat *sd;
50834+
50835+ assert("nikita-642", inode != NULL);
50836+ assert("nikita-643", area != NULL);
50837+ assert("nikita-644", *area != NULL);
50838+
50839+ sd = (reiser4_unix_stat *) * area;
50840+ put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
50841+ put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
50842+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
50843+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
50844+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
50845+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
50846+ put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
50847+ else
50848+ put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
50849+ *area += sizeof *sd;
50850+ return 0;
50851+}
50852+
50853+static int
50854+present_large_times_sd(struct inode *inode /* object being processed */ ,
50855+ char **area /* position in stat-data */ ,
50856+ int *len /* remaining length */ )
50857+{
50858+ if (*len >= (int)sizeof(reiser4_large_times_stat)) {
50859+ reiser4_large_times_stat *sd_lt;
50860+
50861+ sd_lt = (reiser4_large_times_stat *) * area;
50862+
50863+ inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
50864+ inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
50865+ inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
50866+
50867+ move_on(len, area, sizeof *sd_lt);
50868+ return 0;
50869+ } else
50870+ return not_enough_space(inode, "large times sd");
50871+}
50872+
50873+static int
50874+save_len_large_times_sd(struct inode *inode UNUSED_ARG
50875+ /* object being processed */ )
50876+{
50877+ return sizeof(reiser4_large_times_stat);
50878+}
50879+
50880+static int
50881+save_large_times_sd(struct inode *inode /* object being processed */ ,
50882+ char **area /* position in stat-data */ )
50883+{
50884+ reiser4_large_times_stat *sd;
50885+
50886+ assert("nikita-2817", inode != NULL);
50887+ assert("nikita-2818", area != NULL);
50888+ assert("nikita-2819", *area != NULL);
50889+
50890+ sd = (reiser4_large_times_stat *) * area;
50891+
50892+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
50893+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
50894+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
50895+
50896+ *area += sizeof *sd;
50897+ return 0;
50898+}
50899+
50900+/* symlink stat data extension */
50901+
50902+/* allocate memory for symlink target and attach it to inode->i_private */
50903+static int
50904+symlink_target_to_inode(struct inode *inode, const char *target, int len)
50905+{
50906+ assert("vs-845", inode->i_private == NULL);
50907+ assert("vs-846", !reiser4_inode_get_flag(inode,
50908+ REISER4_GENERIC_PTR_USED));
50909+ /* FIXME-VS: this is prone to deadlock. Not more than other similar
50910+ places, though */
50911+ inode->i_private = kmalloc((size_t) len + 1,
50912+ reiser4_ctx_gfp_mask_get());
50913+ if (!inode->i_private)
50914+ return RETERR(-ENOMEM);
50915+
50916+ memcpy((char *)(inode->i_private), target, (size_t) len);
50917+ ((char *)(inode->i_private))[len] = 0;
50918+ reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
50919+ return 0;
50920+}
50921+
50922+/* this is called on read_inode. There is nothing to do here, actually,
50923+ beyond some sanity checks */
50924+static int present_symlink_sd(struct inode *inode, char **area, int *len)
50925+{
50926+ int result;
50927+ int length;
50928+ reiser4_symlink_stat *sd;
50929+
50930+ length = (int)inode->i_size;
50931+ /*
50932+ * *len is the number of bytes in the stat data item from *area to the end
50933+ * of the item. It must be at least the size of the symlink plus 1 for the ending 0
50934+ */
50935+ if (length > *len)
50936+ return not_enough_space(inode, "symlink");
50937+
50938+ if (*(*area + length) != 0) {
50939+ warning("vs-840", "Symlink is not zero terminated");
50940+ return RETERR(-EIO);
50941+ }
50942+
50943+ sd = (reiser4_symlink_stat *) * area;
50944+ result = symlink_target_to_inode(inode, sd->body, length);
50945+
50946+ move_on(len, area, length + 1);
50947+ return result;
50948+}
50949+
50950+static int save_len_symlink_sd(struct inode *inode)
50951+{
50952+ return inode->i_size + 1;
50953+}
50954+
50955+/* this is called when stat data is created or updated. Do nothing on update
50956+ except advancing @area */
50957+static int save_symlink_sd(struct inode *inode, char **area)
50958+{
50959+ int result;
50960+ int length;
50961+ reiser4_symlink_stat *sd;
50962+
50963+ length = (int)inode->i_size;
50964+ /* inode->i_size must be set already */
50965+ assert("vs-841", length);
50966+
50967+ result = 0;
50968+ sd = (reiser4_symlink_stat *) * area;
50969+ if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
50970+ const char *target;
50971+
50972+ target = (const char *)(inode->i_private);
50973+ inode->i_private = NULL;
50974+
50975+ result = symlink_target_to_inode(inode, target, length);
50976+
50977+ /* copy symlink to stat data */
50978+ memcpy(sd->body, target, (size_t) length);
50979+ (*area)[length] = 0;
50980+ } else {
50981+ /* there is nothing to do in update but move area */
50982+ assert("vs-844",
50983+ !memcmp(inode->i_private, sd->body,
50984+ (size_t) length + 1));
50985+ }
50986+
50987+ *area += (length + 1);
50988+ return result;
50989+}
50990+
50991+static int present_flags_sd(struct inode *inode /* object being processed */ ,
50992+ char **area /* position in stat-data */ ,
50993+ int *len /* remaining length */ )
50994+{
50995+ assert("nikita-645", inode != NULL);
50996+ assert("nikita-646", area != NULL);
50997+ assert("nikita-647", *area != NULL);
50998+ assert("nikita-648", len != NULL);
50999+ assert("nikita-649", *len > 0);
51000+
51001+ if (*len >= (int)sizeof(reiser4_flags_stat)) {
51002+ reiser4_flags_stat *sd;
51003+
51004+ sd = (reiser4_flags_stat *) * area;
51005+ inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
51006+ move_on(len, area, sizeof *sd);
51007+ return 0;
51008+ } else
51009+ return not_enough_space(inode, "generation and attrs");
51010+}
51011+
51012+/* Audited by: green(2002.06.14) */
51013+static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being
51014+ * processed */ )
51015+{
51016+ return sizeof(reiser4_flags_stat);
51017+}
51018+
51019+static int save_flags_sd(struct inode *inode /* object being processed */ ,
51020+ char **area /* position in stat-data */ )
51021+{
51022+ reiser4_flags_stat *sd;
51023+
51024+ assert("nikita-650", inode != NULL);
51025+ assert("nikita-651", area != NULL);
51026+ assert("nikita-652", *area != NULL);
51027+
51028+ sd = (reiser4_flags_stat *) * area;
51029+ put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
51030+ *area += sizeof *sd;
51031+ return 0;
51032+}
51033+
51034+static int absent_plugin_sd(struct inode *inode);
51035+static int present_plugin_sd(struct inode *inode /* object being processed */ ,
51036+ char **area /* position in stat-data */ ,
51037+ int *len /* remaining length */,
51038+ int is_pset /* 1 if plugin set, 0 if heir set. */)
51039+{
51040+ reiser4_plugin_stat *sd;
51041+ reiser4_plugin *plugin;
51042+ reiser4_inode *info;
51043+ int i;
51044+ __u16 mask;
51045+ int result;
51046+ int num_of_plugins;
51047+
51048+ assert("nikita-653", inode != NULL);
51049+ assert("nikita-654", area != NULL);
51050+ assert("nikita-655", *area != NULL);
51051+ assert("nikita-656", len != NULL);
51052+ assert("nikita-657", *len > 0);
51053+
51054+ if (*len < (int)sizeof(reiser4_plugin_stat))
51055+ return not_enough_space(inode, "plugin");
51056+
51057+ sd = (reiser4_plugin_stat *) * area;
51058+ info = reiser4_inode_data(inode);
51059+
51060+ mask = 0;
51061+ num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
51062+ move_on(len, area, sizeof *sd);
51063+ result = 0;
51064+ for (i = 0; i < num_of_plugins; ++i) {
51065+ reiser4_plugin_slot *slot;
51066+ reiser4_plugin_type type;
51067+ pset_member memb;
51068+
51069+ slot = (reiser4_plugin_slot *) * area;
51070+ if (*len < (int)sizeof *slot)
51071+ return not_enough_space(inode, "additional plugin");
51072+
51073+ memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
51074+ type = aset_member_to_type_unsafe(memb);
51075+
51076+ if (type == REISER4_PLUGIN_TYPES) {
51077+ warning("nikita-3502",
51078+ "wrong %s member (%i) for %llu", is_pset ?
51079+ "pset" : "hset", memb,
51080+ (unsigned long long)get_inode_oid(inode));
51081+ return RETERR(-EINVAL);
51082+ }
51083+ plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode),
51084+ type, &slot->id);
51085+ if (plugin == NULL)
51086+ return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
51087+
51088+ /* plugin is loaded into the inode; record this in the inode's
51089+ bitmask of loaded non-standard plugins */
51090+ if (!(mask & (1 << memb))) {
51091+ mask |= (1 << memb);
51092+ } else {
51093+ warning("nikita-658", "duplicate plugin for %llu",
51094+ (unsigned long long)get_inode_oid(inode));
51095+ return RETERR(-EINVAL);
51096+ }
51097+ move_on(len, area, sizeof *slot);
51098+ /* load plugin data, if any */
51099+ if (plugin->h.pops != NULL && plugin->h.pops->load)
51100+ result = plugin->h.pops->load(inode, plugin, area, len);
51101+ else
51102+ result = aset_set_unsafe(is_pset ? &info->pset :
51103+ &info->hset, memb, plugin);
51104+ if (result)
51105+ return result;
51106+ }
51107+ if (is_pset) {
51108+ /* if object plugin wasn't loaded from stat-data, guess it by
51109+ mode bits */
51110+ plugin = file_plugin_to_plugin(inode_file_plugin(inode));
51111+ if (plugin == NULL)
51112+ result = absent_plugin_sd(inode);
51113+ info->plugin_mask = mask;
51114+ } else
51115+ info->heir_mask = mask;
51116+
51117+ return result;
51118+}
51119+
51120+static int present_pset_sd(struct inode *inode, char **area, int *len) {
51121+ return present_plugin_sd(inode, area, len, 1 /* pset */);
51122+}
51123+
51124+/* Determine object plugin for @inode based on i_mode.
51125+
51126+ Many objects in the reiser4 file system are controlled by standard object
51127+ plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
51128+
51129+ For such files we don't explicitly store the plugin id in the object stat
51130+ data. Rather, the required plugin is guessed from the mode bits, where the file "type"
51131+ is encoded (see stat(2)).
51132+*/
51133+static int
51134+guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
51135+{
51136+ int fplug_id;
51137+ int dplug_id;
51138+ reiser4_inode *info;
51139+
51140+ assert("nikita-736", inode != NULL);
51141+
51142+ dplug_id = fplug_id = -1;
51143+
51144+ switch (inode->i_mode & S_IFMT) {
51145+ case S_IFSOCK:
51146+ case S_IFBLK:
51147+ case S_IFCHR:
51148+ case S_IFIFO:
51149+ fplug_id = SPECIAL_FILE_PLUGIN_ID;
51150+ break;
51151+ case S_IFLNK:
51152+ fplug_id = SYMLINK_FILE_PLUGIN_ID;
51153+ break;
51154+ case S_IFDIR:
51155+ fplug_id = DIRECTORY_FILE_PLUGIN_ID;
51156+ dplug_id = HASHED_DIR_PLUGIN_ID;
51157+ break;
51158+ default:
51159+ warning("nikita-737", "wrong file mode: %o", inode->i_mode);
51160+ return RETERR(-EIO);
51161+ case S_IFREG:
51162+ fplug_id = UNIX_FILE_PLUGIN_ID;
51163+ break;
51164+ }
51165+ info = reiser4_inode_data(inode);
51166+ set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ?
51167+ plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL);
51168+ set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ?
51169+ plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL);
51170+ return 0;
51171+}
51172+
51173+/* Audited by: green(2002.06.14) */
51174+static int absent_plugin_sd(struct inode *inode /* object being processed */ )
51175+{
51176+ int result;
51177+
51178+ assert("nikita-659", inode != NULL);
51179+
51180+ result = guess_plugin_by_mode(inode);
51181+ /* if mode was wrong, guess_plugin_by_mode() returns "regular file",
51182+ but setup_inode_ops() will call make_bad_inode().
51183+ Another, more logical but a bit more complex solution is to add
51184+ a "bad-file" plugin. */
51185+ /* FIXME-VS: activate was called here */
51186+ return result;
51187+}
51188+
51189+/* helper function for save_len_plugin_sd(): calculate how much space is
51190+ required to save the state of a given plugin */
51191+/* Audited by: green(2002.06.14) */
51192+static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
51193+ struct inode *inode /* object being processed */ ,
51194+ pset_member memb,
51195+ int len, int is_pset)
51196+{
51197+ reiser4_inode *info;
51198+ assert("nikita-661", inode != NULL);
51199+
51200+ if (plugin == NULL)
51201+ return len;
51202+
51203+ info = reiser4_inode_data(inode);
51204+ if (is_pset ?
51205+ info->plugin_mask & (1 << memb) :
51206+ info->heir_mask & (1 << memb)) {
51207+ len += sizeof(reiser4_plugin_slot);
51208+ if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
51209+ /* non-standard plugin, call method */
51210+ /* commented as it is incompatible with alignment
51211+ * policy in save_plug() -edward */
51212+ /* len = round_up(len, plugin->h.pops->alignment); */
51213+ len += plugin->h.pops->save_len(inode, plugin);
51214+ }
51215+ }
51216+ return len;
51217+}
51218+
51219+/* calculate how much space is required to save state of all plugins,
51220+ associated with inode */
51221+static int save_len_plugin_sd(struct inode *inode /* object being processed */,
51222+ int is_pset)
51223+{
51224+ int len;
51225+ int last;
51226+ reiser4_inode *state;
51227+ pset_member memb;
51228+
51229+ assert("nikita-663", inode != NULL);
51230+
51231+ state = reiser4_inode_data(inode);
51232+
51233+ /* common case: no non-standard plugins */
51234+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
51235+ return 0;
51236+ len = sizeof(reiser4_plugin_stat);
51237+ last = PSET_LAST;
51238+
51239+ for (memb = 0; memb < last; ++memb) {
51240+ len = len_for(aset_get(is_pset ? state->pset : state->hset, memb),
51241+ inode, memb, len, is_pset);
51242+ }
51243+ assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
51244+ return len;
51245+}
51246+
51247+static int save_len_pset_sd(struct inode *inode) {
51248+ return save_len_plugin_sd(inode, 1 /* pset */);
51249+}
51250+
51251+/* helper function for save_plugin_sd(): save a plugin associated with
51252+ an inode. */
51253+static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
51254+ struct inode *inode /* object being processed */ ,
51255+ int memb /* what element of pset is saved */ ,
51256+ char **area /* position in stat-data */ ,
51257+ int *count /* incremented if the plugin was actually saved. */,
51258+ int is_pset /* 1 for plugin set, 0 for heir set */)
51259+{
51260+ reiser4_plugin_slot *slot;
51261+ int fake_len;
51262+ int result;
51263+
51264+ assert("nikita-665", inode != NULL);
51265+ assert("nikita-666", area != NULL);
51266+ assert("nikita-667", *area != NULL);
51267+
51268+ if (plugin == NULL)
51269+ return 0;
51270+
51271+ if (is_pset ?
51272+ !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) :
51273+ !(reiser4_inode_data(inode)->heir_mask & (1 << memb)))
51274+ return 0;
51275+ slot = (reiser4_plugin_slot *) * area;
51276+ put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
51277+ put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
51278+ fake_len = (int)0xffff;
51279+ move_on(&fake_len, area, sizeof *slot);
51280+ ++*count;
51281+ result = 0;
51282+ if (plugin->h.pops != NULL) {
51283+ if (plugin->h.pops->save != NULL)
51284+ result = plugin->h.pops->save(inode, plugin, area);
51285+ }
51286+ return result;
51287+}
51288+
51289+/* save state of all non-standard plugins associated with inode */
51290+static int save_plugin_sd(struct inode *inode /* object being processed */ ,
51291+ char **area /* position in stat-data */,
51292+ int is_pset /* 1 for pset, 0 for hset */)
51293+{
51294+ int fake_len;
51295+ int result = 0;
51296+ int num_of_plugins;
51297+ reiser4_plugin_stat *sd;
51298+ reiser4_inode *state;
51299+ pset_member memb;
51300+
51301+ assert("nikita-669", inode != NULL);
51302+ assert("nikita-670", area != NULL);
51303+ assert("nikita-671", *area != NULL);
51304+
51305+ state = reiser4_inode_data(inode);
51306+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
51307+ return 0;
51308+ sd = (reiser4_plugin_stat *) * area;
51309+ fake_len = (int)0xffff;
51310+ move_on(&fake_len, area, sizeof *sd);
51311+
51312+ num_of_plugins = 0;
51313+ for (memb = 0; memb < PSET_LAST; ++memb) {
51314+ result = save_plug(aset_get(is_pset ? state->pset : state->hset,
51315+ memb),
51316+ inode, memb, area, &num_of_plugins, is_pset);
51317+ if (result != 0)
51318+ break;
51319+ }
51320+
51321+ put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
51322+ return result;
51323+}
51324+
51325+static int save_pset_sd(struct inode *inode, char **area) {
51326+ return save_plugin_sd(inode, area, 1 /* pset */);
51327+}
51328+
51329+static int present_hset_sd(struct inode *inode, char **area, int *len) {
51330+ return present_plugin_sd(inode, area, len, 0 /* hset */);
51331+}
51332+
51333+static int save_len_hset_sd(struct inode *inode) {
51334+ return save_len_plugin_sd(inode, 0 /* hset */);
51335+}
51336+
51337+static int save_hset_sd(struct inode *inode, char **area) {
51338+ return save_plugin_sd(inode, area, 0 /* hset */);
51339+}
51340+
51341+/* helper function for present_crypto_sd() and save_crypto_sd():
51342+ extract crypto info from stat-data and attach it to the inode */
51343+static int extract_crypto_info (struct inode * inode,
51344+ reiser4_crypto_stat * sd)
51345+{
51346+ struct reiser4_crypto_info * info;
51347+ assert("edward-11", !inode_crypto_info(inode));
51348+ assert("edward-1413",
51349+ !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
51350+ /* create and attach a crypto-stat without secret key loaded */
51351+ info = reiser4_alloc_crypto_info(inode);
51352+ if (IS_ERR(info))
51353+ return PTR_ERR(info);
51354+ info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
51355+ memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
51356+ reiser4_attach_crypto_info(inode, info);
51357+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
51358+ return 0;
51359+}
51360+
51361+/* crypto stat-data extension */
51362+
51363+static int present_crypto_sd(struct inode *inode, char **area, int *len)
51364+{
51365+ int result;
51366+ reiser4_crypto_stat *sd;
51367+ digest_plugin *dplug = inode_digest_plugin(inode);
51368+
51369+ assert("edward-06", dplug != NULL);
51370+ assert("edward-684", dplug->fipsize);
51371+ assert("edward-07", area != NULL);
51372+ assert("edward-08", *area != NULL);
51373+ assert("edward-09", len != NULL);
51374+ assert("edward-10", *len > 0);
51375+
51376+ if (*len < (int)sizeof(reiser4_crypto_stat)) {
51377+ return not_enough_space(inode, "crypto-sd");
51378+ }
51379+ /* *len is the number of bytes in the stat data item from *area to the
51380+ end of the item. It must be at least the size of this extension */
51381+ assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
51382+
51383+ sd = (reiser4_crypto_stat *) * area;
51384+ result = extract_crypto_info(inode, sd);
51385+ move_on(len, area, sizeof(*sd) + dplug->fipsize);
51386+
51387+ return result;
51388+}
51389+
51390+static int save_len_crypto_sd(struct inode *inode)
51391+{
51392+ return sizeof(reiser4_crypto_stat) +
51393+ inode_digest_plugin(inode)->fipsize;
51394+}
51395+
51396+static int save_crypto_sd(struct inode *inode, char **area)
51397+{
51398+ int result = 0;
51399+ reiser4_crypto_stat *sd;
51400+ struct reiser4_crypto_info * info = inode_crypto_info(inode);
51401+ digest_plugin *dplug = inode_digest_plugin(inode);
51402+
51403+ assert("edward-12", dplug != NULL);
51404+ assert("edward-13", area != NULL);
51405+ assert("edward-14", *area != NULL);
51406+ assert("edward-15", info != NULL);
51407+ assert("edward-1414", info->keyid != NULL);
51408+ assert("edward-1415", info->keysize != 0);
51409+ assert("edward-76", reiser4_inode_data(inode) != NULL);
51410+
51411+ if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
51412+ /* file is just created */
51413+ sd = (reiser4_crypto_stat *) *area;
51414+ /* copy everything but private key to the disk stat-data */
51415+ put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
51416+ memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
51417+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
51418+ }
51419+ *area += (sizeof(*sd) + dplug->fipsize);
51420+ return result;
51421+}
51422+
51423+static int eio(struct inode *inode, char **area, int *len)
51424+{
51425+ return RETERR(-EIO);
51426+}
51427+
51428+sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
51429+ [LIGHT_WEIGHT_STAT] = {
51430+ .h = {
51431+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51432+ .id = LIGHT_WEIGHT_STAT,
51433+ .pops = NULL,
51434+ .label = "light-weight sd",
51435+ .desc = "sd for light-weight files",
51436+ .linkage = {NULL,NULL}
51437+ },
51438+ .present = present_lw_sd,
51439+ .absent = NULL,
51440+ .save_len = save_len_lw_sd,
51441+ .save = save_lw_sd,
51442+ .alignment = 8
51443+ },
51444+ [UNIX_STAT] = {
51445+ .h = {
51446+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51447+ .id = UNIX_STAT,
51448+ .pops = NULL,
51449+ .label = "unix-sd",
51450+ .desc = "unix stat-data fields",
51451+ .linkage = {NULL,NULL}
51452+ },
51453+ .present = present_unix_sd,
51454+ .absent = absent_unix_sd,
51455+ .save_len = save_len_unix_sd,
51456+ .save = save_unix_sd,
51457+ .alignment = 8
51458+ },
51459+ [LARGE_TIMES_STAT] = {
51460+ .h = {
51461+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51462+ .id = LARGE_TIMES_STAT,
51463+ .pops = NULL,
51464+ .label = "64time-sd",
51465+ .desc = "nanosecond resolution for times",
51466+ .linkage = {NULL,NULL}
51467+ },
51468+ .present = present_large_times_sd,
51469+ .absent = NULL,
51470+ .save_len = save_len_large_times_sd,
51471+ .save = save_large_times_sd,
51472+ .alignment = 8
51473+ },
51474+ [SYMLINK_STAT] = {
51475+ /* stat data of symlink has this extension */
51476+ .h = {
51477+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51478+ .id = SYMLINK_STAT,
51479+ .pops = NULL,
51480+ .label = "symlink-sd",
51481+ .desc =
51482+ "stat data is appended with symlink name",
51483+ .linkage = {NULL,NULL}
51484+ },
51485+ .present = present_symlink_sd,
51486+ .absent = NULL,
51487+ .save_len = save_len_symlink_sd,
51488+ .save = save_symlink_sd,
51489+ .alignment = 8
51490+ },
51491+ [PLUGIN_STAT] = {
51492+ .h = {
51493+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51494+ .id = PLUGIN_STAT,
51495+ .pops = NULL,
51496+ .label = "plugin-sd",
51497+ .desc = "plugin stat-data fields",
51498+ .linkage = {NULL,NULL}
51499+ },
51500+ .present = present_pset_sd,
51501+ .absent = absent_plugin_sd,
51502+ .save_len = save_len_pset_sd,
51503+ .save = save_pset_sd,
51504+ .alignment = 8
51505+ },
51506+ [HEIR_STAT] = {
51507+ .h = {
51508+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51509+ .id = HEIR_STAT,
51510+ .pops = NULL,
51511+ .label = "heir-plugin-sd",
51512+ .desc = "heir plugin stat-data fields",
51513+ .linkage = {NULL,NULL}
51514+ },
51515+ .present = present_hset_sd,
51516+ .absent = NULL,
51517+ .save_len = save_len_hset_sd,
51518+ .save = save_hset_sd,
51519+ .alignment = 8
51520+ },
51521+ [FLAGS_STAT] = {
51522+ .h = {
51523+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51524+ .id = FLAGS_STAT,
51525+ .pops = NULL,
51526+ .label = "flags-sd",
51527+ .desc = "inode bit flags",
51528+ .linkage = {NULL, NULL}
51529+ },
51530+ .present = present_flags_sd,
51531+ .absent = NULL,
51532+ .save_len = save_len_flags_sd,
51533+ .save = save_flags_sd,
51534+ .alignment = 8
51535+ },
51536+ [CAPABILITIES_STAT] = {
51537+ .h = {
51538+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51539+ .id = CAPABILITIES_STAT,
51540+ .pops = NULL,
51541+ .label = "capabilities-sd",
51542+ .desc = "capabilities",
51543+ .linkage = {NULL, NULL}
51544+ },
51545+ .present = eio,
51546+ .absent = NULL,
51547+ .save_len = save_len_flags_sd,
51548+ .save = save_flags_sd,
51549+ .alignment = 8
51550+ },
51551+ [CRYPTO_STAT] = {
51552+ .h = {
51553+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51554+ .id = CRYPTO_STAT,
51555+ .pops = NULL,
51556+ .label = "crypto-sd",
51557+ .desc = "secret key size and id",
51558+ .linkage = {NULL, NULL}
51559+ },
51560+ .present = present_crypto_sd,
51561+ .absent = NULL,
51562+ .save_len = save_len_crypto_sd,
51563+ .save = save_crypto_sd,
51564+ .alignment = 8
51565+ }
51566+};
51567+
51568+/* Make Linus happy.
51569+ Local variables:
51570+ c-indentation-style: "K&R"
51571+ mode-name: "LC"
51572+ c-basic-offset: 8
51573+ tab-width: 8
51574+ fill-column: 120
51575+ End:
51576+*/
51577diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/static_stat.h linux-2.6.24/fs/reiser4/plugin/item/static_stat.h
51578--- linux-2.6.24.orig/fs/reiser4/plugin/item/static_stat.h 1970-01-01 03:00:00.000000000 +0300
51579+++ linux-2.6.24/fs/reiser4/plugin/item/static_stat.h 2008-01-25 11:39:07.028231388 +0300
51580@@ -0,0 +1,224 @@
51581+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51582+
51583+/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
51584+
51585+In the case where each file has at least the fields needed by the
51586+stat() syscall, it is more compact to store those fields in this
51587+statically laid-out struct.
51588+
51589+If this item does not exist, then all stats are dynamically resolved.
51590+At the moment, we either resolve all stats dynamically or all of them
51591+statically. If you think this is not fully optimal, and the rest of
51592+reiser4 is working, then fix it...:-)
51593+
51594+*/
51595+
51596+#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
51597+#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
51598+
51599+#include "../../forward.h"
51600+#include "../../dformat.h"
51601+
51602+#include <linux/fs.h> /* for struct inode */
51603+
51604+/* Stat data layout: goals and implementation.
51605+
51606+ We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
51607+ them, including having no semantic metadata attached to them at all.
51608+
51609+ There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
51610+ want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
51611+ sized structure because the statically sized structure knows without recording it what the names and lengths of the
51612+ attributes are.
51613+
51614+ This leads to a natural compromise, which is to special case those files which have simply the standard unix file
51615+ attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
51616+ file in their use of file attributes.
51617+
51618+ Yet this compromise deserves to be compromised a little.
51619+
51620+ We accommodate the case where you have no more than the standard unix file attributes by using an "extension
51621+ bitmask": each bit in it indicates presence or absence of or particular stat data extension (see sd_ext_bits enum).
51622+
51623+ If the first bit of the extension bitmask bit is 0, we have light-weight file whose attributes are either inherited
51624+ from parent directory (as uid, gid) or initialised to some sane values.
51625+
51626+ To capitalize on existing code infrastructure, extensions are
51627+ implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
51628+ Each stat-data extension plugin implements four methods:
51629+
51630+ ->present() called by sd_load() when this extension is found in stat-data
51631+ ->absent() called by sd_load() when this extension is not found in stat-data
51632+ ->save_len() called by sd_len() to calculate total length of stat-data
51633+ ->save() called by sd_save() to store extension data into stat-data
51634+
51635+ Implementation is in fs/reiser4/plugin/item/static_stat.c
51636+*/
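/*
 * Editor's note: a schematic sketch (not part of the original patch) of
 * the four-method contract listed above. The struct and names below are
 * illustrative stand-ins for the real sd_ext_plugin type declared
 * elsewhere in this patch; the inode parameter is reduced to void * just
 * to keep the sketch self-contained.
 */
struct sd_ext_ops_sketch {
	/* decode this extension from the stat-data body (advances *area) */
	int (*present)(void *inode, char **area, int *len);
	/* supply sane defaults when the extension bit is clear */
	int (*absent)(void *inode);
	/* number of bytes this extension will occupy on disk */
	int (*save_len)(void *inode);
	/* encode the extension into the stat-data body (advances *area) */
	int (*save)(void *inode, char **area);
};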
51637+
51638+/* stat-data extension. Please order this by presumed frequency of use */
51639+typedef enum {
51640+ /* support for light-weight files */
51641+ LIGHT_WEIGHT_STAT,
51642+ /* data required to implement unix stat(2) call. Layout is in
51643+ reiser4_unix_stat. If this is not present, file is light-weight */
51644+ UNIX_STAT,
51645+ /* this contains an additional set of 32bit [amc]time fields to implement
51646+ nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
51647+ of this extension is governed by the 32bittimes mount option. */
51648+ LARGE_TIMES_STAT,
51649+ /* stat data has link name included */
51650+ SYMLINK_STAT,
51651+ /* on-disk slots of non-standard plugins for main plugin table
51652+ (@reiser4_inode->pset), that is, plugins that cannot be deduced
51653+ from file mode bits), for example, aggregation, interpolation etc. */
51654+ PLUGIN_STAT,
51655+ /* this extension contains persistent inode flags. These flags are
51656+ single bits: immutable, append-only, etc. Layout is in
51657+ reiser4_flags_stat. */
51658+ FLAGS_STAT,
51659+ /* this extension contains capabilities sets, associated with this
51660+ file. Layout is in reiser4_capabilities_stat */
51661+ CAPABILITIES_STAT,
51662+ /* this extension contains size and public id of the secret key.
51663+ Layout is in reiser4_crypto_stat */
51664+ CRYPTO_STAT,
51665+ /* on-disk slots of non-default plugins for inheritance, which
51666+ are extracted to special plugin table (@reiser4_inode->hset).
51667+ By default, children of the object will inherit plugins from
51668+ its main plugin table (pset). */
51669+ HEIR_STAT,
51670+ LAST_SD_EXTENSION,
51671+ /*
51672+ * init_inode_static_sd() iterates over extension mask until all
51673+ * non-zero bits are processed. This means that neither the ->present()
51674+ * nor the ->absent() method will be called for stat-data extensions that
51675+ * go after the last present extension. But for some basic extensions we
51676+ * want either the ->absent() or the ->present() method to be called, because these
51677+ * extensions set up something in inode even when they are not
51678+ * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
51679+ * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
51680+ * ->present(), or ->absent() method will be called, independently of
51681+ * what other extensions are present.
51682+ */
51683+ LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT
51684+} sd_ext_bits;
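/*
 * Editor's note: an illustrative predicate (not part of the original
 * patch) capturing the rule described above. It mirrors the loop
 * condition in init_inode_static_sd(): extensions up to and including
 * LAST_IMPORTANT_SD_EXTENSION are always visited, later ones only while
 * set bits remain in the mask. The function name is hypothetical.
 */
static inline int sd_ext_must_visit(int bit, __u64 remaining_mask)
{
	return remaining_mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
}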
51685+
51686+/* minimal stat-data. This is what makes it possible to support light-weight files. */
51687+typedef struct reiser4_stat_data_base {
51688+ /* 0 */ __le16 extmask;
51689+ /* 2 */
51690+} PACKED reiser4_stat_data_base;
51691+
51692+typedef struct reiser4_light_weight_stat {
51693+ /* 0 */ __le16 mode;
51694+ /* 2 */ __le32 nlink;
51695+ /* 6 */ __le64 size;
51696+ /* size in bytes */
51697+ /* 14 */
51698+} PACKED reiser4_light_weight_stat;
51699+
51700+typedef struct reiser4_unix_stat {
51701+ /* owner id */
51702+ /* 0 */ __le32 uid;
51703+ /* group id */
51704+ /* 4 */ __le32 gid;
51705+ /* access time */
51706+ /* 8 */ __le32 atime;
51707+ /* modification time */
51708+ /* 12 */ __le32 mtime;
51709+ /* change time */
51710+ /* 16 */ __le32 ctime;
51711+ union {
51712+ /* minor:major for device files */
51713+ /* 20 */ __le64 rdev;
51714+ /* bytes used by file */
51715+ /* 20 */ __le64 bytes;
51716+ } u;
51717+ /* 28 */
51718+} PACKED reiser4_unix_stat;
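/*
 * Editor's note: an illustrative compile-time check (not part of the
 * original patch) of the byte offsets annotated above, assuming PACKED
 * removes all padding. BUILD_BUG_ON is the standard kernel helper from
 * <linux/kernel.h>, and offsetof comes from <linux/stddef.h>; the
 * function name is hypothetical.
 */
static inline void reiser4_unix_stat_layout_check(void)
{
	BUILD_BUG_ON(offsetof(reiser4_unix_stat, gid) != 4);
	BUILD_BUG_ON(offsetof(reiser4_unix_stat, atime) != 8);
	BUILD_BUG_ON(offsetof(reiser4_unix_stat, ctime) != 16);
	BUILD_BUG_ON(offsetof(reiser4_unix_stat, u) != 20);
	BUILD_BUG_ON(sizeof(reiser4_unix_stat) != 28);
}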
51719+
51720+/* symlink stored as part of inode */
51721+typedef struct reiser4_symlink_stat {
51722+ char body[0];
51723+} PACKED reiser4_symlink_stat;
51724+
51725+typedef struct reiser4_plugin_slot {
51726+ /* 0 */ __le16 pset_memb;
51727+ /* 2 */ __le16 id;
51728+ /* 4 *//* here plugin stores its persistent state */
51729+} PACKED reiser4_plugin_slot;
51730+
51731+/* stat-data extension for files with non-standard plugin. */
51732+typedef struct reiser4_plugin_stat {
51733+ /* number of additional plugins, associated with this object */
51734+ /* 0 */ __le16 plugins_no;
51735+ /* 2 */ reiser4_plugin_slot slot[0];
51736+ /* 2 */
51737+} PACKED reiser4_plugin_stat;
51738+
51739+/* stat-data extension for inode flags. Currently it is just fixed-width 32
51740+ * bit mask. If need arise, this can be replaced with variable width
51741+ * bitmask. */
51742+typedef struct reiser4_flags_stat {
51743+ /* 0 */ __le32 flags;
51744+ /* 4 */
51745+} PACKED reiser4_flags_stat;
51746+
51747+typedef struct reiser4_capabilities_stat {
51748+ /* 0 */ __le32 effective;
51749+ /* 4 */ __le32 permitted;
51750+ /* 8 */
51751+} PACKED reiser4_capabilities_stat;
51752+
51753+typedef struct reiser4_cluster_stat {
51754+/* this defines the cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster_shift */
51755+ /* 0 */ d8 cluster_shift;
51756+ /* 1 */
51757+} PACKED reiser4_cluster_stat;
51758+
51759+typedef struct reiser4_crypto_stat {
51760+ /* secret key size, bits */
51761+ /* 0 */ d16 keysize;
51762+ /* secret key id */
51763+ /* 2 */ d8 keyid[0];
51764+ /* 2 */
51765+} PACKED reiser4_crypto_stat;
51766+
51767+typedef struct reiser4_large_times_stat {
51768+ /* access time */
51769+ /* 0 */ d32 atime;
51770+ /* modification time */
51771+ /* 4 */ d32 mtime;
51772+ /* change time */
51773+ /* 8 */ d32 ctime;
51774+ /* 12 */
51775+} PACKED reiser4_large_times_stat;
51776+
51777+/* this structure is filled by item_stat_static_sd() */
51778+typedef struct sd_stat {
51779+ int dirs;
51780+ int files;
51781+ int others;
51782+} sd_stat;
51783+
51784+/* plugin->item.common.* */
51785+extern void print_sd(const char *prefix, coord_t * coord);
51786+extern void item_stat_static_sd(const coord_t * coord, void *vp);
51787+
51788+/* plugin->item.s.sd.* */
51789+extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
51790+extern int save_len_static_sd(struct inode *inode);
51791+extern int save_static_sd(struct inode *inode, char **area);
51792+
51793+/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
51794+#endif
51795+
51796+/* Make Linus happy.
51797+ Local variables:
51798+ c-indentation-style: "K&R"
51799+ mode-name: "LC"
51800+ c-basic-offset: 8
51801+ tab-width: 8
51802+ fill-column: 120
51803+ End:
51804+*/
51805diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/tail.c linux-2.6.24/fs/reiser4/plugin/item/tail.c
51806--- linux-2.6.24.orig/fs/reiser4/plugin/item/tail.c 1970-01-01 03:00:00.000000000 +0300
51807+++ linux-2.6.24/fs/reiser4/plugin/item/tail.c 2008-01-25 11:40:16.698169785 +0300
51808@@ -0,0 +1,808 @@
51809+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51810+
51811+#include "item.h"
51812+#include "../../inode.h"
51813+#include "../../page_cache.h"
51814+#include "../../carry.h"
51815+#include "../../vfs_ops.h"
51816+
51817+#include <linux/quotaops.h>
51818+#include <asm/uaccess.h>
51819+#include <linux/swap.h>
51820+#include <linux/writeback.h>
51821+
51822+/* plugin->u.item.b.max_key_inside */
51823+reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
51824+{
51825+ item_key_by_coord(coord, key);
51826+ set_key_offset(key, get_key_offset(reiser4_max_key()));
51827+ return key;
51828+}
51829+
51830+/* plugin->u.item.b.can_contain_key */
51831+int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
51832+ const reiser4_item_data *data)
51833+{
51834+ reiser4_key item_key;
51835+
51836+ if (item_plugin_by_coord(coord) != data->iplug)
51837+ return 0;
51838+
51839+ item_key_by_coord(coord, &item_key);
51840+ if (get_key_locality(key) != get_key_locality(&item_key) ||
51841+ get_key_objectid(key) != get_key_objectid(&item_key))
51842+ return 0;
51843+
51844+ return 1;
51845+}
51846+
51847+/* plugin->u.item.b.mergeable
51848+ first item is of tail type */
51849+/* Audited by: green(2002.06.14) */
51850+int mergeable_tail(const coord_t *p1, const coord_t *p2)
51851+{
51852+ reiser4_key key1, key2;
51853+
51854+ assert("vs-535", plugin_of_group(item_plugin_by_coord(p1),
51855+ UNIX_FILE_METADATA_ITEM_TYPE));
51856+ assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
51857+
51858+ if (item_id_by_coord(p2) != FORMATTING_ID) {
51859+ /* second item is of another type */
51860+ return 0;
51861+ }
51862+
51863+ item_key_by_coord(p1, &key1);
51864+ item_key_by_coord(p2, &key2);
51865+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
51866+ get_key_objectid(&key1) != get_key_objectid(&key2)
51867+ || get_key_type(&key1) != get_key_type(&key2)) {
51868+ /* items of different objects */
51869+ return 0;
51870+ }
51871+ if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
51872+ /* not adjacent items */
51873+ return 0;
51874+ }
51875+ return 1;
51876+}
51877+
51878+/* plugin->u.item.b.print
51879+ plugin->u.item.b.check */
51880+
51881+/* plugin->u.item.b.nr_units */
51882+pos_in_node_t nr_units_tail(const coord_t * coord)
51883+{
51884+ return item_length_by_coord(coord);
51885+}
51886+
51887+/* plugin->u.item.b.lookup */
51888+lookup_result
51889+lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
51890+{
51891+ reiser4_key item_key;
51892+ __u64 lookuped, offset;
51893+ unsigned nr_units;
51894+
51895+ item_key_by_coord(coord, &item_key);
51896+ offset = get_key_offset(&item_key);
51897+ nr_units = nr_units_tail(coord);
51898+
51899+ /* key we are looking for must be greater than key of item @coord */
51900+ assert("vs-416", keygt(key, &item_key));
51901+
51902+ /* offset we are looking for */
51903+ lookuped = get_key_offset(key);
51904+
51905+ if (lookuped >= offset && lookuped < offset + nr_units) {
51906+ /* byte we are looking for is in this item */
51907+ coord->unit_pos = lookuped - offset;
51908+ coord->between = AT_UNIT;
51909+ return CBK_COORD_FOUND;
51910+ }
51911+
51912+ /* set coord after last unit */
51913+ coord->unit_pos = nr_units - 1;
51914+ coord->between = AFTER_UNIT;
51915+ return bias ==
51916+ FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
51917+}
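/*
 * Editor's note: a worked example (not part of the original patch) of the
 * lookup above. For a tail item whose key offset is 100 and which holds
 * 50 units (bytes 100..149):
 *   - looking up offset 120 sets unit_pos = 20, between = AT_UNIT, and
 *     returns CBK_COORD_FOUND;
 *   - looking up offset 150 falls past the item: the coord is left
 *     AFTER_UNIT at unit 49, and the result is CBK_COORD_FOUND under the
 *     FIND_MAX_NOT_MORE_THAN bias, CBK_COORD_NOTFOUND otherwise.
 */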
51918+
51919+/* plugin->u.item.b.paste */
51920+int
51921+paste_tail(coord_t *coord, reiser4_item_data *data,
51922+ carry_plugin_info *info UNUSED_ARG)
51923+{
51924+ unsigned old_item_length;
51925+ char *item;
51926+
51927+ /* length the item had before the resize was performed */
51928+ old_item_length = item_length_by_coord(coord) - data->length;
51929+
51930+ /* tail items never get pasted in the middle */
51931+ assert("vs-363",
51932+ (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
51933+ (coord->unit_pos == old_item_length - 1 &&
51934+ coord->between == AFTER_UNIT) ||
51935+ (coord->unit_pos == 0 && old_item_length == 0
51936+ && coord->between == AT_UNIT));
51937+
51938+ item = item_body_by_coord(coord);
51939+ if (coord->unit_pos == 0)
51940+ /* make space for pasted data when pasting at the beginning of
51941+ the item */
51942+ memmove(item + data->length, item, old_item_length);
51943+
51944+ if (coord->between == AFTER_UNIT)
51945+ coord->unit_pos++;
51946+
51947+ if (data->data) {
51948+ assert("vs-554", data->user == 0 || data->user == 1);
51949+ if (data->user) {
51950+ assert("nikita-3035", reiser4_schedulable());
51951+ /* copy from user space */
51952+ if (__copy_from_user(item + coord->unit_pos,
51953+ (const char __user *)data->data,
51954+ (unsigned)data->length))
51955+ return RETERR(-EFAULT);
51956+ } else
51957+ /* copy from kernel space */
51958+ memcpy(item + coord->unit_pos, data->data,
51959+ (unsigned)data->length);
51960+ } else {
51961+ memset(item + coord->unit_pos, 0, (unsigned)data->length);
51962+ }
51963+ return 0;
51964+}
51965+
51966+/* plugin->u.item.b.fast_paste */
51967+
51968+/* plugin->u.item.b.can_shift
51969+ number of units is returned via return value, number of bytes via @size. For
51970+ tail items they coincide */
51971+int
51972+can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
51973+ znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
51974+ unsigned *size, unsigned want)
51975+{
51976+	/* make sure that we do not try to shift more than we have */
51977+ assert("vs-364", want > 0
51978+ && want <= (unsigned)item_length_by_coord(source));
51979+
51980+ *size = min(want, free_space);
51981+ return *size;
51982+}
51983+
51984+/* plugin->u.item.b.copy_units */
51985+void
51986+copy_units_tail(coord_t * target, coord_t * source,
51987+ unsigned from, unsigned count,
51988+ shift_direction where_is_free_space,
51989+ unsigned free_space UNUSED_ARG)
51990+{
51991+ /* make sure that item @target is expanded already */
51992+ assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
51993+ assert("vs-370", free_space >= count);
51994+
51995+ if (where_is_free_space == SHIFT_LEFT) {
51996+ /* append item @target with @count first bytes of @source */
51997+ assert("vs-365", from == 0);
51998+
51999+ memcpy((char *)item_body_by_coord(target) +
52000+ item_length_by_coord(target) - count,
52001+ (char *)item_body_by_coord(source), count);
52002+ } else {
52003+ /* target item is moved to right already */
52004+ reiser4_key key;
52005+
52006+ assert("vs-367",
52007+ (unsigned)item_length_by_coord(source) == from + count);
52008+
52009+ memcpy((char *)item_body_by_coord(target),
52010+ (char *)item_body_by_coord(source) + from, count);
52011+
52012+ /* new units are inserted before first unit in an item,
52013+ therefore, we have to update item key */
52014+ item_key_by_coord(source, &key);
52015+ set_key_offset(&key, get_key_offset(&key) + from);
52016+
52017+ node_plugin_by_node(target->node)->update_item_key(target, &key,
52018+ NULL /*info */);
52019+ }
52020+}
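+/* Editor's note (illustrative, not part of the original patch): when
+ * shifting left, the first @count bytes of @source are appended to
+ * @target, so no key changes. When shifting right, the last @count
+ * bytes of @source become the new head of @target, so @target's item
+ * key is set to @source's key offset plus @from -- the file offset of
+ * the first byte that moved. E.g. with source key offset 100 and
+ * from = 6, count = 4, the target's key offset becomes 106.
+ */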
52021+
52022+/* plugin->u.item.b.create_hook */
52023+
52024+/* item_plugin->b.kill_hook
52025+ this is called when @count units starting from @from-th one are going to be removed
52026+ */
52027+int
52028+kill_hook_tail(const coord_t * coord, pos_in_node_t from,
52029+ pos_in_node_t count, struct carry_kill_data *kdata)
52030+{
52031+ reiser4_key key;
52032+ loff_t start, end;
52033+
52034+ assert("vs-1577", kdata);
52035+ assert("vs-1579", kdata->inode);
52036+
52037+ item_key_by_coord(coord, &key);
52038+ start = get_key_offset(&key) + from;
52039+ end = start + count;
52040+ fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
52041+ return 0;
52042+}
52043+
52044+/* plugin->u.item.b.shift_hook */
52045+
52046+/* helper for kill_units_tail and cut_units_tail */
52047+static int
52048+do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52049+ reiser4_key * smallest_removed, reiser4_key * new_first)
52050+{
52051+ pos_in_node_t count;
52052+
52053+ /* this method is only called to remove part of item */
52054+ assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
52055+	/* tail items are never cut from the middle of an item */
52056+ assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
52057+ assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
52058+
52059+ count = to - from + 1;
52060+
52061+ if (smallest_removed) {
52062+ /* store smallest key removed */
52063+ item_key_by_coord(coord, smallest_removed);
52064+ set_key_offset(smallest_removed,
52065+ get_key_offset(smallest_removed) + from);
52066+ }
52067+ if (new_first) {
52068+ /* head of item is cut */
52069+ assert("vs-1529", from == 0);
52070+
52071+ item_key_by_coord(coord, new_first);
52072+ set_key_offset(new_first,
52073+ get_key_offset(new_first) + from + count);
52074+ }
52075+
52076+ if (REISER4_DEBUG)
52077+ memset((char *)item_body_by_coord(coord) + from, 0, count);
52078+ return count;
52079+}
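+/* Editor's note (illustrative, not part of the original patch): for a
+ * tail item covering file bytes [off, off + len), cutting units
+ * [from, to] yields
+ *
+ *	smallest_removed offset = off + from
+ *	new_first offset        = off + to + 1	(head cut only, from == 0)
+ *
+ * E.g. off = 100, len = 10: cutting [0, 3] gives smallest_removed 100
+ * and new_first 104; cutting [6, 9] gives smallest_removed 106 and no
+ * new_first, since the item keeps its first byte.
+ */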
52080+
52081+/* plugin->u.item.b.cut_units */
52082+int
52083+cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52084+ struct carry_cut_data *cdata UNUSED_ARG,
52085+ reiser4_key * smallest_removed, reiser4_key * new_first)
52086+{
52087+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
52088+}
52089+
52090+/* plugin->u.item.b.kill_units */
52091+int
52092+kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52093+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
52094+ reiser4_key * new_first)
52095+{
52096+ kill_hook_tail(coord, from, to - from + 1, kdata);
52097+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
52098+}
52099+
52100+/* plugin->u.item.b.unit_key */
52101+reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
52102+{
52103+ assert("vs-375", coord_is_existing_unit(coord));
52104+
52105+ item_key_by_coord(coord, key);
52106+ set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
52107+
52108+ return key;
52109+}
52110+
52111+/* plugin->u.item.b.estimate
52112+ plugin->u.item.b.item_data_by_flow */
52113+
52114+/* tail readpage function. It is called from readpage_tail(). */
52115+static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
52116+{
52117+ tap_t tap;
52118+ int result;
52119+ coord_t coord;
52120+ lock_handle lh;
52121+ int count, mapped;
52122+ struct inode *inode;
52123+ char *pagedata;
52124+
52125+	/* save the passed coord so that the tap does not move it. */
52126+ init_lh(&lh);
52127+ copy_lh(&lh, uf_coord->lh);
52128+ inode = page->mapping->host;
52129+ coord_dup(&coord, &uf_coord->coord);
52130+
52131+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
52132+
52133+ if ((result = reiser4_tap_load(&tap)))
52134+ goto out_tap_done;
52135+
52136+ /* lookup until page is filled up. */
52137+ for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
52138+ /* number of bytes to be copied to page */
52139+ count = item_length_by_coord(&coord) - coord.unit_pos;
52140+ if (count > PAGE_CACHE_SIZE - mapped)
52141+ count = PAGE_CACHE_SIZE - mapped;
52142+
52143+		/* map @page into kernel address space and get data address */
52144+ pagedata = kmap_atomic(page, KM_USER0);
52145+
52146+ /* copy tail item to page */
52147+ memcpy(pagedata + mapped,
52148+ ((char *)item_body_by_coord(&coord) + coord.unit_pos),
52149+ count);
52150+ mapped += count;
52151+
52152+ flush_dcache_page(page);
52153+
52154+		/* unmap page from kernel address space */
52155+ kunmap_atomic(pagedata, KM_USER0);
52156+
52157+ /* Getting next tail item. */
52158+ if (mapped < PAGE_CACHE_SIZE) {
52159+ /*
52160+			 * unlock the page to avoid keeping it locked
52161+			 * during tree lookup, which takes long-term locks
52162+ */
52163+ unlock_page(page);
52164+
52165+ /* getting right neighbour. */
52166+ result = go_dir_el(&tap, RIGHT_SIDE, 0);
52167+
52168+ /* lock page back */
52169+ lock_page(page);
52170+ if (PageUptodate(page)) {
52171+ /*
52172+ * another thread read the page, we have
52173+ * nothing to do
52174+ */
52175+ result = 0;
52176+ goto out_unlock_page;
52177+ }
52178+
52179+ if (result) {
52180+ if (result == -E_NO_NEIGHBOR) {
52181+ /*
52182+					 * right neighbor is not a formatted
52183+ * node
52184+ */
52185+ result = 0;
52186+ goto done;
52187+ } else {
52188+ goto out_tap_relse;
52189+ }
52190+ } else {
52191+ if (!inode_file_plugin(inode)->
52192+ owns_item(inode, &coord)) {
52193+ /* item of another file is found */
52194+ result = 0;
52195+ goto done;
52196+ }
52197+ }
52198+ }
52199+ }
52200+
52201+ done:
52202+ if (mapped != PAGE_CACHE_SIZE)
52203+ zero_user_page(page, mapped, PAGE_CACHE_SIZE - mapped,
52204+ KM_USER0);
52205+ SetPageUptodate(page);
52206+ out_unlock_page:
52207+ unlock_page(page);
52208+ out_tap_relse:
52209+ reiser4_tap_relse(&tap);
52210+ out_tap_done:
52211+ reiser4_tap_done(&tap);
52212+ return result;
52213+}
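+/* Editor's note (illustrative, not part of the original patch): the
+ * unlock/relock sequence above follows the usual pattern for mixing a
+ * page lock with long-term tree locks:
+ *
+ *	unlock_page(page);	 drop the short-term lock first
+ *	...tree lookup...	 may sleep while holding tree locks
+ *	lock_page(page);
+ *	if (PageUptodate(page))	 someone else may have filled the
+ *		bail out early;	 page in the meantime
+ */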
52214+
52215+/*
52216+ plugin->s.file.readpage
52217+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
52218+ or
52219+ filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_tail
52220+
52221+ At the beginning: coord->node is read locked, zloaded, the page is locked, and coord is set to an existing unit
52222+ inside a tail item. */
52223+int readpage_tail(void *vp, struct page *page)
52224+{
52225+ uf_coord_t *uf_coord = vp;
52226+ ON_DEBUG(coord_t * coord = &uf_coord->coord);
52227+ ON_DEBUG(reiser4_key key);
52228+
52229+ assert("umka-2515", PageLocked(page));
52230+ assert("umka-2516", !PageUptodate(page));
52231+ assert("umka-2517", !jprivate(page) && !PagePrivate(page));
52232+ assert("umka-2518", page->mapping && page->mapping->host);
52233+
52234+ assert("umka-2519", znode_is_loaded(coord->node));
52235+ assert("umka-2520", item_is_tail(coord));
52236+ assert("umka-2521", coord_is_existing_unit(coord));
52237+ assert("umka-2522", znode_is_rlocked(coord->node));
52238+ assert("umka-2523",
52239+ page->mapping->host->i_ino ==
52240+ get_key_objectid(item_key_by_coord(coord, &key)));
52241+
52242+ return do_readpage_tail(uf_coord, page);
52243+}
52244+
52245+/**
52246+ * overwrite_tail
52247+ * @flow:
52248+ * @coord:
52249+ *
52250+ * Overwrites a tail item or part of it with user data. Returns the number
52251+ * of bytes written or an error code.
52252+ */
52253+static int overwrite_tail(flow_t *flow, coord_t *coord)
52254+{
52255+ unsigned count;
52256+
52257+ assert("vs-570", flow->user == 1);
52258+ assert("vs-946", flow->data);
52259+ assert("vs-947", coord_is_existing_unit(coord));
52260+ assert("vs-948", znode_is_write_locked(coord->node));
52261+ assert("nikita-3036", reiser4_schedulable());
52262+
52263+ count = item_length_by_coord(coord) - coord->unit_pos;
52264+ if (count > flow->length)
52265+ count = flow->length;
52266+
52267+ if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
52268+ (const char __user *)flow->data, count))
52269+ return RETERR(-EFAULT);
52270+
52271+ znode_make_dirty(coord->node);
52272+ return count;
52273+}
52274+
52275+/**
52276+ * insert_first_tail
52277+ * @inode:
52278+ * @flow:
52279+ * @coord:
52280+ * @lh:
52281+ *
52282+ * Returns number of bytes written or error code.
52283+ */
52284+static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
52285+ coord_t *coord, lock_handle *lh)
52286+{
52287+ int result;
52288+ loff_t to_write;
52289+ struct unix_file_info *uf_info;
52290+
52291+ if (get_key_offset(&flow->key) != 0) {
52292+ /*
52293+		 * the file is empty and the write does not start at the
52294+		 * beginning of the file, so create a hole at its start. On
52295+		 * success insert_flow returns 0 as the number of written
52296+		 * bytes, which is what we must return when padding a file with holes
52297+ */
52298+ flow->data = NULL;
52299+ flow->length = get_key_offset(&flow->key);
52300+ set_key_offset(&flow->key, 0);
52301+ /*
52302+		 * holes in files built of tails are stored as if they were
52303+		 * real data consisting entirely of zeros. Therefore we have
52304+		 * to allocate quota here as well
52305+ */
52306+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52307+ return RETERR(-EDQUOT);
52308+ result = reiser4_insert_flow(coord, lh, flow);
52309+ if (flow->length)
52310+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52311+
52312+ uf_info = unix_file_inode_data(inode);
52313+
52314+ /*
52315+ * first item insertion is only possible when writing to empty
52316+ * file or performing tail conversion
52317+ */
52318+ assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
52319+ (reiser4_inode_get_flag(inode,
52320+ REISER4_PART_MIXED) &&
52321+ reiser4_inode_get_flag(inode,
52322+ REISER4_PART_IN_CONV))));
52323+ /* if file was empty - update its state */
52324+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
52325+ uf_info->container = UF_CONTAINER_TAILS;
52326+ return result;
52327+ }
52328+
52329+ /* check quota before appending data */
52330+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52331+ return RETERR(-EDQUOT);
52332+
52333+ to_write = flow->length;
52334+ result = reiser4_insert_flow(coord, lh, flow);
52335+ if (flow->length)
52336+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52337+ return (to_write - flow->length) ? (to_write - flow->length) : result;
52338+}
52339+
52340+/**
52341+ * append_tail
52342+ * @inode:
52343+ * @flow:
52344+ * @coord:
52345+ * @lh:
52346+ *
52347+ * Returns number of bytes written or error code.
52348+ */
52349+static ssize_t append_tail(struct inode *inode,
52350+ flow_t *flow, coord_t *coord, lock_handle *lh)
52351+{
52352+ int result;
52353+ reiser4_key append_key;
52354+ loff_t to_write;
52355+
52356+ if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
52357+ flow->data = NULL;
52358+ flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
52359+ set_key_offset(&flow->key, get_key_offset(&append_key));
52360+ /*
52361+		 * holes in files built of tails are stored as if they were
52362+		 * real data consisting entirely of zeros. Therefore we have
52363+		 * to allocate quota here as well
52364+ */
52365+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52366+ return RETERR(-EDQUOT);
52367+ result = reiser4_insert_flow(coord, lh, flow);
52368+ if (flow->length)
52369+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52370+ return result;
52371+ }
52372+
52373+ /* check quota before appending data */
52374+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52375+ return RETERR(-EDQUOT);
52376+
52377+ to_write = flow->length;
52378+ result = reiser4_insert_flow(coord, lh, flow);
52379+ if (flow->length)
52380+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52381+ return (to_write - flow->length) ? (to_write - flow->length) : result;
52382+}
52383+
52384+/**
52385+ * write_tail_reserve_space - reserve space for tail write operation
52386+ * @inode:
52387+ *
52388+ * Estimates and reserves space which may be required for writing one flow to a
52389+ * file
52390+ */
52391+static int write_tail_reserve_space(struct inode *inode)
52392+{
52393+ __u64 count;
52394+ reiser4_tree *tree;
52395+
52396+ /*
52397+	 * to write one flow to a file built of tails we have to reserve disk space for:
52398+	 *
52399+ * 1. find_file_item may have to insert empty node to the tree (empty
52400+ * leaf node between two extent items). This requires 1 block and
52401+ * number of blocks which are necessary to perform insertion of an
52402+ * internal item into twig level.
52403+ *
52404+ * 2. flow insertion
52405+ *
52406+ * 3. stat data update
52407+ */
52408+ tree = reiser4_tree_by_inode(inode);
52409+ count = estimate_one_insert_item(tree) +
52410+ estimate_insert_flow(tree->height) +
52411+ estimate_one_insert_item(tree);
52412+ grab_space_enable();
52413+ return reiser4_grab_space(count, 0 /* flags */);
52414+}
52415+
52416+#define PAGE_PER_FLOW 4
52417+
52418+static loff_t faultin_user_pages(const char __user *buf, size_t count)
52419+{
52420+ loff_t faulted;
52421+ int to_fault;
52422+
52423+ if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
52424+ count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
52425+ faulted = 0;
52426+ while (count > 0) {
52427+ to_fault = PAGE_CACHE_SIZE;
52428+ if (count < to_fault)
52429+ to_fault = count;
52430+ fault_in_pages_readable(buf + faulted, to_fault);
52431+ count -= to_fault;
52432+ faulted += to_fault;
52433+ }
52434+ return faulted;
52435+}
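+/* Editor's note (illustrative, not part of the original patch):
+ * pre-faulting caps one flow at PAGE_PER_FLOW pages and touches each
+ * user page up front, so the later __copy_from_user() under a
+ * long-term znode lock is unlikely to hit a major page fault. E.g.
+ * count = 5 * PAGE_CACHE_SIZE yields faulted = 4 * PAGE_CACHE_SIZE;
+ * the remaining page is handled by a subsequent write iteration.
+ */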
52436+
52437+/**
52438+ * reiser4_write_tail - write method of tail item plugin
52439+ * @file: file to write to
52440+ * @buf: address of user-space buffer
52441+ * @count: number of bytes to write
52442+ * @pos: position in file to write to
52443+ *
52444+ * Returns number of written bytes or error code.
52445+ */
52446+ssize_t reiser4_write_tail(struct file *file, struct inode * inode,
52447+ const char __user *buf, size_t count, loff_t *pos)
52448+{
52449+ struct hint hint;
52450+ int result;
52451+ flow_t flow;
52452+ coord_t *coord;
52453+ lock_handle *lh;
52454+ znode *loaded;
52455+
52456+ assert("edward-1548", inode != NULL);
52457+
52458+	if (write_tail_reserve_space(inode))
52459+ return RETERR(-ENOSPC);
52460+
52461+ result = load_file_hint(file, &hint);
52462+ BUG_ON(result != 0);
52463+
52464+ flow.length = faultin_user_pages(buf, count);
52465+ flow.user = 1;
52466+ memcpy(&flow.data, &buf, sizeof(buf));
52467+ flow.op = WRITE_OP;
52468+ key_by_inode_and_offset_common(inode, *pos, &flow.key);
52469+
52470+ result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
52471+ if (IS_CBKERR(result))
52472+ return result;
52473+
52474+ coord = &hint.ext_coord.coord;
52475+ lh = hint.ext_coord.lh;
52476+
52477+ result = zload(coord->node);
52478+ BUG_ON(result != 0);
52479+ loaded = coord->node;
52480+
52481+ if (coord->between == AFTER_UNIT) {
52482+ /* append with data or hole */
52483+ result = append_tail(inode, &flow, coord, lh);
52484+ } else if (coord->between == AT_UNIT) {
52485+ /* overwrite */
52486+ result = overwrite_tail(&flow, coord);
52487+ } else {
52488+ /* no items of this file yet. insert data or hole */
52489+ result = insert_first_tail(inode, &flow, coord, lh);
52490+ }
52491+ zrelse(loaded);
52492+ if (result < 0) {
52493+ done_lh(lh);
52494+ return result;
52495+ }
52496+
52497+	/* seal and unlock znode (ext_coord.valid is cleared just below, so the hint is always unset) */
52498+ hint.ext_coord.valid = 0;
52499+ if (hint.ext_coord.valid)
52500+ reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK);
52501+ else
52502+ reiser4_unset_hint(&hint);
52503+
52504+ save_file_hint(file, &hint);
52505+ return result;
52506+}
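+/* Editor's note (illustrative summary, not part of the original
+ * patch): after the tree lookup, reiser4_write_tail() dispatches on
+ * coord->between:
+ *
+ *	AFTER_UNIT -> append_tail()	  extend the file, maybe pad a hole
+ *	AT_UNIT    -> overwrite_tail()	  rewrite existing bytes in place
+ *	otherwise  -> insert_first_tail() the file has no items yet
+ */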
52507+
52508+#if REISER4_DEBUG
52509+
52510+static int
52511+coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
52512+{
52513+ reiser4_key item_key;
52514+
52515+ assert("vs-1356", coord_is_existing_unit(coord));
52516+ assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
52517+ assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
52518+ return get_key_offset(key) ==
52519+ get_key_offset(&item_key) + coord->unit_pos;
52520+
52521+}
52522+
52523+#endif
52524+
52525+/* plugin->u.item.s.file.read */
52526+int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
52527+{
52528+ unsigned count;
52529+ int item_length;
52530+ coord_t *coord;
52531+ uf_coord_t *uf_coord;
52532+
52533+ uf_coord = &hint->ext_coord;
52534+ coord = &uf_coord->coord;
52535+
52536+ assert("vs-571", f->user == 1);
52537+ assert("vs-571", f->data);
52538+ assert("vs-967", coord && coord->node);
52539+ assert("vs-1117", znode_is_rlocked(coord->node));
52540+ assert("vs-1118", znode_is_loaded(coord->node));
52541+
52542+ assert("nikita-3037", reiser4_schedulable());
52543+ assert("vs-1357", coord_matches_key_tail(coord, &f->key));
52544+
52545+ /* calculate number of bytes to read off the item */
52546+ item_length = item_length_by_coord(coord);
52547+	count = item_length - coord->unit_pos;
52548+ if (count > f->length)
52549+ count = f->length;
52550+
52551+	/* the user page has to be brought in so that a major page fault does
52552+	 * not occur here while the long-term lock is held */
52553+ if (__copy_to_user((char __user *)f->data,
52554+ ((char *)item_body_by_coord(coord) + coord->unit_pos),
52555+ count))
52556+ return RETERR(-EFAULT);
52557+
52558+ /* probably mark_page_accessed() should only be called if
52559+ * coord->unit_pos is zero. */
52560+ mark_page_accessed(znode_page(coord->node));
52561+ move_flow_forward(f, count);
52562+
52563+ coord->unit_pos += count;
52564+ if (item_length == coord->unit_pos) {
52565+ coord->unit_pos--;
52566+ coord->between = AFTER_UNIT;
52567+ }
52568+ reiser4_set_hint(hint, &f->key, ZNODE_READ_LOCK);
52569+ return 0;
52570+}
52571+
52572+/*
52573+ plugin->u.item.s.file.append_key
52574+   key of the first byte after the last byte addressed by this item
52575+*/
52576+reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
52577+{
52578+ item_key_by_coord(coord, key);
52579+ set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
52580+ return key;
52581+}
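+/* Editor's note (illustrative example, not part of the original
+ * patch): for a tail item with key offset 100 and length 50, i.e.
+ * bytes [100, 150), append_key_tail() returns a key with offset 150 --
+ * the first byte a subsequent append would write. append_tail() above
+ * compares this against the flow key to detect holes.
+ */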
52582+
52583+/* plugin->u.item.s.file.init_coord_extension */
52584+void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
52585+{
52586+ uf_coord->valid = 1;
52587+}
52588+
52589+/*
52590+ plugin->u.item.s.file.get_block
52591+*/
52592+int
52593+get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
52594+{
52595+ assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
52596+
52597+ if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
52598+		/* if the node hasn't obtained its block number yet, return 0.
52599+		 * Let's avoid upsetting users with cosmic numbers beyond
52600+		 * the device capacity. */
52601+ *block = 0;
52602+ else
52603+ *block = *znode_get_block(coord->node);
52604+ return 0;
52605+}
52606+
52607+/*
52608+ * Local variables:
52609+ * c-indentation-style: "K&R"
52610+ * mode-name: "LC"
52611+ * c-basic-offset: 8
52612+ * tab-width: 8
52613+ * fill-column: 79
52614+ * scroll-step: 1
52615+ * End:
52616+ */
52617diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/tail.h linux-2.6.24/fs/reiser4/plugin/item/tail.h
52618--- linux-2.6.24.orig/fs/reiser4/plugin/item/tail.h 1970-01-01 03:00:00.000000000 +0300
52619+++ linux-2.6.24/fs/reiser4/plugin/item/tail.h 2008-01-25 11:40:16.702170815 +0300
52620@@ -0,0 +1,58 @@
52621+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52622+
52623+#if !defined( __REISER4_TAIL_H__ )
52624+#define __REISER4_TAIL_H__
52625+
52626+struct tail_coord_extension {
52627+ int not_used;
52628+};
52629+
52630+struct cut_list;
52631+
52632+/* plugin->u.item.b.* */
52633+reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
52634+int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
52635+ const reiser4_item_data *);
52636+int mergeable_tail(const coord_t * p1, const coord_t * p2);
52637+pos_in_node_t nr_units_tail(const coord_t *);
52638+lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
52639+int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
52640+int can_shift_tail(unsigned free_space, coord_t * source,
52641+ znode * target, shift_direction, unsigned *size,
52642+ unsigned want);
52643+void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
52644+ unsigned count, shift_direction, unsigned free_space);
52645+int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
52646+ struct carry_kill_data *);
52647+int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
52648+ struct carry_cut_data *, reiser4_key * smallest_removed,
52649+ reiser4_key * new_first);
52650+int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
52651+ struct carry_kill_data *, reiser4_key * smallest_removed,
52652+ reiser4_key * new_first);
52653+reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
52654+
52655+/* plugin->u.item.s.* */
52656+ssize_t reiser4_write_tail(struct file *file, struct inode * inode,
52657+ const char __user *buf, size_t count, loff_t *pos);
52658+int reiser4_read_tail(struct file *, flow_t *, hint_t *);
52659+int readpage_tail(void *vp, struct page *page);
52660+reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
52661+void init_coord_extension_tail(uf_coord_t *, loff_t offset);
52662+int get_block_address_tail(const coord_t *, sector_t, sector_t *);
52663+int item_balance_dirty_pages(struct address_space *, const flow_t *,
52664+ hint_t *, int back_to_dirty, int set_hint);
52665+
52666+/* __REISER4_TAIL_H__ */
52667+#endif
52668+
52669+/* Make Linus happy.
52670+ Local variables:
52671+ c-indentation-style: "K&R"
52672+ mode-name: "LC"
52673+ c-basic-offset: 8
52674+ tab-width: 8
52675+ fill-column: 120
52676+ scroll-step: 1
52677+ End:
52678+*/
52679diff -urN linux-2.6.24.orig/fs/reiser4/plugin/Makefile linux-2.6.24/fs/reiser4/plugin/Makefile
52680--- linux-2.6.24.orig/fs/reiser4/plugin/Makefile 1970-01-01 03:00:00.000000000 +0300
52681+++ linux-2.6.24/fs/reiser4/plugin/Makefile 2008-01-25 11:39:07.032232418 +0300
52682@@ -0,0 +1,26 @@
52683+obj-$(CONFIG_REISER4_FS) += plugins.o
52684+
52685+plugins-objs := \
52686+ plugin.o \
52687+ plugin_set.o \
52688+ object.o \
52689+ inode_ops.o \
52690+ inode_ops_rename.o \
52691+ file_ops.o \
52692+ file_ops_readdir.o \
52693+ file_plugin_common.o \
52694+ dir_plugin_common.o \
52695+ digest.o \
52696+ hash.o \
52697+ fibration.o \
52698+ tail_policy.o \
52699+ regular.o
52700+
52701+obj-$(CONFIG_REISER4_FS) += item/
52702+obj-$(CONFIG_REISER4_FS) += file/
52703+obj-$(CONFIG_REISER4_FS) += dir/
52704+obj-$(CONFIG_REISER4_FS) += node/
52705+obj-$(CONFIG_REISER4_FS) += compress/
52706+obj-$(CONFIG_REISER4_FS) += space/
52707+obj-$(CONFIG_REISER4_FS) += disk_format/
52708+obj-$(CONFIG_REISER4_FS) += security/
52709diff -urN linux-2.6.24.orig/fs/reiser4/plugin/node/Makefile linux-2.6.24/fs/reiser4/plugin/node/Makefile
52710--- linux-2.6.24.orig/fs/reiser4/plugin/node/Makefile 1970-01-01 03:00:00.000000000 +0300
52711+++ linux-2.6.24/fs/reiser4/plugin/node/Makefile 2008-01-25 11:39:07.032232418 +0300
52712@@ -0,0 +1,5 @@
52713+obj-$(CONFIG_REISER4_FS) += node_plugins.o
52714+
52715+node_plugins-objs := \
52716+ node.o \
52717+ node40.o
52718diff -urN linux-2.6.24.orig/fs/reiser4/plugin/node/node40.c linux-2.6.24/fs/reiser4/plugin/node/node40.c
52719--- linux-2.6.24.orig/fs/reiser4/plugin/node/node40.c 1970-01-01 03:00:00.000000000 +0300
52720+++ linux-2.6.24/fs/reiser4/plugin/node/node40.c 2008-01-25 11:39:07.036233449 +0300
52721@@ -0,0 +1,2924 @@
52722+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52723+
52724+#include "../../debug.h"
52725+#include "../../key.h"
52726+#include "../../coord.h"
52727+#include "../plugin_header.h"
52728+#include "../item/item.h"
52729+#include "node.h"
52730+#include "node40.h"
52731+#include "../plugin.h"
52732+#include "../../jnode.h"
52733+#include "../../znode.h"
52734+#include "../../pool.h"
52735+#include "../../carry.h"
52736+#include "../../tap.h"
52737+#include "../../tree.h"
52738+#include "../../super.h"
52739+#include "../../reiser4.h"
52740+
52741+#include <asm/uaccess.h>
52742+#include <linux/types.h>
52743+#include <linux/prefetch.h>
52744+
52745+/* leaf 40 format:
52746+
52747+ [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ]
52748+ node header fields:           item header fields (per item):
52749+   plugin_id (16)                key
52750+   free_space (16)               plugin_id (16)
52751+   free_space_start (16)         offset (16)
52752+   level (8)
52753+   num_items (16)
52754+   magic (32), flush_time (32)
52755+*/
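+/* Editor's note: the structures behind the layout above are defined in
+ * node40.h; this is an illustrative paraphrase, not the patch's
+ * definitions. Item N's body starts at zdata(node) + offset(N); its
+ * length is offset(N+1) - offset(N), or free_space_start - offset(N)
+ * for the last item, which is exactly what node40_item_length()
+ * computes below.
+ */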
52756+/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". */
52757+/* magic number that is stored in ->magic field of node header */
52758+static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */
52759+
52760+static int prepare_for_update(znode * left, znode * right,
52761+ carry_plugin_info * info);
52762+
52763+/* header of node of reiser40 format is at the beginning of node */
52764+static inline node40_header *node40_node_header(const znode * node /* node to
52765+ * query */ )
52766+{
52767+ assert("nikita-567", node != NULL);
52768+ assert("nikita-568", znode_page(node) != NULL);
52769+ assert("nikita-569", zdata(node) != NULL);
52770+ return (node40_header *) zdata(node);
52771+}
52772+
52773+/* functions to get/set fields of node40_header */
52774+#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
52775+#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
52776+#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
52777+#define nh40_get_level(nh) get_unaligned(&(nh)->level)
52778+#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
52779+#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
52780+
52781+#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
52782+#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
52783+#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
52784+#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
52785+#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
52786+#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
52787+
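+/* Editor's note (illustrative usage, not part of the original patch):
+ * the accessors pair {get,put}_unaligned with le*_to_cpu/cpu_to_le* so
+ * on-disk fields stay correct on big-endian and strict-alignment
+ * machines. A usage model, e.g. inside a helper ("consumed" is a
+ * hypothetical variable):
+ */
+#if 0	/* illustrative model, never compiled */
+	node40_header *nh = node40_node_header(node);
+	unsigned free = nh40_get_free_space(nh);	/* read LE16 field */
+	nh40_set_free_space(nh, free - consumed);	/* write it back */
+#endif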
52788+/* plugin field of node header should be read/set by
52789+ plugin_by_disk_id/save_disk_plugin */
52790+
52791+/* array of item headers is at the end of node */
52792+static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
52793+{
52794+ return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
52795+}
52796+
52797+/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1
52798+ */
52799+static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
52800+{
52801+ return (item_header40 *) (zdata(coord->node) +
52802+ znode_size(coord->node)) - (coord->item_pos) -
52803+ 1;
52804+}
52805+
52806+/* functions to get/set fields of item_header40 */
52807+#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
52808+
52809+#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
52810+
52811+/* plugin field of item header should be read/set by
52812+ plugin_by_disk_id/save_disk_plugin */
52813+
52814+/* plugin methods */
52815+
52816+/* plugin->u.node.item_overhead
52817+ look for description of this method in plugin/node/node.h */
52818+size_t
52819+item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
52820+{
52821+ return sizeof(item_header40);
52822+}
52823+
52824+/* plugin->u.node.free_space
52825+ look for description of this method in plugin/node/node.h */
52826+size_t free_space_node40(znode * node)
52827+{
52828+ assert("nikita-577", node != NULL);
52829+ assert("nikita-578", znode_is_loaded(node));
52830+ assert("nikita-579", zdata(node) != NULL);
52831+
52832+ return nh40_get_free_space(node40_node_header(node));
52833+}
52834+
52835+/* private inline version of node40_num_of_items() for use in this file. This
52836+ is necessary, because address of node40_num_of_items() is taken and it is
52837+ never inlined as a result. */
52838+static inline short node40_num_of_items_internal(const znode * node)
52839+{
52840+ return nh40_get_num_items(node40_node_header(node));
52841+}
52842+
52843+#if REISER4_DEBUG
52844+static inline void check_num_items(const znode * node)
52845+{
52846+ assert("nikita-2749",
52847+ node40_num_of_items_internal(node) == node->nr_items);
52848+ assert("nikita-2746", znode_is_write_locked(node));
52849+}
52850+#else
52851+#define check_num_items(node) noop
52852+#endif
52853+
52854+/* plugin->u.node.num_of_items
52855+ look for description of this method in plugin/node/node.h */
52856+int num_of_items_node40(const znode * node)
52857+{
52858+ return node40_num_of_items_internal(node);
52859+}
52860+
52861+static void
52862+node40_set_num_items(znode * node, node40_header * nh, unsigned value)
52863+{
52864+ assert("nikita-2751", node != NULL);
52865+ assert("nikita-2750", nh == node40_node_header(node));
52866+
52867+ check_num_items(node);
52868+ nh40_set_num_items(nh, value);
52869+ node->nr_items = value;
52870+ check_num_items(node);
52871+}
52872+
52873+/* plugin->u.node.item_by_coord
52874+ look for description of this method in plugin/node/node.h */
52875+char *item_by_coord_node40(const coord_t * coord)
52876+{
52877+ item_header40 *ih;
52878+ char *p;
52879+
52880+ /* @coord is set to existing item */
52881+ assert("nikita-596", coord != NULL);
52882+ assert("vs-255", coord_is_existing_item(coord));
52883+
52884+ ih = node40_ih_at_coord(coord);
52885+ p = zdata(coord->node) + ih40_get_offset(ih);
52886+ return p;
52887+}
52888+
52889+/* plugin->u.node.length_by_coord
52890+ look for description of this method in plugin/node/node.h */
52891+int length_by_coord_node40(const coord_t * coord)
52892+{
52893+ item_header40 *ih;
52894+ int result;
52895+
52896+ /* @coord is set to existing item */
52897+ assert("vs-256", coord != NULL);
52898+ assert("vs-257", coord_is_existing_item(coord));
52899+
52900+ ih = node40_ih_at_coord(coord);
52901+ if ((int)coord->item_pos ==
52902+ node40_num_of_items_internal(coord->node) - 1)
52903+ result =
52904+ nh40_get_free_space_start(node40_node_header(coord->node)) -
52905+ ih40_get_offset(ih);
52906+ else
52907+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
52908+
52909+ return result;
52910+}
52911+
52912+static pos_in_node_t
52913+node40_item_length(const znode * node, pos_in_node_t item_pos)
52914+{
52915+ item_header40 *ih;
52916+ pos_in_node_t result;
52917+
52918+	/* @node must contain an item at @item_pos */
52919+ assert("vs-256", node != NULL);
52920+ assert("vs-257", node40_num_of_items_internal(node) > item_pos);
52921+
52922+ ih = node40_ih_at(node, item_pos);
52923+ if (item_pos == node40_num_of_items_internal(node) - 1)
52924+ result =
52925+ nh40_get_free_space_start(node40_node_header(node)) -
52926+ ih40_get_offset(ih);
52927+ else
52928+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
52929+
52930+ return result;
52931+}
52932+
52933+/* plugin->u.node.plugin_by_coord
52934+ look for description of this method in plugin/node/node.h */
52935+item_plugin *plugin_by_coord_node40(const coord_t * coord)
52936+{
52937+ item_header40 *ih;
52938+ item_plugin *result;
52939+
52940+ /* @coord is set to existing item */
52941+ assert("vs-258", coord != NULL);
52942+ assert("vs-259", coord_is_existing_item(coord));
52943+
52944+ ih = node40_ih_at_coord(coord);
52945+	/* pass NULL instead of the current tree. This is a time-critical call. */
52946+ result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
52947+ return result;
52948+}
52949+
52950+/* plugin->u.node.key_at
52951+ look for description of this method in plugin/node/node.h */
52952+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
52953+{
52954+ item_header40 *ih;
52955+
52956+ assert("nikita-1765", coord_is_existing_item(coord));
52957+
52958+ /* @coord is set to existing item */
52959+ ih = node40_ih_at_coord(coord);
52960+ memcpy(key, &ih->key, sizeof(reiser4_key));
52961+ return key;
52962+}
52963+
52964+/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
52965+
52966+#define NODE_INCSTAT(n, counter) \
52967+ reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
52968+
52969+#define NODE_ADDSTAT(n, counter, val) \
52970+ reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
52971+
52972+/* plugin->u.node.lookup
52973+ look for description of this method in plugin/node/node.h */
52974+node_search_result lookup_node40(znode * node /* node to query */ ,
52975+ const reiser4_key * key /* key to look for */ ,
52976+ lookup_bias bias /* search bias */ ,
52977+ coord_t * coord /* resulting coord */ )
52978+{
52979+ int left;
52980+ int right;
52981+ int found;
52982+ int items;
52983+
52984+ item_header40 *lefth;
52985+ item_header40 *righth;
52986+
52987+ item_plugin *iplug;
52988+ item_header40 *bstop;
52989+ item_header40 *ih;
52990+ cmp_t order;
52991+
52992+ assert("nikita-583", node != NULL);
52993+ assert("nikita-584", key != NULL);
52994+ assert("nikita-585", coord != NULL);
52995+ assert("nikita-2693", znode_is_any_locked(node));
52996+ cassert(REISER4_SEQ_SEARCH_BREAK > 2);
52997+
52998+ items = node_num_items(node);
52999+
53000+ if (unlikely(items == 0)) {
53001+ coord_init_first_unit(coord, node);
53002+ return NS_NOT_FOUND;
53003+ }
53004+
53005+ /* binary search for item that can contain given key */
53006+ left = 0;
53007+ right = items - 1;
53008+ coord->node = node;
53009+ coord_clear_iplug(coord);
53010+ found = 0;
53011+
53012+ lefth = node40_ih_at(node, left);
53013+ righth = node40_ih_at(node, right);
53014+
53015+	/* It is known that for small arrays sequential search is on average
53016+	   more efficient than binary. This is because sequential search is
53017+	   coded as a tight loop that compilers can optimize better, and for
53018+	   small array sizes the gain from this optimization makes sequential
53019+	   search the winner. Another, maybe more important, reason is that
53020+	   sequential scanning is more CPU-cache friendly, whereas binary
53021+	   search effectively destroys CPU caching.
53022+
53023+ Critical here is the notion of "smallness". Reasonable value of
53024+ REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
53025+ fs/reiser4/ulevel/ulevel.c:test_search().
53026+
53027+ Don't try to further optimize sequential search by scanning from
53028+ right to left in attempt to use more efficient loop termination
53029+ condition (comparison with 0). This doesn't work.
53030+
53031+ */
53032+
53033+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
53034+ int median;
53035+ item_header40 *medianh;
53036+
53037+ median = (left + right) / 2;
53038+ medianh = node40_ih_at(node, median);
53039+
53040+ assert("nikita-1084", median >= 0);
53041+ assert("nikita-1085", median < items);
53042+ switch (keycmp(key, &medianh->key)) {
53043+ case LESS_THAN:
53044+ right = median;
53045+ righth = medianh;
53046+ break;
53047+ default:
53048+ wrong_return_value("nikita-586", "keycmp");
53049+ case GREATER_THAN:
53050+ left = median;
53051+ lefth = medianh;
53052+ break;
53053+ case EQUAL_TO:
53054+ do {
53055+ --median;
53056+ /* headers are ordered from right to left */
53057+ ++medianh;
53058+ } while (median >= 0 && keyeq(key, &medianh->key));
53059+ right = left = median + 1;
53060+ ih = lefth = righth = medianh - 1;
53061+ found = 1;
53062+ break;
53063+ }
53064+ }
53065+ /* sequential scan. Item headers, and, therefore, keys are stored at
53066+ the rightmost part of a node from right to left. We are trying to
53067+ access memory from left to right, and hence, scan in _descending_
53068+ order of item numbers.
53069+ */
53070+ if (!found) {
53071+ for (left = right, ih = righth; left >= 0; ++ih, --left) {
53072+ cmp_t comparison;
53073+
53074+ prefetchkey(&(ih + 1)->key);
53075+ comparison = keycmp(&ih->key, key);
53076+ if (comparison == GREATER_THAN)
53077+ continue;
53078+ if (comparison == EQUAL_TO) {
53079+ found = 1;
53080+ do {
53081+ --left;
53082+ ++ih;
53083+ } while (left >= 0 && keyeq(&ih->key, key));
53084+ ++left;
53085+ --ih;
53086+ } else {
53087+ assert("nikita-1256", comparison == LESS_THAN);
53088+ }
53089+ break;
53090+ }
53091+ if (unlikely(left < 0))
53092+ left = 0;
53093+ }
53094+
53095+ assert("nikita-3212", right >= left);
53096+ assert("nikita-3214",
53097+ equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
53098+
53099+ coord_set_item_pos(coord, left);
53100+ coord->unit_pos = 0;
53101+ coord->between = AT_UNIT;
53102+
53103+	/* either key < leftmost key in the node, or the node is corrupted
53104+	   and keys are not sorted */
53105+ bstop = node40_ih_at(node, (unsigned)left);
53106+ order = keycmp(&bstop->key, key);
53107+ if (unlikely(order == GREATER_THAN)) {
53108+ if (unlikely(left != 0)) {
53109+ /* screw up */
53110+ warning("nikita-587", "Key less than %i key in a node",
53111+ left);
53112+ reiser4_print_key("key", key);
53113+ reiser4_print_key("min", &bstop->key);
53114+ print_coord_content("coord", coord);
53115+ return RETERR(-EIO);
53116+ } else {
53117+ coord->between = BEFORE_UNIT;
53118+ return NS_NOT_FOUND;
53119+ }
53120+ }
53121+ /* left <= key, ok */
53122+ iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
53123+
53124+ if (unlikely(iplug == NULL)) {
53125+ warning("nikita-588", "Unknown plugin %i",
53126+ le16_to_cpu(get_unaligned(&bstop->plugin_id)));
53127+ reiser4_print_key("key", key);
53128+ print_coord_content("coord", coord);
53129+ return RETERR(-EIO);
53130+ }
53131+
53132+ coord_set_iplug(coord, iplug);
53133+
53134+ /* if exact key from item header was found by binary search, no
53135+ further checks are necessary. */
53136+ if (found) {
53137+ assert("nikita-1259", order == EQUAL_TO);
53138+ return NS_FOUND;
53139+ }
53140+ if (iplug->b.max_key_inside != NULL) {
53141+ reiser4_key max_item_key;
53142+
53143+ /* key > max_item_key --- outside of an item */
53144+ if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
53145+ coord->unit_pos = 0;
53146+ coord->between = AFTER_ITEM;
53147+			/* FIXME-VS: the key we are looking for does not fit
53148+			   into the found item, so return NS_NOT_FOUND. Without
53149+			   this the following case does not work: there is an
53150+			   extent of file (10000, 10001), and file (10000, 10002)
53151+			   has just been created. When writing to position 0 in
53152+			   that file, traverse_tree would stop here on the twig
53153+			   level, while we want it to go down to the leaf level
53154+ */
53155+ return NS_NOT_FOUND;
53156+ }
53157+ }
53158+
53159+ if (iplug->b.lookup != NULL) {
53160+ return iplug->b.lookup(key, bias, coord);
53161+ } else {
53162+ assert("nikita-1260", order == LESS_THAN);
53163+ coord->between = AFTER_UNIT;
53164+ return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
53165+ }
53166+}
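+/* Editor's note: the sketch below is an illustrative userspace model
+ * of the hybrid search above, not part of the original patch. It
+ * binary-searches a sorted int array until the window is small, then
+ * finishes with a cache-friendly linear scan (names hypothetical; the
+ * real code scans item headers right-to-left because they are stored
+ * at the end of the node, and it handles duplicate keys specially):
+ */
+#if 0	/* illustrative model, never compiled */
+#define SEQ_BREAK 3
+static int hybrid_search(const int *keys, int n, int wanted)
+{
+	int left = 0, right = n - 1;
+
+	while (right - left >= SEQ_BREAK) {
+		int median = (left + right) / 2;
+
+		if (wanted < keys[median])
+			right = median;
+		else
+			left = median;
+	}
+	while (left <= right && keys[left] < wanted)
+		left++;
+	return left;	/* index of the first key >= wanted */
+}
+#endif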
53167+
53168+#undef NODE_ADDSTAT
53169+#undef NODE_INCSTAT
53170+
53171+/* plugin->u.node.estimate
53172+ look for description of this method in plugin/node/node.h */
53173+size_t estimate_node40(znode * node)
53174+{
53175+ size_t result;
53176+
53177+ assert("nikita-597", node != NULL);
53178+
53179+ result = free_space_node40(node) - sizeof(item_header40);
53180+
53181+ return (result > 0) ? result : 0;
53182+}
53183+
53184+/* plugin->u.node.check
53185+ look for description of this method in plugin/node/node.h */
53186+int check_node40(const znode * node /* node to check */ ,
53187+ __u32 flags /* check flags */ ,
53188+ const char **error /* where to store error message */ )
53189+{
53190+ int nr_items;
53191+ int i;
53192+ reiser4_key prev;
53193+ unsigned old_offset;
53194+ tree_level level;
53195+ coord_t coord;
53196+ int result;
53197+
53198+ assert("nikita-580", node != NULL);
53199+ assert("nikita-581", error != NULL);
53200+ assert("nikita-2948", znode_is_loaded(node));
53201+
53202+ if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
53203+ return 0;
53204+
53205+ assert("nikita-582", zdata(node) != NULL);
53206+
53207+ nr_items = node40_num_of_items_internal(node);
53208+ if (nr_items < 0) {
53209+ *error = "Negative number of items";
53210+ return -1;
53211+ }
53212+
53213+ if (flags & REISER4_NODE_DKEYS)
53214+ prev = *znode_get_ld_key((znode *) node);
53215+ else
53216+ prev = *reiser4_min_key();
53217+
53218+ old_offset = 0;
53219+ coord_init_zero(&coord);
53220+ coord.node = (znode *) node;
53221+ coord.unit_pos = 0;
53222+ coord.between = AT_UNIT;
53223+ level = znode_get_level(node);
53224+ for (i = 0; i < nr_items; i++) {
53225+ item_header40 *ih;
53226+ reiser4_key unit_key;
53227+ unsigned j;
53228+
53229+ ih = node40_ih_at(node, (unsigned)i);
53230+ coord_set_item_pos(&coord, i);
53231+ if ((ih40_get_offset(ih) >=
53232+ znode_size(node) - nr_items * sizeof(item_header40)) ||
53233+ (ih40_get_offset(ih) < sizeof(node40_header))) {
53234+ *error = "Offset is out of bounds";
53235+ return -1;
53236+ }
53237+ if (ih40_get_offset(ih) <= old_offset) {
53238+ *error = "Offsets are in wrong order";
53239+ return -1;
53240+ }
53241+ if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
53242+ *error = "Wrong offset of first item";
53243+ return -1;
53244+ }
53245+ old_offset = ih40_get_offset(ih);
53246+
53247+ if (keygt(&prev, &ih->key)) {
53248+ *error = "Keys are in wrong order";
53249+ return -1;
53250+ }
53251+ if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
53252+ *error = "Wrong key of first unit";
53253+ return -1;
53254+ }
53255+ prev = ih->key;
53256+ for (j = 0; j < coord_num_units(&coord); ++j) {
53257+ coord.unit_pos = j;
53258+ unit_key_by_coord(&coord, &unit_key);
53259+ if (keygt(&prev, &unit_key)) {
53260+ *error = "Unit keys are in wrong order";
53261+ return -1;
53262+ }
53263+ prev = unit_key;
53264+ }
53265+ coord.unit_pos = 0;
53266+ if (level != TWIG_LEVEL && item_is_extent(&coord)) {
53267+ *error = "extent on the wrong level";
53268+ return -1;
53269+ }
53270+ if (level == LEAF_LEVEL && item_is_internal(&coord)) {
53271+ *error = "internal item on the wrong level";
53272+ return -1;
53273+ }
53274+ if (level != LEAF_LEVEL &&
53275+ !item_is_internal(&coord) && !item_is_extent(&coord)) {
53276+ *error = "wrong item on the internal level";
53277+ return -1;
53278+ }
53279+ if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
53280+ *error = "non-internal item on the internal level";
53281+ return -1;
53282+ }
53283+#if REISER4_DEBUG
53284+ if (item_plugin_by_coord(&coord)->b.check
53285+ && item_plugin_by_coord(&coord)->b.check(&coord, error))
53286+ return -1;
53287+#endif
53288+ if (i) {
53289+ coord_t prev_coord;
53290+			/* two neighboring items cannot be mergeable */
53291+ coord_dup(&prev_coord, &coord);
53292+ coord_prev_item(&prev_coord);
53293+ if (are_items_mergeable(&prev_coord, &coord)) {
53294+ *error = "mergeable items in one node";
53295+ return -1;
53296+ }
53297+
53298+ }
53299+ }
53300+
53301+ if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
53302+ coord_t coord;
53303+ item_plugin *iplug;
53304+
53305+ coord_init_last_unit(&coord, node);
53306+ iplug = item_plugin_by_coord(&coord);
53307+ if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
53308+ iplug->s.file.append_key != NULL) {
53309+ reiser4_key mkey;
53310+
53311+ iplug->s.file.append_key(&coord, &mkey);
53312+ set_key_offset(&mkey, get_key_offset(&mkey) - 1);
53313+ read_lock_dk(current_tree);
53314+ result = keygt(&mkey, znode_get_rd_key((znode *) node));
53315+ read_unlock_dk(current_tree);
53316+ if (result) {
53317+ *error = "key of rightmost item is too large";
53318+ return -1;
53319+ }
53320+ }
53321+ }
53322+ if (flags & REISER4_NODE_DKEYS) {
53323+ read_lock_tree(current_tree);
53324+ read_lock_dk(current_tree);
53325+
53326+ flags |= REISER4_NODE_TREE_STABLE;
53327+
53328+ if (keygt(&prev, znode_get_rd_key((znode *) node))) {
53329+ if (flags & REISER4_NODE_TREE_STABLE) {
53330+ *error = "Last key is greater than rdkey";
53331+ read_unlock_dk(current_tree);
53332+ read_unlock_tree(current_tree);
53333+ return -1;
53334+ }
53335+ }
53336+ if (keygt
53337+ (znode_get_ld_key((znode *) node),
53338+ znode_get_rd_key((znode *) node))) {
53339+ *error = "ldkey is greater than rdkey";
53340+ read_unlock_dk(current_tree);
53341+ read_unlock_tree(current_tree);
53342+ return -1;
53343+ }
53344+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
53345+ (node->left != NULL) &&
53346+ !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
53347+ ergo(flags & REISER4_NODE_TREE_STABLE,
53348+ !keyeq(znode_get_rd_key(node->left),
53349+ znode_get_ld_key((znode *) node)))
53350+ && ergo(!(flags & REISER4_NODE_TREE_STABLE),
53351+ keygt(znode_get_rd_key(node->left),
53352+ znode_get_ld_key((znode *) node)))) {
53353+ *error = "left rdkey or ldkey is wrong";
53354+ read_unlock_dk(current_tree);
53355+ read_unlock_tree(current_tree);
53356+ return -1;
53357+ }
53358+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
53359+ (node->right != NULL) &&
53360+ !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
53361+ ergo(flags & REISER4_NODE_TREE_STABLE,
53362+ !keyeq(znode_get_rd_key((znode *) node),
53363+ znode_get_ld_key(node->right)))
53364+ && ergo(!(flags & REISER4_NODE_TREE_STABLE),
53365+ keygt(znode_get_rd_key((znode *) node),
53366+ znode_get_ld_key(node->right)))) {
53367+ *error = "rdkey or right ldkey is wrong";
53368+ read_unlock_dk(current_tree);
53369+ read_unlock_tree(current_tree);
53370+ return -1;
53371+ }
53372+
53373+ read_unlock_dk(current_tree);
53374+ read_unlock_tree(current_tree);
53375+ }
53376+
53377+ return 0;
53378+}
53379+
53380+/* plugin->u.node.parse
53381+ look for description of this method in plugin/node/node.h */
53382+int parse_node40(znode * node /* node to parse */ )
53383+{
53384+ node40_header *header;
53385+ int result;
53386+ d8 level;
53387+
53388+ header = node40_node_header((znode *) node);
53389+ result = -EIO;
53390+ level = nh40_get_level(header);
53391+ if (unlikely(((__u8) znode_get_level(node)) != level))
53392+ warning("nikita-494", "Wrong level found in node: %i != %i",
53393+ znode_get_level(node), level);
53394+ else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
53395+ warning("nikita-495",
53396+ "Wrong magic in tree node: want %x, got %x",
53397+ REISER4_NODE_MAGIC, nh40_get_magic(header));
53398+ else {
53399+ node->nr_items = node40_num_of_items_internal(node);
53400+ result = 0;
53401+ }
53402+ return RETERR(result);
53403+}
53404+
53405+/* plugin->u.node.init
53406+ look for description of this method in plugin/node/node.h */
53407+int init_node40(znode * node /* node to initialise */ )
53408+{
53409+ node40_header *header;
53410+
53411+ assert("nikita-570", node != NULL);
53412+ assert("nikita-572", zdata(node) != NULL);
53413+
53414+ header = node40_node_header(node);
53415+ memset(header, 0, sizeof(node40_header));
53416+ nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));
53417+ nh40_set_free_space_start(header, sizeof(node40_header));
53418+ /* sane hypothesis: 0 in CPU format is 0 in disk format */
53419+ /* items: 0 */
53420+ save_plugin_id(node_plugin_to_plugin(node->nplug),
53421+ &header->common_header.plugin_id);
53422+ nh40_set_level(header, znode_get_level(node));
53423+ nh40_set_magic(header, REISER4_NODE_MAGIC);
53424+ node->nr_items = 0;
53425+ nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
53426+
53427+ /* flags: 0 */
53428+ return 0;
53429+}
53430+
53431+#ifdef GUESS_EXISTS
53432+int guess_node40(const znode * node /* node to guess plugin of */ )
53433+{
53434+ node40_header *nethack;
53435+
53436+ assert("nikita-1058", node != NULL);
53437+ nethack = node40_node_header(node);
53438+ return
53439+ (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
53440+ (plugin_by_disk_id(znode_get_tree(node),
53441+ REISER4_NODE_PLUGIN_TYPE,
53442+ &nethack->common_header.plugin_id)->h.id ==
53443+ NODE40_ID);
53444+}
53445+#endif
53446+
53447+/* plugin->u.node.change_item_size
53448+ look for description of this method in plugin/node/node.h */
53449+void change_item_size_node40(coord_t * coord, int by)
53450+{
53451+ node40_header *nh;
53452+ item_header40 *ih;
53453+ char *item_data;
53454+ int item_length;
53455+ unsigned i;
53456+
53457+ /* make sure that @item is coord of existing item */
53458+ assert("vs-210", coord_is_existing_item(coord));
53459+
53460+ nh = node40_node_header(coord->node);
53461+
53462+ item_data = item_by_coord_node40(coord);
53463+ item_length = length_by_coord_node40(coord);
53464+
53465+ /* move item bodies */
53466+ ih = node40_ih_at_coord(coord);
53467+ memmove(item_data + item_length + by, item_data + item_length,
53468+ nh40_get_free_space_start(node40_node_header(coord->node)) -
53469+ (ih40_get_offset(ih) + item_length));
53470+
53471+ /* update offsets of moved items */
53472+ for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
53473+ ih = node40_ih_at(coord->node, i);
53474+ ih40_set_offset(ih, ih40_get_offset(ih) + by);
53475+ }
53476+
53477+ /* update node header */
53478+ nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
53479+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
53480+}
53481+
53482+static int should_notify_parent(const znode * node)
53483+{
53484+ /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
53485+ return !disk_addr_eq(znode_get_block(node),
53486+ &znode_get_tree(node)->root_block);
53487+}
53488+
53489+/* plugin->u.node.create_item
53490+ look for description of this method in plugin/node/node.h */
53491+int
53492+create_item_node40(coord_t *target, const reiser4_key *key,
53493+ reiser4_item_data *data, carry_plugin_info *info)
53494+{
53495+ node40_header *nh;
53496+ item_header40 *ih;
53497+ unsigned offset;
53498+ unsigned i;
53499+
53500+ nh = node40_node_header(target->node);
53501+
53502+ assert("vs-212", coord_is_between_items(target));
53503+ /* node must have enough free space */
53504+ assert("vs-254",
53505+ free_space_node40(target->node) >=
53506+ data->length + sizeof(item_header40));
53507+ assert("vs-1410", data->length >= 0);
53508+
53509+ if (coord_set_to_right(target))
53510+		/* there are no items to the right of @target, so the new
53511+		   item will be inserted after the last one */
53512+ coord_set_item_pos(target, nh40_get_num_items(nh));
53513+
53514+ if (target->item_pos < nh40_get_num_items(nh)) {
53515+ /* there are items to be moved to prepare space for new
53516+ item */
53517+ ih = node40_ih_at_coord(target);
53518+ /* new item will start at this offset */
53519+ offset = ih40_get_offset(ih);
53520+
53521+ memmove(zdata(target->node) + offset + data->length,
53522+ zdata(target->node) + offset,
53523+ nh40_get_free_space_start(nh) - offset);
53524+ /* update headers of moved items */
53525+ for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
53526+ ih = node40_ih_at(target->node, i);
53527+ ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
53528+ }
53529+
53530+ /* @ih is set to item header of the last item, move item headers */
53531+ memmove(ih - 1, ih,
53532+ sizeof(item_header40) * (nh40_get_num_items(nh) -
53533+ target->item_pos));
53534+ } else {
53535+ /* new item will start at this offset */
53536+ offset = nh40_get_free_space_start(nh);
53537+ }
53538+
53539+ /* make item header for the new item */
53540+ ih = node40_ih_at_coord(target);
53541+ memcpy(&ih->key, key, sizeof(reiser4_key));
53542+ ih40_set_offset(ih, offset);
53543+ save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
53544+
53545+ /* update node header */
53546+ nh40_set_free_space(nh,
53547+ nh40_get_free_space(nh) - data->length -
53548+ sizeof(item_header40));
53549+ nh40_set_free_space_start(nh,
53550+ nh40_get_free_space_start(nh) + data->length);
53551+ node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
53552+
53553+	/* FIXME: check how create_item works when between is set to BEFORE_UNIT */
53554+ target->unit_pos = 0;
53555+ target->between = AT_UNIT;
53556+ coord_clear_iplug(target);
53557+
53558+ /* initialize item */
53559+ if (data->iplug->b.init != NULL) {
53560+ data->iplug->b.init(target, NULL, data);
53561+ }
53562+ /* copy item body */
53563+ if (data->iplug->b.paste != NULL) {
53564+ data->iplug->b.paste(target, data, info);
53565+ } else if (data->data != NULL) {
53566+ if (data->user) {
53567+			/* AUDIT: shouldn't we check that the pointer
53568+			   from user space is valid and the data bytes are
53569+			   available? How will we return -EFAULT of some kind
53570+			   without this check? */
53571+ assert("nikita-3038", reiser4_schedulable());
53572+ /* copy data from user space */
53573+ __copy_from_user(zdata(target->node) + offset,
53574+ (const char __user *)data->data,
53575+ (unsigned)data->length);
53576+ } else
53577+ /* copy from kernel space */
53578+ memcpy(zdata(target->node) + offset, data->data,
53579+ (unsigned)data->length);
53580+ }
53581+
53582+ if (target->item_pos == 0) {
53583+ /* left delimiting key has to be updated */
53584+ prepare_for_update(NULL, target->node, info);
53585+ }
53586+
53587+ if (item_plugin_by_coord(target)->b.create_hook != NULL) {
53588+ item_plugin_by_coord(target)->b.create_hook(target, data->arg);
53589+ }
53590+
53591+ return 0;
53592+}
53593+
53594+/* plugin->u.node.update_item_key
53595+ look for description of this method in plugin/node/node.h */
53596+void
53597+update_item_key_node40(coord_t * target, const reiser4_key * key,
53598+ carry_plugin_info * info)
53599+{
53600+ item_header40 *ih;
53601+
53602+ ih = node40_ih_at_coord(target);
53603+ memcpy(&ih->key, key, sizeof(reiser4_key));
53604+
53605+ if (target->item_pos == 0) {
53606+ prepare_for_update(NULL, target->node, info);
53607+ }
53608+}
53609+
53610+/* this bits encode cut mode */
53611+#define CMODE_TAIL 1
53612+#define CMODE_WHOLE 2
53613+#define CMODE_HEAD 4
53614+
53615+struct cut40_info {
53616+ int mode;
53617+ pos_in_node_t tail_removed; /* position of item which gets tail removed */
53618+	pos_in_node_t first_removed;	/* position of the leftmost item among items removed completely */
53619+ pos_in_node_t removed_count; /* number of items removed completely */
53620+ pos_in_node_t head_removed; /* position of item which gets head removed */
53621+
53622+ pos_in_node_t freed_space_start;
53623+ pos_in_node_t freed_space_end;
53624+ pos_in_node_t first_moved;
53625+ pos_in_node_t head_removed_location;
53626+};
53627+
53628+static void init_cinfo(struct cut40_info *cinfo)
53629+{
53630+ cinfo->mode = 0;
53631+ cinfo->tail_removed = MAX_POS_IN_NODE;
53632+ cinfo->first_removed = MAX_POS_IN_NODE;
53633+ cinfo->removed_count = MAX_POS_IN_NODE;
53634+ cinfo->head_removed = MAX_POS_IN_NODE;
53635+ cinfo->freed_space_start = MAX_POS_IN_NODE;
53636+ cinfo->freed_space_end = MAX_POS_IN_NODE;
53637+ cinfo->first_moved = MAX_POS_IN_NODE;
53638+ cinfo->head_removed_location = MAX_POS_IN_NODE;
53639+}
53640+
53641+/* complete cut_node40/kill_node40 by removing the gap created by the cut or kill */
53642+static void compact(znode * node, struct cut40_info *cinfo)
53643+{
53644+ node40_header *nh;
53645+ item_header40 *ih;
53646+ pos_in_node_t freed;
53647+ pos_in_node_t pos, nr_items;
53648+
53649+ assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
53650+ cinfo->freed_space_end != MAX_POS_IN_NODE &&
53651+ cinfo->first_moved != MAX_POS_IN_NODE));
53652+ assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
53653+
53654+ nh = node40_node_header(node);
53655+ nr_items = nh40_get_num_items(nh);
53656+
53657+	/* remove the gap left by the removal */
53658+ memmove(zdata(node) + cinfo->freed_space_start,
53659+ zdata(node) + cinfo->freed_space_end,
53660+ nh40_get_free_space_start(nh) - cinfo->freed_space_end);
53661+
53662+ /* update item headers of moved items - change their locations */
53663+ pos = cinfo->first_moved;
53664+ ih = node40_ih_at(node, pos);
53665+ if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
53666+ assert("vs-1580", pos == cinfo->head_removed);
53667+ ih40_set_offset(ih, cinfo->head_removed_location);
53668+ pos++;
53669+ ih--;
53670+ }
53671+
53672+ freed = cinfo->freed_space_end - cinfo->freed_space_start;
53673+ for (; pos < nr_items; pos++, ih--) {
53674+ assert("vs-1581", ih == node40_ih_at(node, pos));
53675+ ih40_set_offset(ih, ih40_get_offset(ih) - freed);
53676+ }
53677+
53678+	/* free space start moved to the left */
53679+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
53680+
53681+ if (cinfo->removed_count != MAX_POS_IN_NODE) {
53682+ /* number of items changed. Remove item headers of those items */
53683+ ih = node40_ih_at(node, nr_items - 1);
53684+ memmove(ih + cinfo->removed_count, ih,
53685+ sizeof(item_header40) * (nr_items -
53686+ cinfo->removed_count -
53687+ cinfo->first_removed));
53688+ freed += sizeof(item_header40) * cinfo->removed_count;
53689+ node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
53690+ }
53691+
53692+ /* total amount of free space increased */
53693+ nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
53694+}
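+
+/* Illustration (schematic): compact() closes the byte gap
+ * [freed_space_start, freed_space_end) left by a cut:
+ *
+ *	before: | nh | kept bodies |  gap  | moved bodies | free       | ihs  |
+ *	after:  | nh | kept bodies | moved bodies        | free + gap | ihs' |
+ *
+ * where ihs' are the surviving item headers: the offsets of all moved
+ * items are decreased by the gap size and, if whole items were removed,
+ * the header array at the end of the block is compacted as well. */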
53695+
53696+int shrink_item_node40(coord_t * coord, int delta)
53697+{
53698+ node40_header *nh;
53699+ item_header40 *ih;
53700+ pos_in_node_t pos;
53701+ pos_in_node_t nr_items;
53702+ char *end;
53703+ znode *node;
53704+ int off;
53705+
53706+ assert("nikita-3487", coord != NULL);
53707+ assert("nikita-3488", delta >= 0);
53708+
53709+ node = coord->node;
53710+ nh = node40_node_header(node);
53711+ nr_items = nh40_get_num_items(nh);
53712+
53713+ ih = node40_ih_at_coord(coord);
53714+ assert("nikita-3489", delta <= length_by_coord_node40(coord));
53715+ off = ih40_get_offset(ih) + length_by_coord_node40(coord);
53716+ end = zdata(node) + off;
53717+
53718+	/* remove the gap left by the removal */
53719+ memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
53720+
53721+ /* update item headers of moved items - change their locations */
53722+ pos = coord->item_pos + 1;
53723+ ih = node40_ih_at(node, pos);
53724+ for (; pos < nr_items; pos++, ih--) {
53725+ assert("nikita-3490", ih == node40_ih_at(node, pos));
53726+ ih40_set_offset(ih, ih40_get_offset(ih) - delta);
53727+ }
53728+
53729+ /* free space start moved to left */
53730+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
53731+ /* total amount of free space increased */
53732+ nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
53733+	/*
53734+	 * This method does _not_ change the number of items. Hence, it cannot
53735+	 * make the node empty. It also does not remove items at all, which
53736+	 * means that no keys have to be updated either.
53737+	 */
53738+ return 0;
53739+}
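+
+/* Illustration (schematic): shrinking the item at @coord by @delta bytes.
+ * With X marking the bytes given up at the end of the item, the body area
+ * goes from
+ *
+ *	| ... | item k ....XXXX | item k+1 | ... | free space         |
+ * to
+ *	| ... | item k ....     | item k+1 | ... | free space + delta |
+ *
+ * the items that follow keep their contents; only their recorded offsets
+ * move left by @delta, as the loop above does. */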
53740+
53741+/* this is used by cut_node40 and kill_node40. It analyses the input parameters and calculates the cut mode. There are
53742+   2 types of cut. The first is when a unit is removed from the middle of an item; in this case the function returns 1.
53743+   Everything else fits into the second case: 0 or 1 items get their tail cut, 0 or more items are removed completely,
53744+   and 0 or 1 items get their head cut. The function returns 0 in this case */
53745+static int
53746+parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
53747+{
53748+ reiser4_key left_key, right_key;
53749+ reiser4_key min_from_key, max_to_key;
53750+ const reiser4_key *from_key, *to_key;
53751+
53752+ init_cinfo(cinfo);
53753+
53754+ /* calculate minimal key stored in first item of items to be cut (params->from) */
53755+ item_key_by_coord(params->from, &min_from_key);
53756+ /* and max key stored in last item of items to be cut (params->to) */
53757+ max_item_key_by_coord(params->to, &max_to_key);
53758+
53759+ /* if cut key range is not defined in input parameters - define it using cut coord range */
53760+ if (params->from_key == NULL) {
53761+ assert("vs-1513", params->to_key == NULL);
53762+ unit_key_by_coord(params->from, &left_key);
53763+ from_key = &left_key;
53764+ max_unit_key_by_coord(params->to, &right_key);
53765+ to_key = &right_key;
53766+ } else {
53767+ from_key = params->from_key;
53768+ to_key = params->to_key;
53769+ }
53770+
53771+ if (params->from->item_pos == params->to->item_pos) {
53772+ if (keylt(&min_from_key, from_key)
53773+ && keylt(to_key, &max_to_key))
53774+ return 1;
53775+
53776+ if (keygt(from_key, &min_from_key)) {
53777+			/* tail of item is to be cut */
53778+ cinfo->tail_removed = params->from->item_pos;
53779+ cinfo->mode |= CMODE_TAIL;
53780+ } else if (keylt(to_key, &max_to_key)) {
53781+ /* head of item is to be cut */
53782+ cinfo->head_removed = params->from->item_pos;
53783+ cinfo->mode |= CMODE_HEAD;
53784+ } else {
53785+ /* item is removed completely */
53786+ cinfo->first_removed = params->from->item_pos;
53787+ cinfo->removed_count = 1;
53788+ cinfo->mode |= CMODE_WHOLE;
53789+ }
53790+ } else {
53791+ cinfo->first_removed = params->from->item_pos + 1;
53792+ cinfo->removed_count =
53793+ params->to->item_pos - params->from->item_pos - 1;
53794+
53795+ if (keygt(from_key, &min_from_key)) {
53796+ /* first item is not cut completely */
53797+ cinfo->tail_removed = params->from->item_pos;
53798+ cinfo->mode |= CMODE_TAIL;
53799+ } else {
53800+ cinfo->first_removed--;
53801+ cinfo->removed_count++;
53802+ }
53803+ if (keylt(to_key, &max_to_key)) {
53804+ /* last item is not cut completely */
53805+ cinfo->head_removed = params->to->item_pos;
53806+ cinfo->mode |= CMODE_HEAD;
53807+ } else {
53808+ cinfo->removed_count++;
53809+ }
53810+ if (cinfo->removed_count)
53811+ cinfo->mode |= CMODE_WHOLE;
53812+ }
53813+
53814+ return 0;
53815+}
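+
+/* Example (illustrative): if both cut boundaries fall strictly inside one
+ * item (min_from_key < from_key and to_key < max_to_key with
+ * params->from->item_pos == params->to->item_pos), units are removed from
+ * the middle of that item and parse_cut() returns 1; prepare_for_compact()
+ * then calls the item plugin's cut_units/kill_units directly instead of
+ * going through the mode machinery. */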
53816+
53817+static void
53818+call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
53819+ carry_kill_data * kdata)
53820+{
53821+ coord_t coord;
53822+ item_plugin *iplug;
53823+ pos_in_node_t pos;
53824+
53825+ coord.node = node;
53826+ coord.unit_pos = 0;
53827+ coord.between = AT_UNIT;
53828+ for (pos = 0; pos < count; pos++) {
53829+ coord_set_item_pos(&coord, from + pos);
53830+ coord.unit_pos = 0;
53831+ coord.between = AT_UNIT;
53832+ iplug = item_plugin_by_coord(&coord);
53833+ if (iplug->b.kill_hook) {
53834+ iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
53835+ kdata);
53836+ }
53837+ }
53838+}
53839+
53840+/* this is used to kill item partially */
53841+static pos_in_node_t
53842+kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
53843+ reiser4_key * smallest_removed, reiser4_key * new_first_key)
53844+{
53845+ struct carry_kill_data *kdata;
53846+ item_plugin *iplug;
53847+
53848+ kdata = data;
53849+ iplug = item_plugin_by_coord(coord);
53850+
53851+ assert("vs-1524", iplug->b.kill_units);
53852+ return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
53853+ new_first_key);
53854+}
53855+
53856+/* call item plugin to kill the tail of an item */
53857+static pos_in_node_t
53858+kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
53859+{
53860+ struct carry_kill_data *kdata;
53861+ pos_in_node_t to;
53862+
53863+ kdata = data;
53864+ to = coord_last_unit_pos(coord);
53865+ return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
53866+ NULL);
53867+}
53868+
53869+/* call item plugin to cut head of item */
53870+static pos_in_node_t
53871+kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
53872+ reiser4_key * new_first_key)
53873+{
53874+ return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
53875+ new_first_key);
53876+}
53877+
53878+/* this is used to cut item partially */
53879+static pos_in_node_t
53880+cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
53881+ reiser4_key * smallest_removed, reiser4_key * new_first_key)
53882+{
53883+ carry_cut_data *cdata;
53884+ item_plugin *iplug;
53885+
53886+ cdata = data;
53887+ iplug = item_plugin_by_coord(coord);
53888+ assert("vs-302", iplug->b.cut_units);
53889+ return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
53890+ new_first_key);
53891+}
53892+
53893+/* call item plugin to cut the tail of an item */
53894+static pos_in_node_t
53895+cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
53896+{
53897+ carry_cut_data *cdata;
53898+ pos_in_node_t to;
53899+
53900+ cdata = data;
53901+ to = coord_last_unit_pos(cdata->params.from);
53902+ return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
53903+}
53904+
53905+/* call item plugin to cut head of item */
53906+static pos_in_node_t
53907+cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
53908+ reiser4_key * new_first_key)
53909+{
53910+ return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
53911+ new_first_key);
53912+}
53913+
53914+/* this returns 1 if the key of the first item changed, 0 if it did not */
53915+static int
53916+prepare_for_compact(struct cut40_info *cinfo,
53917+ const struct cut_kill_params *params, int is_cut,
53918+ void *data, carry_plugin_info * info)
53919+{
53920+ znode *node;
53921+ item_header40 *ih;
53922+ pos_in_node_t freed;
53923+ pos_in_node_t item_pos;
53924+ coord_t coord;
53925+ reiser4_key new_first_key;
53926+ pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
53927+ void *, reiser4_key *, reiser4_key *);
53928+ pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
53929+ pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
53930+ reiser4_key *);
53931+ int retval;
53932+
53933+ retval = 0;
53934+
53935+ node = params->from->node;
53936+
53937+ assert("vs-184", node == params->to->node);
53938+ assert("vs-312", !node_is_empty(node));
53939+ assert("vs-297",
53940+ coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
53941+
53942+ if (is_cut) {
53943+ kill_units_f = cut_units;
53944+ kill_tail_f = cut_tail;
53945+ kill_head_f = cut_head;
53946+ } else {
53947+ kill_units_f = kill_units;
53948+ kill_tail_f = kill_tail;
53949+ kill_head_f = kill_head;
53950+ }
53951+
53952+ if (parse_cut(cinfo, params) == 1) {
53953+ /* cut from the middle of item */
53954+ freed =
53955+ kill_units_f(params->from, params->from->unit_pos,
53956+ params->to->unit_pos, data,
53957+ params->smallest_removed, NULL);
53958+
53959+ item_pos = params->from->item_pos;
53960+ ih = node40_ih_at(node, item_pos);
53961+ cinfo->freed_space_start =
53962+ ih40_get_offset(ih) + node40_item_length(node,
53963+ item_pos) - freed;
53964+ cinfo->freed_space_end = cinfo->freed_space_start + freed;
53965+ cinfo->first_moved = item_pos + 1;
53966+ } else {
53967+ assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
53968+ cinfo->first_removed != MAX_POS_IN_NODE ||
53969+ cinfo->head_removed != MAX_POS_IN_NODE));
53970+
53971+ switch (cinfo->mode) {
53972+ case CMODE_TAIL:
53973+ /* one item gets cut partially from its end */
53974+ assert("vs-1562",
53975+ cinfo->tail_removed == params->from->item_pos);
53976+
53977+ freed =
53978+ kill_tail_f(params->from, data,
53979+ params->smallest_removed);
53980+
53981+ item_pos = cinfo->tail_removed;
53982+ ih = node40_ih_at(node, item_pos);
53983+ cinfo->freed_space_start =
53984+ ih40_get_offset(ih) + node40_item_length(node,
53985+ item_pos) -
53986+ freed;
53987+ cinfo->freed_space_end =
53988+ cinfo->freed_space_start + freed;
53989+ cinfo->first_moved = cinfo->tail_removed + 1;
53990+ break;
53991+
53992+ case CMODE_WHOLE:
53993+ /* one or more items get removed completely */
53994+ assert("vs-1563",
53995+ cinfo->first_removed == params->from->item_pos);
53996+ assert("vs-1564", cinfo->removed_count > 0
53997+ && cinfo->removed_count != MAX_POS_IN_NODE);
53998+
53999+ /* call kill hook for all items removed completely */
54000+ if (is_cut == 0)
54001+ call_kill_hooks(node, cinfo->first_removed,
54002+ cinfo->removed_count, data);
54003+
54004+ item_pos = cinfo->first_removed;
54005+ ih = node40_ih_at(node, item_pos);
54006+
54007+ if (params->smallest_removed)
54008+ memcpy(params->smallest_removed, &ih->key,
54009+ sizeof(reiser4_key));
54010+
54011+ cinfo->freed_space_start = ih40_get_offset(ih);
54012+
54013+ item_pos += (cinfo->removed_count - 1);
54014+ ih -= (cinfo->removed_count - 1);
54015+ cinfo->freed_space_end =
54016+ ih40_get_offset(ih) + node40_item_length(node,
54017+ item_pos);
54018+ cinfo->first_moved = item_pos + 1;
54019+ if (cinfo->first_removed == 0)
54020+ /* key of first item of the node changes */
54021+ retval = 1;
54022+ break;
54023+
54024+ case CMODE_HEAD:
54025+ /* one item gets cut partially from its head */
54026+ assert("vs-1565",
54027+ cinfo->head_removed == params->from->item_pos);
54028+
54029+ freed =
54030+ kill_head_f(params->to, data,
54031+ params->smallest_removed,
54032+ &new_first_key);
54033+
54034+ item_pos = cinfo->head_removed;
54035+ ih = node40_ih_at(node, item_pos);
54036+ cinfo->freed_space_start = ih40_get_offset(ih);
54037+ cinfo->freed_space_end = ih40_get_offset(ih) + freed;
54038+ cinfo->first_moved = cinfo->head_removed + 1;
54039+
54040+ /* item head is removed, therefore, item key changed */
54041+ coord.node = node;
54042+ coord_set_item_pos(&coord, item_pos);
54043+ coord.unit_pos = 0;
54044+ coord.between = AT_UNIT;
54045+ update_item_key_node40(&coord, &new_first_key, NULL);
54046+ if (item_pos == 0)
54047+ /* key of first item of the node changes */
54048+ retval = 1;
54049+ break;
54050+
54051+ case CMODE_TAIL | CMODE_WHOLE:
54052+ /* one item gets cut from its end and one or more items get removed completely */
54053+ assert("vs-1566",
54054+ cinfo->tail_removed == params->from->item_pos);
54055+ assert("vs-1567",
54056+ cinfo->first_removed == cinfo->tail_removed + 1);
54057+ assert("vs-1564", cinfo->removed_count > 0
54058+ && cinfo->removed_count != MAX_POS_IN_NODE);
54059+
54060+ freed =
54061+ kill_tail_f(params->from, data,
54062+ params->smallest_removed);
54063+
54064+ item_pos = cinfo->tail_removed;
54065+ ih = node40_ih_at(node, item_pos);
54066+ cinfo->freed_space_start =
54067+ ih40_get_offset(ih) + node40_item_length(node,
54068+ item_pos) -
54069+ freed;
54070+
54071+ /* call kill hook for all items removed completely */
54072+ if (is_cut == 0)
54073+ call_kill_hooks(node, cinfo->first_removed,
54074+ cinfo->removed_count, data);
54075+
54076+ item_pos += cinfo->removed_count;
54077+ ih -= cinfo->removed_count;
54078+ cinfo->freed_space_end =
54079+ ih40_get_offset(ih) + node40_item_length(node,
54080+ item_pos);
54081+ cinfo->first_moved = item_pos + 1;
54082+ break;
54083+
54084+ case CMODE_WHOLE | CMODE_HEAD:
54085+ /* one or more items get removed completely and one item gets cut partially from its head */
54086+ assert("vs-1568",
54087+ cinfo->first_removed == params->from->item_pos);
54088+ assert("vs-1564", cinfo->removed_count > 0
54089+ && cinfo->removed_count != MAX_POS_IN_NODE);
54090+ assert("vs-1569",
54091+ cinfo->head_removed ==
54092+ cinfo->first_removed + cinfo->removed_count);
54093+
54094+ /* call kill hook for all items removed completely */
54095+ if (is_cut == 0)
54096+ call_kill_hooks(node, cinfo->first_removed,
54097+ cinfo->removed_count, data);
54098+
54099+ item_pos = cinfo->first_removed;
54100+ ih = node40_ih_at(node, item_pos);
54101+
54102+ if (params->smallest_removed)
54103+ memcpy(params->smallest_removed, &ih->key,
54104+ sizeof(reiser4_key));
54105+
54106+ freed =
54107+ kill_head_f(params->to, data, NULL, &new_first_key);
54108+
54109+ cinfo->freed_space_start = ih40_get_offset(ih);
54110+
54111+ ih = node40_ih_at(node, cinfo->head_removed);
54112+			/* this is the most complex case. The item which got its head removed and the items which are to
54113+			   be moved intact change their locations differently. */
54114+ cinfo->freed_space_end = ih40_get_offset(ih) + freed;
54115+ cinfo->first_moved = cinfo->head_removed;
54116+ cinfo->head_removed_location = cinfo->freed_space_start;
54117+
54118+ /* item head is removed, therefore, item key changed */
54119+ coord.node = node;
54120+ coord_set_item_pos(&coord, cinfo->head_removed);
54121+ coord.unit_pos = 0;
54122+ coord.between = AT_UNIT;
54123+ update_item_key_node40(&coord, &new_first_key, NULL);
54124+
54125+ assert("vs-1579", cinfo->first_removed == 0);
54126+ /* key of first item of the node changes */
54127+ retval = 1;
54128+ break;
54129+
54130+ case CMODE_TAIL | CMODE_HEAD:
54131+			/* one item gets cut from its end and its neighbor gets cut from its head */
54132+ impossible("vs-1576", "this can not happen currently");
54133+ break;
54134+
54135+ case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
54136+ impossible("vs-1577", "this can not happen currently");
54137+ break;
54138+ default:
54139+ impossible("vs-1578", "unexpected cut mode");
54140+ break;
54141+ }
54142+ }
54143+ return retval;
54144+}
54145+
54146+/* plugin->u.node.kill
54147+ return value is number of items removed completely */
54148+int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
54149+{
54150+ znode *node;
54151+ struct cut40_info cinfo;
54152+ int first_key_changed;
54153+
54154+ node = kdata->params.from->node;
54155+
54156+ first_key_changed =
54157+ prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
54158+ info);
54159+ compact(node, &cinfo);
54160+
54161+ if (info) {
54162+ /* it is not called by node40_shift, so we have to take care
54163+ of changes on upper levels */
54164+ if (node_is_empty(node)
54165+ && !(kdata->flags & DELETE_RETAIN_EMPTY))
54166+			/* all contents of the node are deleted */
54167+ prepare_removal_node40(node, info);
54168+ else if (first_key_changed) {
54169+ prepare_for_update(NULL, node, info);
54170+ }
54171+ }
54172+
54173+ coord_clear_iplug(kdata->params.from);
54174+ coord_clear_iplug(kdata->params.to);
54175+
54176+ znode_make_dirty(node);
54177+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
54178+}
54179+
54180+/* plugin->u.node.cut
54181+ return value is number of items removed completely */
54182+int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
54183+{
54184+ znode *node;
54185+ struct cut40_info cinfo;
54186+ int first_key_changed;
54187+
54188+ node = cdata->params.from->node;
54189+
54190+ first_key_changed =
54191+	    prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
54192+ info);
54193+ compact(node, &cinfo);
54194+
54195+ if (info) {
54196+ /* it is not called by node40_shift, so we have to take care
54197+ of changes on upper levels */
54198+ if (node_is_empty(node))
54199+			/* all contents of the node are deleted */
54200+ prepare_removal_node40(node, info);
54201+ else if (first_key_changed) {
54202+ prepare_for_update(NULL, node, info);
54203+ }
54204+ }
54205+
54206+ coord_clear_iplug(cdata->params.from);
54207+ coord_clear_iplug(cdata->params.to);
54208+
54209+ znode_make_dirty(node);
54210+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
54211+}
54212+
54213+/* this structure is used by shift method of node40 plugin */
54214+struct shift_params {
54215+ shift_direction pend; /* when @pend == append - we are shifting to
54216+ left, when @pend == prepend - to right */
54217+ coord_t wish_stop; /* when shifting to left this is last unit we
54218+ want shifted, when shifting to right - this
54219+ is set to unit we want to start shifting
54220+ from */
54221+ znode *target;
54222+ int everything; /* it is set to 1 if everything we have to shift is
54223+ shifted, 0 - otherwise */
54224+
54225+ /* FIXME-VS: get rid of read_stop */
54226+
54227+ /* these are set by estimate_shift */
54228+ coord_t real_stop; /* this will be set to last unit which will be
54229+ really shifted */
54230+
54231+ /* coordinate in source node before operation of unit which becomes
54232+	   first after shift to left or last after shift to right */
54233+ union {
54234+ coord_t future_first;
54235+ coord_t future_last;
54236+ } u;
54237+
54238+ unsigned merging_units; /* number of units of first item which have to
54239+ be merged with last item of target node */
54240+ unsigned merging_bytes; /* number of bytes in those units */
54241+
54242+ unsigned entire; /* items shifted in their entirety */
54243+ unsigned entire_bytes; /* number of bytes in those items */
54244+
54245+ unsigned part_units; /* number of units of partially copied item */
54246+ unsigned part_bytes; /* number of bytes in those units */
54247+
54248+ unsigned shift_bytes; /* total number of bytes in items shifted (item
54249+ headers not included) */
54250+
54251+};
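+
+/* Example (illustrative): on a shift to the left where the last item of
+ * @target and the first item of the source node are mergeable,
+ * estimate_shift() might end up with, say,
+ *
+ *	merging_units = 3	(3 units appended to target's last item)
+ *	entire        = 2	(2 source items copied in their entirety)
+ *	part_units    = 4	(4 units of the next item form a new item)
+ *
+ * with shift_bytes == merging_bytes + entire_bytes + part_bytes, the
+ * invariant asserted in copy() ("vs-185"). */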
54252+
54253+static int item_creation_overhead(coord_t *item)
54254+{
54255+ return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
54256+}
54257+
54258+/* how many units are there in @source starting from source->unit_pos
54259+ but not further than @stop_coord */
54260+static int
54261+wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
54262+{
54263+ if (pend == SHIFT_LEFT) {
54264+ assert("vs-181", source->unit_pos == 0);
54265+ } else {
54266+ assert("vs-182",
54267+ source->unit_pos == coord_last_unit_pos(source));
54268+ }
54269+
54270+ if (source->item_pos != stop_coord->item_pos) {
54271+ /* @source and @stop_coord are different items */
54272+ return coord_last_unit_pos(source) + 1;
54273+ }
54274+
54275+ if (pend == SHIFT_LEFT) {
54276+ return stop_coord->unit_pos + 1;
54277+ } else {
54278+ return source->unit_pos - stop_coord->unit_pos + 1;
54279+ }
54280+}
54281+
54282+/* this calculates what can be copied from @shift->wish_stop.node to
54283+ @shift->target */
54284+static void
54285+estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
54286+{
54287+ unsigned target_free_space, size;
54288+ pos_in_node_t stop_item; /* item which estimating should not consider */
54289+ unsigned want; /* number of units of item we want shifted */
54290+ coord_t source; /* item being estimated */
54291+ item_plugin *iplug;
54292+
54293+ /* shifting to left/right starts from first/last units of
54294+ @shift->wish_stop.node */
54295+ if (shift->pend == SHIFT_LEFT) {
54296+ coord_init_first_unit(&source, shift->wish_stop.node);
54297+ } else {
54298+ coord_init_last_unit(&source, shift->wish_stop.node);
54299+ }
54300+ shift->real_stop = source;
54301+
54302+	/* free space in target node */
54303+ target_free_space = znode_free_space(shift->target);
54304+
54305+ shift->everything = 0;
54306+ if (!node_is_empty(shift->target)) {
54307+		/* target node is not empty, check boundary items for
54308+		   mergeability */
54309+ coord_t to;
54310+
54311+ /* item we try to merge @source with */
54312+ if (shift->pend == SHIFT_LEFT) {
54313+ coord_init_last_unit(&to, shift->target);
54314+ } else {
54315+ coord_init_first_unit(&to, shift->target);
54316+ }
54317+
54318+ if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
54319+ &source) :
54320+ are_items_mergeable(&source, &to)) {
54321+ /* how many units of @source do we want to merge to
54322+ item @to */
54323+ want =
54324+ wanted_units(&source, &shift->wish_stop,
54325+ shift->pend);
54326+
54327+ /* how many units of @source we can merge to item
54328+ @to */
54329+ iplug = item_plugin_by_coord(&source);
54330+ if (iplug->b.can_shift != NULL)
54331+ shift->merging_units =
54332+ iplug->b.can_shift(target_free_space,
54333+ &source, shift->target,
54334+ shift->pend, &size,
54335+ want);
54336+ else {
54337+ shift->merging_units = 0;
54338+ size = 0;
54339+ }
54340+ shift->merging_bytes = size;
54341+ shift->shift_bytes += size;
54342+ /* update stop coord to be set to last unit of @source
54343+ we can merge to @target */
54344+ if (shift->merging_units)
54345+ /* at least one unit can be shifted */
54346+ shift->real_stop.unit_pos =
54347+ (shift->merging_units - source.unit_pos -
54348+ 1) * shift->pend;
54349+ else {
54350+ /* nothing can be shifted */
54351+ if (shift->pend == SHIFT_LEFT)
54352+ coord_init_before_first_item(&shift->
54353+ real_stop,
54354+ source.
54355+ node);
54356+ else
54357+ coord_init_after_last_item(&shift->
54358+ real_stop,
54359+ source.node);
54360+ }
54361+ assert("nikita-2081", shift->real_stop.unit_pos + 1);
54362+
54363+ if (shift->merging_units != want) {
54364+				/* we could not copy as many as we wanted, so
54365+				   there is no reason to keep estimating */
54367+ return;
54368+ }
54369+
54370+ target_free_space -= size;
54371+ coord_add_item_pos(&source, shift->pend);
54372+ }
54373+ }
54374+
54375+	/* position of the item nothing of which we want to shift */
54376+ stop_item = shift->wish_stop.item_pos + shift->pend;
54377+
54378+	/* calculate how many items can be copied into the given free
54379+	   space in their entirety */
54380+ for (; source.item_pos != stop_item;
54381+ coord_add_item_pos(&source, shift->pend)) {
54382+ if (shift->pend == SHIFT_RIGHT)
54383+ source.unit_pos = coord_last_unit_pos(&source);
54384+
54385+ /* how many units of @source do we want to copy */
54386+ want = wanted_units(&source, &shift->wish_stop, shift->pend);
54387+
54388+ if (want == coord_last_unit_pos(&source) + 1) {
54389+ /* we want this item to be copied entirely */
54390+ size =
54391+ item_length_by_coord(&source) +
54392+ item_creation_overhead(&source);
54393+ if (size <= target_free_space) {
54394+ /* item fits into target node as whole */
54395+ target_free_space -= size;
54396+ shift->shift_bytes +=
54397+ size - item_creation_overhead(&source);
54398+ shift->entire_bytes +=
54399+ size - item_creation_overhead(&source);
54400+ shift->entire++;
54401+
54402+ /* update shift->real_stop coord to be set to
54403+ last unit of @source we can merge to
54404+ @target */
54405+ shift->real_stop = source;
54406+ if (shift->pend == SHIFT_LEFT)
54407+ shift->real_stop.unit_pos =
54408+ coord_last_unit_pos(&shift->
54409+ real_stop);
54410+ else
54411+ shift->real_stop.unit_pos = 0;
54412+ continue;
54413+ }
54414+ }
54415+
54416+		/* we reach here only for an item which does not fit into the
54417+		   target node in its entirety. This item may be either
54418+		   partially shifted, or not shifted at all. We will have to
54419+		   create a new item in the target node, so decrease the
54420+		   amount of free space by the item creation overhead. We can
54421+		   also reach here if the stop coord is in this item */
54422+ if (target_free_space >=
54423+ (unsigned)item_creation_overhead(&source)) {
54424+ target_free_space -= item_creation_overhead(&source);
54425+ iplug = item_plugin_by_coord(&source);
54426+ if (iplug->b.can_shift) {
54427+ shift->part_units = iplug->b.can_shift(target_free_space,
54428+ &source,
54429+ NULL, /* target */
54430+ shift->pend,
54431+ &size,
54432+ want);
54433+ } else {
54434+ target_free_space = 0;
54435+ shift->part_units = 0;
54436+ size = 0;
54437+ }
54438+ } else {
54439+ target_free_space = 0;
54440+ shift->part_units = 0;
54441+ size = 0;
54442+ }
54443+ shift->part_bytes = size;
54444+ shift->shift_bytes += size;
54445+
54446+ /* set @shift->real_stop to last unit of @source we can merge
54447+ to @shift->target */
54448+ if (shift->part_units) {
54449+ shift->real_stop = source;
54450+ shift->real_stop.unit_pos =
54451+ (shift->part_units - source.unit_pos -
54452+ 1) * shift->pend;
54453+ assert("nikita-2082", shift->real_stop.unit_pos + 1);
54454+ }
54455+
54456+ if (want != shift->part_units)
54457+			/* not everything wanted was shifted */
54458+ return;
54459+ break;
54460+ }
54461+
54462+ shift->everything = 1;
54463+}
54464+
54465+static void
54466+copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
54467+ shift_direction dir, unsigned free_space)
54468+{
54469+ item_plugin *iplug;
54470+
54471+ assert("nikita-1463", target != NULL);
54472+ assert("nikita-1464", source != NULL);
54473+ assert("nikita-1465", from + count <= coord_num_units(source));
54474+
54475+ iplug = item_plugin_by_coord(source);
54476+ assert("nikita-1468", iplug == item_plugin_by_coord(target));
54477+ iplug->b.copy_units(target, source, from, count, dir, free_space);
54478+
54479+ if (dir == SHIFT_RIGHT) {
54480+		/* FIXME-VS: this looks unnecessary. update_item_key was
54481+		   already called by the copy_units method */
54482+ reiser4_key split_key;
54483+
54484+ assert("nikita-1469", target->unit_pos == 0);
54485+
54486+ unit_key_by_coord(target, &split_key);
54487+ node_plugin_by_coord(target)->update_item_key(target,
54488+ &split_key, NULL);
54489+ }
54490+}
54491+
54492+/* copy part of @shift->real_stop.node starting either from its beginning or
54493+ from its end and ending at @shift->real_stop to either the end or the
54494+ beginning of @shift->target */
54495+static void copy(struct shift_params *shift)
54496+{
54497+ node40_header *nh;
54498+ coord_t from;
54499+ coord_t to;
54500+ item_header40 *from_ih, *to_ih;
54501+ int free_space_start;
54502+ int new_items;
54503+ unsigned old_items;
54504+ int old_offset;
54505+ unsigned i;
54506+
54507+ nh = node40_node_header(shift->target);
54508+ free_space_start = nh40_get_free_space_start(nh);
54509+ old_items = nh40_get_num_items(nh);
54510+ new_items = shift->entire + (shift->part_units ? 1 : 0);
54511+ assert("vs-185",
54512+ shift->shift_bytes ==
54513+ shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
54514+
54515+ from = shift->wish_stop;
54516+
54517+ coord_init_first_unit(&to, shift->target);
54518+
54519+ /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
54520+ hence to.between is set to EMPTY_NODE above. Looks like we want it
54521+ to be AT_UNIT.
54522+
54523+ Oh, wonders of ->betweeness...
54524+
54525+ */
54526+ to.between = AT_UNIT;
54527+
54528+ if (shift->pend == SHIFT_LEFT) {
54529+ /* copying to left */
54530+
54531+ coord_set_item_pos(&from, 0);
54532+ from_ih = node40_ih_at(from.node, 0);
54533+
54534+ coord_set_item_pos(&to,
54535+ node40_num_of_items_internal(to.node) - 1);
54536+ if (shift->merging_units) {
54537+ /* expand last item, so that plugin methods will see
54538+ correct data */
54539+ free_space_start += shift->merging_bytes;
54540+ nh40_set_free_space_start(nh,
54541+ (unsigned)free_space_start);
54542+ nh40_set_free_space(nh,
54543+ nh40_get_free_space(nh) -
54544+ shift->merging_bytes);
54545+
54546+ /* appending last item of @target */
54547+ copy_units(&to, &from, 0, /* starting from 0-th unit */
54548+ shift->merging_units, SHIFT_LEFT,
54549+ shift->merging_bytes);
54550+ coord_inc_item_pos(&from);
54551+ from_ih--;
54552+ coord_inc_item_pos(&to);
54553+ }
54554+
54555+ to_ih = node40_ih_at(shift->target, old_items);
54556+ if (shift->entire) {
54557+ /* copy @entire items entirely */
54558+
54559+ /* copy item headers */
54560+ memcpy(to_ih - shift->entire + 1,
54561+ from_ih - shift->entire + 1,
54562+ shift->entire * sizeof(item_header40));
54563+ /* update item header offset */
54564+ old_offset = ih40_get_offset(from_ih);
54565+			/* AUDIT: Looks like if we calculated old_offset + free_space_start here instead of just old_offset, we could perform one "add" operation fewer per iteration */
54566+ for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
54567+ ih40_set_offset(to_ih,
54568+ ih40_get_offset(from_ih) -
54569+ old_offset + free_space_start);
54570+
54571+ /* copy item bodies */
54572+ memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */
54573+ shift->entire_bytes);
54574+
54575+ coord_add_item_pos(&from, (int)shift->entire);
54576+ coord_add_item_pos(&to, (int)shift->entire);
54577+ }
54578+
54579+ nh40_set_free_space_start(nh,
54580+ free_space_start +
54581+ shift->shift_bytes -
54582+ shift->merging_bytes);
54583+ nh40_set_free_space(nh,
54584+ nh40_get_free_space(nh) -
54585+ (shift->shift_bytes - shift->merging_bytes +
54586+ sizeof(item_header40) * new_items));
54587+
54588+ /* update node header */
54589+ node40_set_num_items(shift->target, nh, old_items + new_items);
54590+ assert("vs-170",
54591+ nh40_get_free_space(nh) < znode_size(shift->target));
54592+
54593+ if (shift->part_units) {
54594+ /* copy heading part (@part units) of @source item as
54595+ a new item into @target->node */
54596+
54597+ /* copy item header of partially copied item */
54598+ coord_set_item_pos(&to,
54599+ node40_num_of_items_internal(to.node)
54600+ - 1);
54601+ memcpy(to_ih, from_ih, sizeof(item_header40));
54602+ ih40_set_offset(to_ih,
54603+ nh40_get_free_space_start(nh) -
54604+ shift->part_bytes);
54605+ if (item_plugin_by_coord(&to)->b.init)
54606+ item_plugin_by_coord(&to)->b.init(&to, &from,
54607+ NULL);
54608+ copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
54609+ shift->part_bytes);
54610+ }
54611+
54612+ } else {
54613+ /* copying to right */
54614+
54615+ coord_set_item_pos(&from,
54616+ node40_num_of_items_internal(from.node) - 1);
54617+ from_ih = node40_ih_at_coord(&from);
54618+
54619+ coord_set_item_pos(&to, 0);
54620+
54621+ /* prepare space for new items */
54622+ memmove(zdata(to.node) + sizeof(node40_header) +
54623+ shift->shift_bytes,
54624+ zdata(to.node) + sizeof(node40_header),
54625+ free_space_start - sizeof(node40_header));
54626+ /* update item headers of moved items */
54627+ to_ih = node40_ih_at(to.node, 0);
54628+ /* first item gets @merging_bytes longer. free space appears
54629+ at its beginning */
54630+ if (!node_is_empty(to.node))
54631+ ih40_set_offset(to_ih,
54632+ ih40_get_offset(to_ih) +
54633+ shift->shift_bytes -
54634+ shift->merging_bytes);
54635+
54636+ for (i = 1; i < old_items; i++)
54637+ ih40_set_offset(to_ih - i,
54638+ ih40_get_offset(to_ih - i) +
54639+ shift->shift_bytes);
54640+
54641+ /* move item headers to make space for new items */
54642+ memmove(to_ih - old_items + 1 - new_items,
54643+ to_ih - old_items + 1,
54644+ sizeof(item_header40) * old_items);
54645+ to_ih -= (new_items - 1);
54646+
54647+ nh40_set_free_space_start(nh,
54648+ free_space_start +
54649+ shift->shift_bytes);
54650+ nh40_set_free_space(nh,
54651+ nh40_get_free_space(nh) -
54652+ (shift->shift_bytes +
54653+ sizeof(item_header40) * new_items));
54654+
54655+ /* update node header */
54656+ node40_set_num_items(shift->target, nh, old_items + new_items);
54657+ assert("vs-170",
54658+ nh40_get_free_space(nh) < znode_size(shift->target));
54659+
54660+ if (shift->merging_units) {
54661+ coord_add_item_pos(&to, new_items);
54662+ to.unit_pos = 0;
54663+ to.between = AT_UNIT;
54664+ /* prepend first item of @to */
54665+ copy_units(&to, &from,
54666+ coord_last_unit_pos(&from) -
54667+ shift->merging_units + 1,
54668+ shift->merging_units, SHIFT_RIGHT,
54669+ shift->merging_bytes);
54670+ coord_dec_item_pos(&from);
54671+ from_ih++;
54672+ }
54673+
54674+ if (shift->entire) {
54675+ /* copy @entire items entirely */
54676+
54677+ /* copy item headers */
54678+ memcpy(to_ih, from_ih,
54679+ shift->entire * sizeof(item_header40));
54680+
54681+ /* update item header offset */
54682+ old_offset =
54683+ ih40_get_offset(from_ih + shift->entire - 1);
54684+			/* AUDIT: the old_offset + sizeof (node40_header) + shift->part_bytes calculation could be hoisted out of the loop. */
54685+ for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
54686+ ih40_set_offset(to_ih,
54687+ ih40_get_offset(from_ih) -
54688+ old_offset +
54689+ sizeof(node40_header) +
54690+ shift->part_bytes);
54691+ /* copy item bodies */
54692+ coord_add_item_pos(&from, -(int)(shift->entire - 1));
54693+ memcpy(zdata(to.node) + sizeof(node40_header) +
54694+ shift->part_bytes, item_by_coord_node40(&from),
54695+ shift->entire_bytes);
54696+ coord_dec_item_pos(&from);
54697+ }
54698+
54699+ if (shift->part_units) {
54700+ coord_set_item_pos(&to, 0);
54701+ to.unit_pos = 0;
54702+ to.between = AT_UNIT;
54703+ /* copy heading part (@part units) of @source item as
54704+ a new item into @target->node */
54705+
54706+ /* copy item header of partially copied item */
54707+ memcpy(to_ih, from_ih, sizeof(item_header40));
54708+ ih40_set_offset(to_ih, sizeof(node40_header));
54709+ if (item_plugin_by_coord(&to)->b.init)
54710+ item_plugin_by_coord(&to)->b.init(&to, &from,
54711+ NULL);
54712+ copy_units(&to, &from,
54713+ coord_last_unit_pos(&from) -
54714+ shift->part_units + 1, shift->part_units,
54715+ SHIFT_RIGHT, shift->part_bytes);
54716+ }
54717+ }
54718+}
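+
+/* Illustration (schematic) of the SHIFT_RIGHT branch above: before the new
+ * item bodies are copied in, the existing bodies of @target are slid right
+ * by shift_bytes to open a gap just after the node header, and the existing
+ * item headers are slid towards lower addresses by new_items slots:
+ *
+ *	| nh | gap (shift_bytes) | old bodies | free | old ihs | new ihs |
+ *
+ * the copied bodies then land in the gap and the copied headers in the
+ * freed slots at the very end of the block (items 0..new_items-1). */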
54719+
54720+/* remove everything either before or after @shift->real_stop. Number of
54721+   items removed completely is returned */
54722+static int delete_copied(struct shift_params *shift)
54723+{
54724+ coord_t from;
54725+ coord_t to;
54726+ struct carry_cut_data cdata;
54727+
54728+ if (shift->pend == SHIFT_LEFT) {
54729+		/* we were shifting to left, remove everything from the
54730+		   beginning of @shift->wish_stop.node up to
54731+		   @shift->wish_stop */
54732+ coord_init_first_unit(&from, shift->real_stop.node);
54733+ to = shift->real_stop;
54734+
54735+ /* store old coordinate of unit which will be first after
54736+ shift to left */
54737+ shift->u.future_first = to;
54738+ coord_next_unit(&shift->u.future_first);
54739+ } else {
54740+		/* we were shifting to right, remove everything from
54741+		   @shift->real_stop up to the end of
54742+		   @shift->real_stop.node */
54743+ from = shift->real_stop;
54744+ coord_init_last_unit(&to, from.node);
54745+
54746+ /* store old coordinate of unit which will be last after
54747+ shift to right */
54748+ shift->u.future_last = from;
54749+ coord_prev_unit(&shift->u.future_last);
54750+ }
54751+
54752+ cdata.params.from = &from;
54753+ cdata.params.to = &to;
54754+ cdata.params.from_key = NULL;
54755+ cdata.params.to_key = NULL;
54756+ cdata.params.smallest_removed = NULL;
54757+ return cut_node40(&cdata, NULL);
54758+}
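+
+/* Together, copy() and delete_copied() implement the shift proper: units
+ * are first duplicated into @shift->target and then cut from the source
+ * node through the regular cut_node40() path, reusing its compaction
+ * logic. */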
54759+
54760+/* something was moved between @left and @right. Add a carry operation to the
54761+   @info list to have carry update the delimiting key between them */
54762+static int
54763+prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
54764+{
54765+ carry_op *op;
54766+ carry_node *cn;
54767+
54768+ if (info == NULL)
54769+ /* nowhere to send operation to. */
54770+ return 0;
54771+
54772+ if (!should_notify_parent(right))
54773+ return 0;
54774+
54775+ op = node_post_carry(info, COP_UPDATE, right, 1);
54776+ if (IS_ERR(op) || op == NULL)
54777+ return op ? PTR_ERR(op) : -EIO;
54778+
54779+ if (left != NULL) {
54780+ carry_node *reference;
54781+
54782+ if (info->doing)
54783+ reference = insert_carry_node(info->doing,
54784+ info->todo, left);
54785+ else
54786+ reference = op->node;
54787+ assert("nikita-2992", reference != NULL);
54788+ cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference);
54789+ if (IS_ERR(cn))
54790+ return PTR_ERR(cn);
54791+ cn->parent = 1;
54792+ cn->node = left;
54793+ if (ZF_ISSET(left, JNODE_ORPHAN))
54794+ cn->left_before = 1;
54795+ op->u.update.left = cn;
54796+ } else
54797+ op->u.update.left = NULL;
54798+ return 0;
54799+}
54800+
54801+/* plugin->u.node.prepare_removal
54802+ to delete a pointer to @empty from the tree add corresponding carry
54803+ operation (delete) to @info list */
54804+int prepare_removal_node40(znode * empty, carry_plugin_info * info)
54805+{
54806+ carry_op *op;
54807+ reiser4_tree *tree;
54808+
54809+ if (!should_notify_parent(empty))
54810+ return 0;
54811+ /* already on a road to Styx */
54812+ if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
54813+ return 0;
54814+ op = node_post_carry(info, COP_DELETE, empty, 1);
54815+ if (IS_ERR(op) || op == NULL)
54816+ return RETERR(op ? PTR_ERR(op) : -EIO);
54817+
54818+ op->u.delete.child = NULL;
54819+ op->u.delete.flags = 0;
54820+
54821+ /* fare thee well */
54822+ tree = znode_get_tree(empty);
54823+ read_lock_tree(tree);
54824+ write_lock_dk(tree);
54825+ znode_set_ld_key(empty, znode_get_rd_key(empty));
54826+ if (znode_is_left_connected(empty) && empty->left)
54827+ znode_set_rd_key(empty->left, znode_get_rd_key(empty));
54828+ write_unlock_dk(tree);
54829+ read_unlock_tree(tree);
54830+
54831+ ZF_SET(empty, JNODE_HEARD_BANSHEE);
54832+ return 0;
54833+}
54834+
54835+/* something was shifted from @insert_coord->node to @shift->target; update
54836+   @insert_coord correspondingly */
54837+static void
54838+adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
54839+ int including_insert_coord)
54840+{
54841+ /* item plugin was invalidated by shifting */
54842+ coord_clear_iplug(insert_coord);
54843+
54844+ if (node_is_empty(shift->wish_stop.node)) {
54845+ assert("vs-242", shift->everything);
54846+ if (including_insert_coord) {
54847+ if (shift->pend == SHIFT_RIGHT) {
54848+ /* set @insert_coord before first unit of
54849+ @shift->target node */
54850+ coord_init_before_first_item(insert_coord,
54851+ shift->target);
54852+ } else {
54853+ /* set @insert_coord after last in target node */
54854+ coord_init_after_last_item(insert_coord,
54855+ shift->target);
54856+ }
54857+ } else {
54858+ /* set @insert_coord inside of empty node. There is
54859+ only one possible coord within an empty
54860+ node. init_first_unit will set that coord */
54861+ coord_init_first_unit(insert_coord,
54862+ shift->wish_stop.node);
54863+ }
54864+ return;
54865+ }
54866+
54867+ if (shift->pend == SHIFT_RIGHT) {
54868+ /* there was shifting to right */
54869+ if (shift->everything) {
54870+ /* everything wanted was shifted */
54871+ if (including_insert_coord) {
54872+ /* @insert_coord is set before first unit of
54873+ @to node */
54874+ coord_init_before_first_item(insert_coord,
54875+ shift->target);
54876+ insert_coord->between = BEFORE_UNIT;
54877+ } else {
54878+ /* @insert_coord is set after last unit of
54879+ @insert->node */
54880+ coord_init_last_unit(insert_coord,
54881+ shift->wish_stop.node);
54882+ insert_coord->between = AFTER_UNIT;
54883+ }
54884+ }
54885+ return;
54886+ }
54887+
54888+ /* there was shifting to left */
54889+ if (shift->everything) {
54890+ /* everything wanted was shifted */
54891+ if (including_insert_coord) {
54892+ /* @insert_coord is set after last unit in @to node */
54893+ coord_init_after_last_item(insert_coord, shift->target);
54894+ } else {
54895+ /* @insert_coord is set before first unit in the same
54896+ node */
54897+ coord_init_before_first_item(insert_coord,
54898+ shift->wish_stop.node);
54899+ }
54900+ return;
54901+ }
54902+
54903+ /* FIXME-VS: the code below is complicated because with between ==
54904+ AFTER_ITEM unit_pos is set to 0 */
54905+
54906+ if (!removed) {
54907+ /* no items were shifted entirely */
54908+ assert("vs-195", shift->merging_units == 0
54909+ || shift->part_units == 0);
54910+
54911+ if (shift->real_stop.item_pos == insert_coord->item_pos) {
54912+ if (shift->merging_units) {
54913+ if (insert_coord->between == AFTER_UNIT) {
54914+ assert("nikita-1441",
54915+ insert_coord->unit_pos >=
54916+ shift->merging_units);
54917+ insert_coord->unit_pos -=
54918+ shift->merging_units;
54919+ } else if (insert_coord->between == BEFORE_UNIT) {
54920+ assert("nikita-2090",
54921+ insert_coord->unit_pos >
54922+ shift->merging_units);
54923+ insert_coord->unit_pos -=
54924+ shift->merging_units;
54925+ }
54926+
54927+ assert("nikita-2083",
54928+ insert_coord->unit_pos + 1);
54929+ } else {
54930+ if (insert_coord->between == AFTER_UNIT) {
54931+ assert("nikita-1442",
54932+ insert_coord->unit_pos >=
54933+ shift->part_units);
54934+ insert_coord->unit_pos -=
54935+ shift->part_units;
54936+ } else if (insert_coord->between == BEFORE_UNIT) {
54937+ assert("nikita-2089",
54938+ insert_coord->unit_pos >
54939+ shift->part_units);
54940+ insert_coord->unit_pos -=
54941+ shift->part_units;
54942+ }
54943+
54944+ assert("nikita-2084",
54945+ insert_coord->unit_pos + 1);
54946+ }
54947+ }
54948+ return;
54949+ }
54950+
54951+	/* we shifted to left and there was not enough space for everything */
54952+ switch (insert_coord->between) {
54953+ case AFTER_UNIT:
54954+ case BEFORE_UNIT:
54955+ if (shift->real_stop.item_pos == insert_coord->item_pos)
54956+ insert_coord->unit_pos -= shift->part_units;
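+			/* fall through: item_pos must be adjusted as well */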
54957+ case AFTER_ITEM:
54958+ coord_add_item_pos(insert_coord, -removed);
54959+ break;
54960+ default:
54961+ impossible("nikita-2087", "not ready");
54962+ }
54963+ assert("nikita-2085", insert_coord->unit_pos + 1);
54964+}
54965+
54966+static int call_shift_hooks(struct shift_params *shift)
54967+{
54968+ unsigned i, shifted;
54969+ coord_t coord;
54970+ item_plugin *iplug;
54971+
54972+ assert("vs-275", !node_is_empty(shift->target));
54973+
54974+ /* number of items shift touches */
54975+ shifted =
54976+ shift->entire + (shift->merging_units ? 1 : 0) +
54977+ (shift->part_units ? 1 : 0);
54978+
54979+ if (shift->pend == SHIFT_LEFT) {
54980+ /* moved items are at the end */
54981+ coord_init_last_unit(&coord, shift->target);
54982+ coord.unit_pos = 0;
54983+
54984+ assert("vs-279", shift->pend == 1);
54985+ for (i = 0; i < shifted; i++) {
54986+ unsigned from, count;
54987+
54988+ iplug = item_plugin_by_coord(&coord);
54989+ if (i == 0 && shift->part_units) {
54990+ assert("vs-277",
54991+ coord_num_units(&coord) ==
54992+ shift->part_units);
54993+ count = shift->part_units;
54994+ from = 0;
54995+ } else if (i == shifted - 1 && shift->merging_units) {
54996+ count = shift->merging_units;
54997+ from = coord_num_units(&coord) - count;
54998+ } else {
54999+ count = coord_num_units(&coord);
55000+ from = 0;
55001+ }
55002+
55003+ if (iplug->b.shift_hook) {
55004+ iplug->b.shift_hook(&coord, from, count,
55005+ shift->wish_stop.node);
55006+ }
55007+ coord_add_item_pos(&coord, -shift->pend);
55008+ }
55009+ } else {
55010+ /* moved items are at the beginning */
55011+ coord_init_first_unit(&coord, shift->target);
55012+
55013+ assert("vs-278", shift->pend == -1);
55014+ for (i = 0; i < shifted; i++) {
55015+ unsigned from, count;
55016+
55017+ iplug = item_plugin_by_coord(&coord);
55018+ if (i == 0 && shift->part_units) {
55019+ assert("vs-277",
55020+ coord_num_units(&coord) ==
55021+ shift->part_units);
55022+ count = coord_num_units(&coord);
55023+ from = 0;
55024+ } else if (i == shifted - 1 && shift->merging_units) {
55025+ count = shift->merging_units;
55026+ from = 0;
55027+ } else {
55028+ count = coord_num_units(&coord);
55029+ from = 0;
55030+ }
55031+
55032+ if (iplug->b.shift_hook) {
55033+ iplug->b.shift_hook(&coord, from, count,
55034+ shift->wish_stop.node);
55035+ }
55036+ coord_add_item_pos(&coord, -shift->pend);
55037+ }
55038+ }
55039+
55040+ return 0;
55041+}
55042+
55043+/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */
55044+static int
55045+unit_moved_left(const struct shift_params *shift, const coord_t * old)
55046+{
55047+ assert("vs-944", shift->real_stop.node == old->node);
55048+
55049+ if (shift->real_stop.item_pos < old->item_pos)
55050+ return 0;
55051+ if (shift->real_stop.item_pos == old->item_pos) {
55052+ if (shift->real_stop.unit_pos < old->unit_pos)
55053+ return 0;
55054+ }
55055+ return 1;
55056+}
55057+
55058+/* shift to right is completed. Return 1 if unit @old was moved to right
55059+ neighbor */
55060+static int
55061+unit_moved_right(const struct shift_params *shift, const coord_t * old)
55062+{
55063+ assert("vs-944", shift->real_stop.node == old->node);
55064+
55065+ if (shift->real_stop.item_pos > old->item_pos)
55066+ return 0;
55067+ if (shift->real_stop.item_pos == old->item_pos) {
55068+ if (shift->real_stop.unit_pos > old->unit_pos)
55069+ return 0;
55070+ }
55071+ return 1;
55072+}
55073+
55074+/* coord @old was set in the node from which the shift was performed. What was
55075+   shifted is stored in @shift. Update @old according to the performed shift */
55076+static coord_t *adjust_coord2(const struct shift_params *shift,
55077+ const coord_t * old, coord_t * new)
55078+{
55079+ coord_clear_iplug(new);
55080+ new->between = old->between;
55081+
55082+ coord_clear_iplug(new);
55083+ if (old->node == shift->target) {
55084+ if (shift->pend == SHIFT_LEFT) {
55085+ /* coord which is set inside of left neighbor does not
55086+ change during shift to left */
55087+ coord_dup(new, old);
55088+ return new;
55089+ }
55090+ new->node = old->node;
55091+ coord_set_item_pos(new,
55092+ old->item_pos + shift->entire +
55093+ (shift->part_units ? 1 : 0));
55094+ new->unit_pos = old->unit_pos;
55095+ if (old->item_pos == 0 && shift->merging_units)
55096+ new->unit_pos += shift->merging_units;
55097+ return new;
55098+ }
55099+
55100+ assert("vs-977", old->node == shift->wish_stop.node);
55101+ if (shift->pend == SHIFT_LEFT) {
55102+ if (unit_moved_left(shift, old)) {
55103+ /* unit @old moved to left neighbor. Calculate its
55104+ coordinate there */
55105+ new->node = shift->target;
55106+ coord_set_item_pos(new,
55107+ node_num_items(shift->target) -
55108+ shift->entire -
55109+ (shift->part_units ? 1 : 0) +
55110+ old->item_pos);
55111+
55112+ new->unit_pos = old->unit_pos;
55113+ if (shift->merging_units) {
55114+ coord_dec_item_pos(new);
55115+ if (old->item_pos == 0) {
55116+ /* unit_pos only changes if item got
55117+ merged */
55118+ new->unit_pos =
55119+ coord_num_units(new) -
55120+ (shift->merging_units -
55121+ old->unit_pos);
55122+ }
55123+ }
55124+ } else {
55125+ /* unit @old did not move to left neighbor.
55126+
55127+ Use _nocheck, because @old is outside of its node.
55128+ */
55129+ coord_dup_nocheck(new, old);
55130+ coord_add_item_pos(new,
55131+ -shift->u.future_first.item_pos);
55132+ if (new->item_pos == 0)
55133+ new->unit_pos -= shift->u.future_first.unit_pos;
55134+ }
55135+ } else {
55136+ if (unit_moved_right(shift, old)) {
55137+ /* unit @old moved to right neighbor */
55138+ new->node = shift->target;
55139+ coord_set_item_pos(new,
55140+ old->item_pos -
55141+ shift->real_stop.item_pos);
55142+ if (new->item_pos == 0) {
55143+ /* unit @old might change unit pos */
55144+				new->unit_pos = old->unit_pos -
55145+						shift->real_stop.unit_pos;
55147+ }
55148+ } else {
55149+ /* unit @old did not move to right neighbor, therefore
55150+ it did not change */
55151+ coord_dup(new, old);
55152+ }
55153+ }
55154+ coord_set_iplug(new, item_plugin_by_coord(new));
55155+ return new;
55156+}
55157+
55158+/* this is called when shift is completed (something of source node is copied
55159+ to target and deleted in source) to update all taps set in current
55160+ context */
55161+static void update_taps(const struct shift_params *shift)
55162+{
55163+ tap_t *tap;
55164+ coord_t new;
55165+
55166+ for_all_taps(tap) {
55167+ /* update only taps set to nodes participating in shift */
55168+ if (tap->coord->node == shift->wish_stop.node
55169+ || tap->coord->node == shift->target)
55170+ tap_to_coord(tap,
55171+ adjust_coord2(shift, tap->coord, &new));
55172+ }
55173+}
55174+
55175+#if REISER4_DEBUG
55176+
55177+struct shift_check {
55178+ reiser4_key key;
55179+ __u16 plugin_id;
55180+ union {
55181+ __u64 bytes;
55182+ __u64 entries;
55183+ void *unused;
55184+ } u;
55185+};
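+
+/* Illustration: shift_check_prepare() records one (key, plugin_id, size)
+ * triple per item of the would-be merged sequence of items of @left and
+ * @right; shift_check() re-walks both nodes after the shift and asserts
+ * that exactly the same sequence is still present, i.e. that shifting
+ * moved bytes around without creating, losing or resizing any logical
+ * item. */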
55186+
55187+void *shift_check_prepare(const znode * left, const znode * right)
55188+{
55189+ pos_in_node_t i, nr_items;
55190+ int mergeable;
55191+ struct shift_check *data;
55192+ item_header40 *ih;
55193+
55194+ if (node_is_empty(left) || node_is_empty(right))
55195+ mergeable = 0;
55196+ else {
55197+ coord_t l, r;
55198+
55199+ coord_init_last_unit(&l, left);
55200+ coord_init_first_unit(&r, right);
55201+ mergeable = are_items_mergeable(&l, &r);
55202+ }
55203+ nr_items =
55204+ node40_num_of_items_internal(left) +
55205+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
55206+ data =
55207+ kmalloc(sizeof(struct shift_check) * nr_items,
55208+ reiser4_ctx_gfp_mask_get());
55209+ if (data != NULL) {
55210+ coord_t coord;
55211+ pos_in_node_t item_pos;
55212+
55213+ coord_init_first_unit(&coord, left);
55214+ i = 0;
55215+
55216+ for (item_pos = 0;
55217+ item_pos < node40_num_of_items_internal(left);
55218+ item_pos++) {
55219+
55220+ coord_set_item_pos(&coord, item_pos);
55221+ ih = node40_ih_at_coord(&coord);
55222+
55223+ data[i].key = ih->key;
55224+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
55225+ switch (data[i].plugin_id) {
55226+ case CTAIL_ID:
55227+ case FORMATTING_ID:
55228+ data[i].u.bytes = coord_num_units(&coord);
55229+ break;
55230+ case EXTENT_POINTER_ID:
55231+ data[i].u.bytes =
55232+ reiser4_extent_size(&coord,
55233+ coord_num_units(&coord));
55234+ break;
55235+ case COMPOUND_DIR_ID:
55236+ data[i].u.entries = coord_num_units(&coord);
55237+ break;
55238+ default:
55239+ data[i].u.unused = NULL;
55240+ break;
55241+ }
55242+ i++;
55243+ }
55244+
55245+ coord_init_first_unit(&coord, right);
55246+
55247+ if (mergeable) {
55248+ assert("vs-1609", i != 0);
55249+
55250+ ih = node40_ih_at_coord(&coord);
55251+
55252+ assert("vs-1589",
55253+ data[i - 1].plugin_id ==
55254+ le16_to_cpu(get_unaligned(&ih->plugin_id)));
55255+ switch (data[i - 1].plugin_id) {
55256+ case CTAIL_ID:
55257+ case FORMATTING_ID:
55258+ data[i - 1].u.bytes += coord_num_units(&coord);
55259+ break;
55260+ case EXTENT_POINTER_ID:
55261+ data[i - 1].u.bytes +=
55262+ reiser4_extent_size(&coord,
55263+ coord_num_units(&coord));
55264+ break;
55265+ case COMPOUND_DIR_ID:
55266+ data[i - 1].u.entries +=
55267+ coord_num_units(&coord);
55268+ break;
55269+ default:
55270+ impossible("vs-1605", "wrong mergeable item");
55271+ break;
55272+ }
55273+ item_pos = 1;
55274+ } else
55275+ item_pos = 0;
55276+ for (; item_pos < node40_num_of_items_internal(right);
55277+ item_pos++) {
55278+
55279+ assert("vs-1604", i < nr_items);
55280+ coord_set_item_pos(&coord, item_pos);
55281+ ih = node40_ih_at_coord(&coord);
55282+
55283+ data[i].key = ih->key;
55284+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
55285+ switch (data[i].plugin_id) {
55286+ case CTAIL_ID:
55287+ case FORMATTING_ID:
55288+ data[i].u.bytes = coord_num_units(&coord);
55289+ break;
55290+ case EXTENT_POINTER_ID:
55291+ data[i].u.bytes =
55292+ reiser4_extent_size(&coord,
55293+ coord_num_units(&coord));
55294+ break;
55295+ case COMPOUND_DIR_ID:
55296+ data[i].u.entries = coord_num_units(&coord);
55297+ break;
55298+ default:
55299+ data[i].u.unused = NULL;
55300+ break;
55301+ }
55302+ i++;
55303+ }
55304+ assert("vs-1606", i == nr_items);
55305+ }
55306+ return data;
55307+}
55308+
55309+void shift_check(void *vp, const znode * left, const znode * right)
55310+{
55311+ pos_in_node_t i, nr_items;
55312+ coord_t coord;
55313+ __u64 last_bytes;
55314+ int mergeable;
55315+ item_header40 *ih;
55316+ pos_in_node_t item_pos;
55317+ struct shift_check *data;
55318+
55319+ data = (struct shift_check *)vp;
55320+
55321+ if (data == NULL)
55322+ return;
55323+
55324+ if (node_is_empty(left) || node_is_empty(right))
55325+ mergeable = 0;
55326+ else {
55327+ coord_t l, r;
55328+
55329+ coord_init_last_unit(&l, left);
55330+ coord_init_first_unit(&r, right);
55331+ mergeable = are_items_mergeable(&l, &r);
55332+ }
55333+
55334+ nr_items =
55335+ node40_num_of_items_internal(left) +
55336+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
55337+
55338+ i = 0;
55339+ last_bytes = 0;
55340+
55341+ coord_init_first_unit(&coord, left);
55342+
55343+ for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
55344+ item_pos++) {
55345+
55346+ coord_set_item_pos(&coord, item_pos);
55347+ ih = node40_ih_at_coord(&coord);
55348+
55349+ assert("vs-1611", i == item_pos);
55350+ assert("vs-1590", keyeq(&ih->key, &data[i].key));
55351+ assert("vs-1591",
55352+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
55353+ if ((i < (node40_num_of_items_internal(left) - 1))
55354+ || !mergeable) {
55355+ switch (data[i].plugin_id) {
55356+ case CTAIL_ID:
55357+ case FORMATTING_ID:
55358+ assert("vs-1592",
55359+ data[i].u.bytes ==
55360+ coord_num_units(&coord));
55361+ break;
55362+ case EXTENT_POINTER_ID:
55363+ assert("vs-1593",
55364+ data[i].u.bytes ==
55365+ reiser4_extent_size(&coord,
55366+ coord_num_units
55367+ (&coord)));
55368+ break;
55369+ case COMPOUND_DIR_ID:
55370+ assert("vs-1594",
55371+ data[i].u.entries ==
55372+ coord_num_units(&coord));
55373+ break;
55374+ default:
55375+ break;
55376+ }
55377+ }
55378+ if (item_pos == (node40_num_of_items_internal(left) - 1)
55379+ && mergeable) {
55380+ switch (data[i].plugin_id) {
55381+ case CTAIL_ID:
55382+ case FORMATTING_ID:
55383+ last_bytes = coord_num_units(&coord);
55384+ break;
55385+ case EXTENT_POINTER_ID:
55386+ last_bytes =
55387+ reiser4_extent_size(&coord,
55388+ coord_num_units(&coord));
55389+ break;
55390+ case COMPOUND_DIR_ID:
55391+ last_bytes = coord_num_units(&coord);
55392+ break;
55393+ default:
55394+ impossible("vs-1595", "wrong mergeable item");
55395+ break;
55396+ }
55397+ }
55398+ i++;
55399+ }
55400+
55401+ coord_init_first_unit(&coord, right);
55402+ if (mergeable) {
55403+ ih = node40_ih_at_coord(&coord);
55404+
55405+ assert("vs-1589",
55406+ data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
55407+ assert("vs-1608", last_bytes != 0);
55408+ switch (data[i - 1].plugin_id) {
55409+ case CTAIL_ID:
55410+ case FORMATTING_ID:
55411+ assert("vs-1596",
55412+ data[i - 1].u.bytes ==
55413+ last_bytes + coord_num_units(&coord));
55414+ break;
55415+
55416+ case EXTENT_POINTER_ID:
55417+ assert("vs-1597",
55418+ data[i - 1].u.bytes ==
55419+ last_bytes + reiser4_extent_size(&coord,
55420+ coord_num_units
55421+ (&coord)));
55422+ break;
55423+
55424+ case COMPOUND_DIR_ID:
55425+ assert("vs-1598",
55426+ data[i - 1].u.bytes ==
55427+ last_bytes + coord_num_units(&coord));
55428+ break;
55429+ default:
55430+ impossible("vs-1599", "wrong mergeable item");
55431+ break;
55432+ }
55433+ item_pos = 1;
55434+ } else
55435+ item_pos = 0;
55436+
55437+ for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
55438+
55439+ coord_set_item_pos(&coord, item_pos);
55440+ ih = node40_ih_at_coord(&coord);
55441+
55442+ assert("vs-1612", keyeq(&ih->key, &data[i].key));
55443+ assert("vs-1613",
55444+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
55445+ switch (data[i].plugin_id) {
55446+ case CTAIL_ID:
55447+ case FORMATTING_ID:
55448+ assert("vs-1600",
55449+ data[i].u.bytes == coord_num_units(&coord));
55450+ break;
55451+ case EXTENT_POINTER_ID:
55452+ assert("vs-1601",
55453+ data[i].u.bytes ==
55454+ reiser4_extent_size(&coord,
55455+ coord_num_units
55456+ (&coord)));
55457+ break;
55458+ case COMPOUND_DIR_ID:
55459+ assert("vs-1602",
55460+ data[i].u.entries == coord_num_units(&coord));
55461+ break;
55462+ default:
55463+ break;
55464+ }
55465+ i++;
55466+ }
55467+
55468+ assert("vs-1603", i == nr_items);
55469+ kfree(data);
55470+}
55471+
55472+#endif
55473+
55474+/* plugin->u.node.shift
55475+ look for description of this method in plugin/node/node.h */
55476+int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child,	/* if this is set to 1, @from->node will be
55477+						   deleted from the tree when it becomes empty */
55478+ int including_stop_coord, carry_plugin_info * info)
55479+{
55480+ struct shift_params shift;
55481+ int result;
55482+ znode *left, *right;
55483+ znode *source;
55484+ int target_empty;
55485+
55486+ assert("nikita-2161", coord_check(from));
55487+
55488+ memset(&shift, 0, sizeof(shift));
55489+ shift.pend = pend;
55490+ shift.wish_stop = *from;
55491+ shift.target = to;
55492+
55493+ assert("nikita-1473", znode_is_write_locked(from->node));
55494+ assert("nikita-1474", znode_is_write_locked(to));
55495+
55496+ source = from->node;
55497+
55498+ /* set @shift.wish_stop to rightmost/leftmost unit among units we want
55499+ shifted */
55500+ if (pend == SHIFT_LEFT) {
55501+ result = coord_set_to_left(&shift.wish_stop);
55502+ left = to;
55503+ right = from->node;
55504+ } else {
55505+ result = coord_set_to_right(&shift.wish_stop);
55506+ left = from->node;
55507+ right = to;
55508+ }
55509+
55510+ if (result) {
55511+ /* move insertion coord even if there is nothing to move */
55512+ if (including_stop_coord) {
55513+ /* move insertion coord (@from) */
55514+ if (pend == SHIFT_LEFT) {
55515+ /* after last item in target node */
55516+ coord_init_after_last_item(from, to);
55517+ } else {
55518+ /* before first item in target node */
55519+ coord_init_before_first_item(from, to);
55520+ }
55521+ }
55522+
55523+ if (delete_child && node_is_empty(shift.wish_stop.node))
55524+ result =
55525+ prepare_removal_node40(shift.wish_stop.node, info);
55526+ else
55527+ result = 0;
55528+ /* there is nothing to shift */
55529+ assert("nikita-2078", coord_check(from));
55530+ return result;
55531+ }
55532+
55533+ target_empty = node_is_empty(to);
55534+
55535+	/* when the first node plugin with item body compression is implemented,
55536+	   this must be changed to call a node-specific plugin */
55537+
55538+	/* @shift.stop_coord is updated to the last unit that will actually
55539+	   be shifted */
55540+ estimate_shift(&shift, get_current_context());
55541+ if (!shift.shift_bytes) {
55542+ /* we could not shift anything */
55543+ assert("nikita-2079", coord_check(from));
55544+ return 0;
55545+ }
55546+
55547+ copy(&shift);
55548+
55549+	/* the return value matters: it is used by adjust_coord() below */
55550+ result = delete_copied(&shift);
55551+
55552+ assert("vs-1610", result >= 0);
55553+ assert("vs-1471",
55554+ ((reiser4_context *) current->journal_info)->magic ==
55555+ context_magic);
55556+
55557+	/* an item which has been moved from one node to another might want to
55558+	   react to that event. This can be done by the item's shift_hook
55559+	   method, which is now called for every moved item */
55560+ call_shift_hooks(&shift);
55561+
55562+ assert("vs-1472",
55563+ ((reiser4_context *) current->journal_info)->magic ==
55564+ context_magic);
55565+
55566+ update_taps(&shift);
55567+
55568+ assert("vs-1473",
55569+ ((reiser4_context *) current->journal_info)->magic ==
55570+ context_magic);
55571+
55572+	/* adjust the @from pointer in accordance with the @including_stop_coord
55573+	   flag and the amount of data actually shifted */
55574+ adjust_coord(from, &shift, result, including_stop_coord);
55575+
55576+ if (target_empty)
55577+ /*
55578+ * items were shifted into empty node. Update delimiting key.
55579+ */
55580+ result = prepare_for_update(NULL, left, info);
55581+
55582+ /* add update operation to @info, which is the list of operations to
55583+ be performed on a higher level */
55584+ result = prepare_for_update(left, right, info);
55585+ if (!result && node_is_empty(source) && delete_child) {
55586+		/* the entire contents of @from->node have been moved to @to
55587+		   and @from->node has to be removed from the tree, so on the
55588+		   higher level we will be removing the pointer to @from->node */
55589+ result = prepare_removal_node40(source, info);
55590+ }
55591+ assert("nikita-2080", coord_check(from));
55592+ return result ? result : (int)shift.shift_bytes;
55593+}
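For illustration, a minimal hedged sketch of how the REISER4_DEBUG pair
shift_check_prepare()/shift_check() defined above is meant to bracket a call
to shift_node40(): the first snapshots the items of both nodes, the second
re-verifies them against the post-shift state and kfree()s the snapshot. The
wrapper name is hypothetical:

	#if REISER4_DEBUG
	/* hypothetical wrapper: run shift_node40() with before/after
	   verification of item keys, plugin ids and sizes */
	static int checked_shift(coord_t *from, znode *left, znode *right,
				 znode *to, shift_direction pend,
				 carry_plugin_info *info)
	{
		void *check_data;
		int result;

		/* snapshot both nodes; may be NULL on allocation failure,
		   which shift_check() tolerates */
		check_data = shift_check_prepare(left, right);
		result = shift_node40(from, to, pend, 1 /* delete_child */,
				      0 /* including_stop_coord */, info);
		shift_check(check_data, left, right); /* asserts and frees */
		return result;
	}
	#endif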
55594+
55595+/* plugin->u.node.fast_insert()
55596+ look for description of this method in plugin/node/node.h */
55597+int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55598+{
55599+ return 1;
55600+}
55601+
55602+/* plugin->u.node.fast_paste()
55603+ look for description of this method in plugin/node/node.h */
55604+int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55605+{
55606+ return 1;
55607+}
55608+
55609+/* plugin->u.node.fast_cut()
55610+ look for description of this method in plugin/node/node.h */
55611+int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55612+{
55613+ return 1;
55614+}
55615+
55616+/* plugin->u.node.modify - not defined */
55617+
55618+/* plugin->u.node.max_item_size */
55619+int max_item_size_node40(void)
55620+{
55621+ return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) -
55622+ sizeof(item_header40);
55623+}
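As a concrete illustration (a hedged sketch assuming a 4096-byte block; both
header sizes are fixed by the PACKED structs in node40.h):

	/* hypothetical compile-time illustration: with a 4 KiB block the
	   largest possible item is the block minus the two fixed headers */
	enum { MAX_ITEM_4K_SKETCH = 4096 - sizeof(node40_header)
				    - sizeof(item_header40) };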
55624+
55625+/* plugin->u.node.set_item_plugin */
55626+int set_item_plugin_node40(coord_t *coord, item_id id)
55627+{
55628+ item_header40 *ih;
55629+
55630+ ih = node40_ih_at_coord(coord);
55631+ put_unaligned(cpu_to_le16(id), &ih->plugin_id);
55632+ coord->iplugid = id;
55633+ return 0;
55634+}
55635+
55636+/*
55637+ Local variables:
55638+ c-indentation-style: "K&R"
55639+ mode-name: "LC"
55640+ c-basic-offset: 8
55641+ tab-width: 8
55642+ fill-column: 120
55643+ scroll-step: 1
55644+ End:
55645+*/
55646diff -urN linux-2.6.24.orig/fs/reiser4/plugin/node/node40.h linux-2.6.24/fs/reiser4/plugin/node/node40.h
55647--- linux-2.6.24.orig/fs/reiser4/plugin/node/node40.h 1970-01-01 03:00:00.000000000 +0300
55648+++ linux-2.6.24/fs/reiser4/plugin/node/node40.h 2008-01-25 11:39:07.040234479 +0300
55649@@ -0,0 +1,125 @@
55650+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55651+
55652+#if !defined( __REISER4_NODE40_H__ )
55653+#define __REISER4_NODE40_H__
55654+
55655+#include "../../forward.h"
55656+#include "../../dformat.h"
55657+#include "node.h"
55658+
55659+#include <linux/types.h>
55660+
55661+/* format of node header for 40 node layouts. Keep bloat out of this struct. */
55662+typedef struct node40_header {
55663+ /* identifier of node plugin. Must be located at the very beginning
55664+ of a node. */
55665+ common_node_header common_header; /* this is 16 bits */
55666+	/* number of items. Should be the first element in the node header,
55667+	   because we have not yet decided whether it should instead go into
55668+	   common_header.
55669+	 */
55670+/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
55671+ * node format at compile time, and it is this one, field accesses do not go through a
55672+ * function-pointer dereference (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */
55673+ d16 nr_items;
55674+ /* free space in node measured in bytes */
55675+ d16 free_space;
55676+ /* offset to start of free space in node */
55677+ d16 free_space_start;
55678+ /* for reiser4_fsck. When information about what is a free
55679+ block is corrupted, and we try to recover everything even
55680+ if marked as freed, then old versions of data may
55681+ duplicate newer versions, and this field allows us to
55682+ restore the newer version. Also useful for when users
55683+ who don't have the new trashcan installed on their linux distro
55684+ delete the wrong files and send us desperate emails
55685+ offering $25 for them back. */
55686+
55687+	/* magic number used to recognize formatted reiser4 nodes */
55688+ d32 magic;
55689+	/* the flush stamp is made of mkfs_id and flush_id. mkfs_id is an
55690+	   id generated randomly at mkfs time, so we can simply
55691+	   skip all nodes with a different mkfs_id. flush_id is a d64
55692+	   counter incremented on each write to disk. It is used for
55693+	   choosing the newest data at fsck time. */
55694+
55695+ d32 mkfs_id;
55696+ d64 flush_id;
55697+	/* node flags to be used by fsck.reiser4
55698+	   and the repacker */
55699+ d16 flags;
55700+
55701+ /* 1 is leaf level, 2 is twig level, root is the numerically
55702+ largest level */
55703+ d8 level;
55704+
55705+ d8 pad;
55706+} PACKED node40_header;
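The multi-byte fields above are little-endian on disk (the d16/d32/d64
wrappers come from dformat.h), so readers go through the unaligned
little-endian helpers, just as the .c code in this patch does with
le16_to_cpu(get_unaligned(...)). A minimal sketch; the accessor name is
hypothetical:

	/* hypothetical accessor: number of items recorded in a node40
	   header, read as an unaligned little-endian 16-bit value */
	static inline unsigned nh40_nr_items_sketch(const node40_header *nh)
	{
		return le16_to_cpu(get_unaligned(&nh->nr_items));
	}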
55707+
55708+/* item headers are not standard across all node layouts, pass
55709+ pos_in_node to functions instead */
55710+typedef struct item_header40 {
55711+ /* key of item */
55712+ /* 0 */ reiser4_key key;
55713+ /* offset from start of a node measured in 8-byte chunks */
55714+ /* 24 */ d16 offset;
55715+ /* 26 */ d16 flags;
55716+ /* 28 */ d16 plugin_id;
55717+} PACKED item_header40;
55718+
55719+size_t item_overhead_node40(const znode * node, flow_t * aflow);
55720+size_t free_space_node40(znode * node);
55721+node_search_result lookup_node40(znode * node, const reiser4_key * key,
55722+ lookup_bias bias, coord_t * coord);
55723+int num_of_items_node40(const znode * node);
55724+char *item_by_coord_node40(const coord_t * coord);
55725+int length_by_coord_node40(const coord_t * coord);
55726+item_plugin *plugin_by_coord_node40(const coord_t * coord);
55727+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
55728+size_t estimate_node40(znode * node);
55729+int check_node40(const znode * node, __u32 flags, const char **error);
55730+int parse_node40(znode * node);
55731+int init_node40(znode * node);
55732+#ifdef GUESS_EXISTS
55733+int guess_node40(const znode * node);
55734+#endif
55735+void change_item_size_node40(coord_t * coord, int by);
55736+int create_item_node40(coord_t * target, const reiser4_key * key,
55737+ reiser4_item_data * data, carry_plugin_info * info);
55738+void update_item_key_node40(coord_t * target, const reiser4_key * key,
55739+ carry_plugin_info * info);
55740+int kill_node40(struct carry_kill_data *, carry_plugin_info *);
55741+int cut_node40(struct carry_cut_data *, carry_plugin_info *);
55742+int shift_node40(coord_t * from, znode * to, shift_direction pend,
55743+		 /* if this is set to 1,
55744+		    @from->node will be deleted
55745+		    from the tree when it
55746+		    becomes empty */
55747+ int delete_child, int including_stop_coord,
55748+ carry_plugin_info * info);
55749+
55750+int fast_insert_node40(const coord_t * coord);
55751+int fast_paste_node40(const coord_t * coord);
55752+int fast_cut_node40(const coord_t * coord);
55753+int max_item_size_node40(void);
55754+int prepare_removal_node40(znode * empty, carry_plugin_info * info);
55755+int set_item_plugin_node40(coord_t * coord, item_id id);
55756+int shrink_item_node40(coord_t * coord, int delta);
55757+
55758+#if REISER4_DEBUG
55759+void *shift_check_prepare(const znode *left, const znode *right);
55760+void shift_check(void *vp, const znode *left, const znode *right);
55761+#endif
55762+
55763+/* __REISER4_NODE40_H__ */
55764+#endif
55765+/*
55766+ Local variables:
55767+ c-indentation-style: "K&R"
55768+ mode-name: "LC"
55769+ c-basic-offset: 8
55770+ tab-width: 8
55771+ fill-column: 120
55772+ scroll-step: 1
55773+ End:
55774+*/
55775diff -urN linux-2.6.24.orig/fs/reiser4/plugin/node/node.c linux-2.6.24/fs/reiser4/plugin/node/node.c
55776--- linux-2.6.24.orig/fs/reiser4/plugin/node/node.c 1970-01-01 03:00:00.000000000 +0300
55777+++ linux-2.6.24/fs/reiser4/plugin/node/node.c 2008-01-25 11:39:07.040234479 +0300
55778@@ -0,0 +1,131 @@
55779+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55780+
55781+/* Node plugin interface.
55782+
55783+ Description: The tree provides the abstraction of flows, which it
55784+ internally fragments into items which it stores in nodes.
55785+
55786+ A key_atom is a piece of data bound to a single key.
55787+
55788+ For reasonable space efficiency to be achieved it is often
55789+ necessary to store key_atoms in the nodes in the form of items, where
55790+ an item is a sequence of key_atoms of the same or similar type. It is
55791+ more space-efficient, because the item can implement (very)
55792+ efficient compression of key_atom's bodies using internal knowledge
55793+ about their semantics, and it can often avoid having a key for each
55794+ key_atom. Each type of item has specific operations implemented by its
55795+ item handler (see balance.c).
55796+
55797+ Rationale: the rest of the code (specifically balancing routines)
55798+ accesses leaf level nodes through this interface. This way we can
55799+ implement various block layouts and even combine various layouts
55800+ within the same tree. Balancing/allocating algorithms should not
55801+ care about peculiarities of splitting/merging specific item types,
55802+ but rather should leave that to the item's item handler.
55803+
55804+ Items, including those that provide the abstraction of flows, have
55805+ the property that if you move them in part or in whole to another
55806+ node, the balancing code invokes their is_left_mergeable()
55807+ item_operation to determine if they are mergeable with their new
55808+ neighbor in the node you have moved them to. For some items the
55809+   is_left_mergeable() function always returns false.
55810+
55811+ When moving the bodies of items from one node to another:
55812+
55813+ if a partial item is shifted to another node the balancing code invokes
55814+ an item handler method to handle the item splitting.
55815+
55816+ if the balancing code needs to merge with an item in the node it
55817+ is shifting to, it will invoke an item handler method to handle
55818+ the item merging.
55819+
55820+   if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy(),
55821+   adjusting the item headers after the move via the node handler.
55822+*/
55823+
55824+#include "../../forward.h"
55825+#include "../../debug.h"
55826+#include "../../key.h"
55827+#include "../../coord.h"
55828+#include "../plugin_header.h"
55829+#include "../item/item.h"
55830+#include "node.h"
55831+#include "../plugin.h"
55832+#include "../../znode.h"
55833+#include "../../tree.h"
55834+#include "../../super.h"
55835+#include "../../reiser4.h"
55836+
55837+/**
55838+ * leftmost_key_in_node - get the smallest key in node
55839+ * @node:
55840+ * @key: store result here
55841+ *
55842+ * Stores the leftmost key of @node in @key.
55843+ */
55844+reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
55845+{
55846+ assert("nikita-1634", node != NULL);
55847+ assert("nikita-1635", key != NULL);
55848+
55849+ if (!node_is_empty(node)) {
55850+ coord_t first_item;
55851+
55852+ coord_init_first_unit(&first_item, (znode *) node);
55853+ item_key_by_coord(&first_item, key);
55854+ } else
55855+ *key = *reiser4_max_key();
55856+ return key;
55857+}
55858+
55859+node_plugin node_plugins[LAST_NODE_ID] = {
55860+ [NODE40_ID] = {
55861+ .h = {
55862+ .type_id = REISER4_NODE_PLUGIN_TYPE,
55863+ .id = NODE40_ID,
55864+ .pops = NULL,
55865+ .label = "unified",
55866+ .desc = "unified node layout",
55867+ .linkage = {NULL, NULL}
55868+ },
55869+ .item_overhead = item_overhead_node40,
55870+ .free_space = free_space_node40,
55871+ .lookup = lookup_node40,
55872+ .num_of_items = num_of_items_node40,
55873+ .item_by_coord = item_by_coord_node40,
55874+ .length_by_coord = length_by_coord_node40,
55875+ .plugin_by_coord = plugin_by_coord_node40,
55876+ .key_at = key_at_node40,
55877+ .estimate = estimate_node40,
55878+ .check = check_node40,
55879+ .parse = parse_node40,
55880+ .init = init_node40,
55881+#ifdef GUESS_EXISTS
55882+ .guess = guess_node40,
55883+#endif
55884+ .change_item_size = change_item_size_node40,
55885+ .create_item = create_item_node40,
55886+ .update_item_key = update_item_key_node40,
55887+ .cut_and_kill = kill_node40,
55888+ .cut = cut_node40,
55889+ .shift = shift_node40,
55890+ .shrink_item = shrink_item_node40,
55891+ .fast_insert = fast_insert_node40,
55892+ .fast_paste = fast_paste_node40,
55893+ .fast_cut = fast_cut_node40,
55894+ .max_item_size = max_item_size_node40,
55895+ .prepare_removal = prepare_removal_node40,
55896+ .set_item_plugin = set_item_plugin_node40
55897+ }
55898+};
55899+
55900+/*
55901+ Local variables:
55902+ c-indentation-style: "K&R"
55903+ mode-name: "LC"
55904+ c-basic-offset: 8
55905+ tab-width: 8
55906+ fill-column: 120
55907+ scroll-step: 1
55908+ End:
55909+*/
55910diff -urN linux-2.6.24.orig/fs/reiser4/plugin/node/node.h linux-2.6.24/fs/reiser4/plugin/node/node.h
55911--- linux-2.6.24.orig/fs/reiser4/plugin/node/node.h 1970-01-01 03:00:00.000000000 +0300
55912+++ linux-2.6.24/fs/reiser4/plugin/node/node.h 2008-01-25 11:39:07.044235509 +0300
55913@@ -0,0 +1,272 @@
55914+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55915+
55916+/* We need a definition of the default node layout here. */
55917+
55918+/* Generally speaking, it is best to have free space in the middle of the
55919+ node so that two sets of things can grow towards it, and to have the
55920+ item bodies on the left so that the last one of them grows into free
55921+ space. We optimize for the case where we append new items to the end
55922+   of the node, or grow the last item, because such an optimization hurts
55923+   nothing and it is a common special case to do massive insertions in
55924+   increasing key order (and one of the cases where a real user is more
55925+   likely to notice the delay).
55926+
55927+ formatted leaf default layout: (leaf1)
55928+
55929+ |node header:item bodies:free space:key + pluginid + item offset|
55930+
55931+ We grow towards the middle, optimizing layout for the case where we
55932+ append new items to the end of the node. The node header is fixed
55933+ length. Keys, and item offsets plus pluginids for the items
55934+ corresponding to them are in increasing key order, and are fixed
55935+ length. Item offsets are relative to start of node (16 bits creating
55936+ a node size limit of 64k, 12 bits might be a better choice....). Item
55937+ bodies are in decreasing key order. Item bodies have a variable size.
55938+ There is a one to one to one mapping of keys to item offsets to item
55939+ bodies. Item offsets consist of pointers to the zeroth byte of the
55940+ item body. Item length equals the start of the next item minus the
55941+ start of this item, except the zeroth item whose length equals the end
55942+ of the node minus the start of that item (plus a byte). In other
55943+ words, the item length is not recorded anywhere, and it does not need
55944+ to be since it is computable.
55945+
55946+ Leaf variable length items and keys layout : (lvar)
55947+
55948+ |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
55949+
55950+ We grow towards the middle, optimizing layout for the case where we
55951+ append new items to the end of the node. The node header is fixed
55952+ length. Keys and item offsets for the items corresponding to them are
55953+ in increasing key order, and keys are variable length. Item offsets
55954+ are relative to start of node (16 bits). Item bodies are in
55955+ decreasing key order. Item bodies have a variable size. There is a
55956+ one to one to one mapping of keys to item offsets to item bodies.
55957+ Item offsets consist of pointers to the zeroth byte of the item body.
55958+ Item length equals the start of the next item's key minus the start of
55959+ this item, except the zeroth item whose length equals the end of the
55960+ node minus the start of that item (plus a byte).
55961+
55962+ leaf compressed keys layout: (lcomp)
55963+
55964+ |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
55965+
55966+ We grow towards the middle, optimizing layout for the case where we
55967+ append new items to the end of the node. The node header is fixed
55968+ length. Keys and item offsets for the items corresponding to them are
55969+ in increasing key order, and keys are variable length. The "key
55970+ inherit" field indicates how much of the key prefix is identical to
55971+ the previous key (stem compression as described in "Managing
55972+ Gigabytes" is used). key_inherit is a one byte integer. The
55973+ intra-node searches performed through this layout are linear searches,
55974+ and this is theorized to not hurt performance much due to the high
55975+ cost of processor stalls on modern CPUs, and the small number of keys
55976+ in a single node. Item offsets are relative to start of node (16
55977+ bits). Item bodies are in decreasing key order. Item bodies have a
55978+ variable size. There is a one to one to one mapping of keys to item
55979+ offsets to item bodies. Item offsets consist of pointers to the
55980+ zeroth byte of the item body. Item length equals the start of the
55981+ next item minus the start of this item, except the zeroth item whose
55982+ length equals the end of the node minus the start of that item (plus a
55983+   byte). In other words, item length and key length are not recorded
55984+   anywhere, and they do not need to be, since they are computable.
55985+
55986+ internal node default layout: (idef1)
55987+
55988+ just like ldef1 except that item bodies are either blocknrs of
55989+ children or extents, and moving them may require updating parent
55990+ pointers in the nodes that they point to.
55991+*/
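Since item lengths are computable rather than stored, a reader derives them
from adjacent item offsets. A hedged sketch, borrowing item_header40 from
node40.h and assuming byte offsets (the 64k node-size remark above implies
bytes, although node40.h speaks of 8-byte chunks); the helper name is
hypothetical, and the last item is bounded by the start of free space
instead of by a neighbour:

	/* hypothetical sketch: length of an item derived from two
	   neighbouring item headers; lengths are never stored on disk */
	static inline int item_length_sketch(const item_header40 *ih,
					     const item_header40 *next_ih)
	{
		return le16_to_cpu(get_unaligned(&next_ih->offset)) -
		       le16_to_cpu(get_unaligned(&ih->offset));
	}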
55992+
55993+/* There is an inherent 3-way tradeoff between optimization, the ability
55994+   to exchange disks between different architectures, and code
55995+   complexity. This layout is optimal, simple, and not exchangeable.
55996+   Someone else can write the code for exchanging disks and make it
55997+   complex; it would not be that hard. Using a node size other than
55998+   PAGE_SIZE might be suboptimal.
55999+*/
56000+
56001+#if !defined( __REISER4_NODE_H__ )
56002+#define __REISER4_NODE_H__
56003+
56004+#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
56005+
56006+#include "../../dformat.h"
56007+#include "../plugin_header.h"
56008+
56009+#include <linux/types.h>
56010+
56011+typedef enum {
56012+ NS_FOUND = 0,
56013+ NS_NOT_FOUND = -ENOENT
56014+} node_search_result;
56015+
56016+/* Maximal possible space overhead for creation of new item in a node */
56017+#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
56018+
56019+typedef enum {
56020+ REISER4_NODE_DKEYS = (1 << 0),
56021+ REISER4_NODE_TREE_STABLE = (1 << 1)
56022+} reiser4_node_check_flag;
56023+
56024+/* cut and cut_and_kill take too many parameters. This structure exists just to save some stack space */
56025+struct cut_list {
56026+ coord_t *from;
56027+ coord_t *to;
56028+ const reiser4_key *from_key;
56029+ const reiser4_key *to_key;
56030+ reiser4_key *smallest_removed;
56031+ carry_plugin_info *info;
56032+ __u32 flags;
56033+ struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */
56034+ lock_handle *left;
56035+ lock_handle *right;
56036+};
56037+
56038+struct carry_cut_data;
56039+struct carry_kill_data;
56040+
56041+/* The responsibility of the node plugin is to store and give access
56042+ to the sequence of items within the node. */
56043+typedef struct node_plugin {
56044+ /* generic plugin fields */
56045+ plugin_header h;
56046+
56047+ /* calculates the amount of space that will be required to store an
56048+ item which is in addition to the space consumed by the item body.
56049+ (the space consumed by the item body can be gotten by calling
56050+ item->estimate) */
56051+ size_t(*item_overhead) (const znode * node, flow_t * f);
56052+
56053+ /* returns free space by looking into node (i.e., without using
56054+ znode->free_space). */
56055+ size_t(*free_space) (znode * node);
56056+ /* search within the node for the one item which might
56057+ contain the key, invoking item->search_within to search within
56058+ that item to see if it is in there */
56059+ node_search_result(*lookup) (znode * node, const reiser4_key * key,
56060+ lookup_bias bias, coord_t * coord);
56061+ /* number of items in node */
56062+ int (*num_of_items) (const znode * node);
56063+
56064+ /* store information about item in @coord in @data */
56065+ /* break into several node ops, don't add any more uses of this before doing so */
56066+ /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
56067+ char *(*item_by_coord) (const coord_t * coord);
56068+ int (*length_by_coord) (const coord_t * coord);
56069+ item_plugin *(*plugin_by_coord) (const coord_t * coord);
56070+
56071+ /* store item key in @key */
56072+ reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
56073+	/* conservatively estimate how large a unit can still fit
56074+	   into the node. This estimation should be performed without
56075+	   actually looking into the node's content (free space is cached
56076+	   in the znode). */
56077+ size_t(*estimate) (znode * node);
56078+
56079+ /* performs every consistency check the node plugin author could
56080+ imagine. Optional. */
56081+ int (*check) (const znode * node, __u32 flags, const char **error);
56082+
56083+ /* Called when node is read into memory and node plugin is
56084+ already detected. This should read some data into znode (like free
56085+ space counter) and, optionally, check data consistency.
56086+ */
56087+ int (*parse) (znode * node);
56088+ /* This method is called on a new node to initialise plugin specific
56089+ data (header, etc.) */
56090+ int (*init) (znode * node);
56091+ /* Check whether @node content conforms to this plugin format.
56092+ Probably only useful after support for old V3.x formats is added.
56093+ Uncomment after 4.0 only.
56094+ */
56095+ /* int ( *guess )( const znode *node ); */
56096+#if REISER4_DEBUG
56097+ void (*print) (const char *prefix, const znode * node, __u32 flags);
56098+#endif
56099+	/* change the size of @item by @by bytes. @item->node has enough free
56100+	   space. When @by > 0, free space is appended to the end of the item.
56101+	   When @by < 0, the item is truncated; it is assumed that the last @by
56102+	   bytes of the item have already been freed */
56103+ void (*change_item_size) (coord_t * item, int by);
56104+
56105+ /* create new item @length bytes long in coord @target */
56106+ int (*create_item) (coord_t * target, const reiser4_key * key,
56107+ reiser4_item_data * data, carry_plugin_info * info);
56108+
56109+ /* update key of item. */
56110+ void (*update_item_key) (coord_t * target, const reiser4_key * key,
56111+ carry_plugin_info * info);
56112+
56113+ int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
56114+ int (*cut) (struct carry_cut_data *, carry_plugin_info *);
56115+
56116+ /*
56117+ * shrink item pointed to by @coord by @delta bytes.
56118+ */
56119+ int (*shrink_item) (coord_t * coord, int delta);
56120+
56121+ /* copy as much as possible but not more than up to @stop from
56122+ @stop->node to @target. If (pend == append) then data from beginning of
56123+ @stop->node are copied to the end of @target. If (pend == prepend) then
56124+ data from the end of @stop->node are copied to the beginning of
56125+ @target. Copied data are removed from @stop->node. Information
56126+	   about what to do on the upper level is stored in @info */
56127+ int (*shift) (coord_t * stop, znode * target, shift_direction pend,
56128+ int delete_node, int including_insert_coord,
56129+ carry_plugin_info * info);
56130+	/* return true if this node allows skipping carry() in some situations
56131+	   (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
56132+	   emulation doesn't.
56133+
56134+	   This will speed up insertions that don't require updates to the
56135+	   parent, by bypassing initialisation of the carry() structures. It's
56136+	   believed that the majority of insertions will fall into this case.
56137+
56138+	 */
56139+ int (*fast_insert) (const coord_t * coord);
56140+ int (*fast_paste) (const coord_t * coord);
56141+ int (*fast_cut) (const coord_t * coord);
56142+ /* this limits max size of item which can be inserted into a node and
56143+ number of bytes item in a node may be appended with */
56144+ int (*max_item_size) (void);
56145+ int (*prepare_removal) (znode * empty, carry_plugin_info * info);
56146+	/* change the plugin id of items which are already in a node. Currently used in tail conversion for regular
56147+	 * files */
56148+ int (*set_item_plugin) (coord_t * coord, item_id);
56149+} node_plugin;
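The balancing code is expected to reach node functionality only through this
ops table. A hedged sketch of such a dispatch; node_plugin_by_node() is
assumed here to return the plugin backing a znode and is not declared in
this header:

	/* hypothetical dispatch: count the items of a node via its plugin */
	static inline int node_num_items_sketch(const znode *node)
	{
		node_plugin *nplug = node_plugin_by_node(node); /* assumed */

		return nplug->num_of_items(node);
	}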
56150+
56151+typedef enum {
56152+ /* standard unified node layout used for both leaf and internal
56153+ nodes */
56154+ NODE40_ID,
56155+ LAST_NODE_ID
56156+} reiser4_node_id;
56157+
56158+extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
56159+#if REISER4_DEBUG
56160+extern void print_node_content(const char *prefix, const znode * node,
56161+ __u32 flags);
56162+#endif
56163+
56164+extern void indent_znode(const znode * node);
56165+
56166+typedef struct common_node_header {
56167+ /*
56168+ * identifier of node plugin. Must be located at the very beginning of
56169+ * a node.
56170+ */
56171+ __le16 plugin_id;
56172+} common_node_header;
56173+
56174+/* __REISER4_NODE_H__ */
56175+#endif
56176+/*
56177+ * Local variables:
56178+ * c-indentation-style: "K&R"
56179+ * mode-name: "LC"
56180+ * c-basic-offset: 8
56181+ * tab-width: 8
56182+ * fill-column: 79
56183+ * scroll-step: 1
56184+ * End:
56185+ */
56186diff -urN linux-2.6.24.orig/fs/reiser4/plugin/object.c linux-2.6.24/fs/reiser4/plugin/object.c
56187--- linux-2.6.24.orig/fs/reiser4/plugin/object.c 1970-01-01 03:00:00.000000000 +0300
56188+++ linux-2.6.24/fs/reiser4/plugin/object.c 2008-01-25 11:39:07.044235509 +0300
56189@@ -0,0 +1,531 @@
56190+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
56191+ * reiser4/README */
56192+
56193+/*
56194+ * Examples of object plugins: file, directory, symlink, special file.
56195+ *
56196+ * Plugins associated with inode:
56197+ *
56198+ * The plugin of an inode is the plugin referenced by the plugin-id field of
56199+ * the on-disk stat-data. How this plugin is stored in the in-core inode is
56200+ * not important. Currently pointers are used; another variant is to store
56201+ * offsets and do an array lookup on each access.
56202+ *
56203+ * Now, each inode has one selected plugin: the object plugin, which
56204+ * determines what type of file this object is: directory, regular file, etc.
56205+ *
56206+ * This main plugin can use other plugins that are thus subordinated to
56207+ * it. The directory instance of the object plugin uses a hash plugin; the
56208+ * regular-file instance uses a tail policy plugin.
56209+ *
56210+ * The object plugin is either taken from the id in the stat-data or guessed
56211+ * from the i_mode bits. Once it is established, we ask it to install its
56212+ * subordinate plugins, by looking again in the stat-data or inheriting them
56213+ * from the parent.
56214+ *
56215+ * How new inode is initialized during ->read_inode():
56216+ * 1 read stat-data and initialize inode fields: i_size, i_mode,
56217+ * i_generation, capabilities etc.
56218+ * 2 read plugin id from stat data or try to guess plugin id
56219+ * from inode->i_mode bits if plugin id is missing.
56220+ * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
56221+ *
56222+ * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What
56223+ * if stat data does contain i_size, etc., due to it being an unusual plugin?
56224+ *
56225+ * 4 Call ->activate() method of the object's plugin. The plugin is either
56226+ *   read from stat-data or guessed from mode bits.
56227+ * 5 Call ->inherit() method of the object plugin to inherit as-yet-uninitialized
56228+ * plugins from parent.
56229+ *
56230+ * Easy induction proves that by the last step all plugins of the inode
56231+ * will be initialized.
56232+ *
56233+ * When creating new object:
56234+ * 1 obtain object plugin id (see below)
56235+ *
56236+ * 2 ->install() this plugin
56237+ * 3 ->inherit() the rest from the parent
56238+ *
56239+ * We need some examples of creating an object with default and non-default
56240+ * plugin ids. Nikita, please create them.
56241+ */
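A hedged pseudocode outline of the ->read_inode() sequence above; nothing in
it is a real helper from this patch:

	/* hypothetical outline of steps 1-5 above */
	static int read_inode_outline(struct inode *inode)
	{
		/* 1-2: read stat-data and take the plugin id from it, or
		   guess the id from inode->i_mode when it is missing */
		/* 3: the stat-data plugin's ->init_inode() fills i_size,
		   i_mode, i_generation, capabilities, ... */
		/* 4: the object plugin's ->activate() is called */
		/* 5: the object plugin's ->inherit() supplies any plugins
		   still missing, taking them from the parent */
		return 0;
	}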
56242+
56243+#include "../inode.h"
56244+
56245+static int _bugop(void)
56246+{
56247+ BUG_ON(1);
56248+ return 0;
56249+}
56250+
56251+#define bugop ((void *)_bugop)
56252+
56253+static int _dummyop(void)
56254+{
56255+ return 0;
56256+}
56257+
56258+#define dummyop ((void *)_dummyop)
56259+
56260+static int change_file(struct inode *inode,
56261+ reiser4_plugin * plugin,
56262+ pset_member memb)
56263+{
56264+ /* cannot change object plugin of already existing object */
56265+ if (memb == PSET_FILE)
56266+ return RETERR(-EINVAL);
56267+
56268+ /* Change PSET_CREATE */
56269+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin);
56270+}
56271+
56272+static reiser4_plugin_ops file_plugin_ops = {
56273+ .change = change_file
56274+};
56275+
56276+static struct inode_operations null_i_ops = {.create = NULL};
56277+static struct file_operations null_f_ops = {.owner = NULL};
56278+static struct address_space_operations null_a_ops = {.writepage = NULL};
56279+
56280+/* VFS methods for regular files */
56281+static struct inode_operations regular_file_i_ops = {
56282+ .permission = reiser4_permission_common,
56283+ .setattr = reiser4_setattr,
56284+ .getattr = reiser4_getattr_common
56285+};
56286+static struct file_operations regular_file_f_ops = {
56287+ .llseek = generic_file_llseek,
56288+ .read = reiser4_read_careful,
56289+ .write = reiser4_write_careful,
56290+ .aio_read = generic_file_aio_read,
56291+ .ioctl = reiser4_ioctl_careful,
56292+ .mmap = reiser4_mmap_careful,
56293+ .open = reiser4_open_careful,
56294+ .release = reiser4_release_careful,
56295+ .fsync = reiser4_sync_file_common,
56296+ .splice_read = generic_file_splice_read,
56297+ .splice_write = generic_file_splice_write
56298+};
56299+static struct address_space_operations regular_file_a_ops = {
56300+ .writepage = reiser4_writepage,
56301+ .readpage = reiser4_readpage,
56302+ .sync_page = block_sync_page,
56303+ .writepages = reiser4_writepages,
56304+ .set_page_dirty = reiser4_set_page_dirty,
56305+ .readpages = reiser4_readpages,
56306+ .prepare_write = reiser4_prepare_write,
56307+ .commit_write = reiser4_commit_write,
56308+ .bmap = reiser4_bmap_careful,
56309+ .invalidatepage = reiser4_invalidatepage,
56310+ .releasepage = reiser4_releasepage
56311+};
56312+
56313+/* VFS methods for symlink files */
56314+static struct inode_operations symlink_file_i_ops = {
56315+ .readlink = generic_readlink,
56316+ .follow_link = reiser4_follow_link_common,
56317+ .permission = reiser4_permission_common,
56318+ .setattr = reiser4_setattr_common,
56319+ .getattr = reiser4_getattr_common
56320+};
56321+
56322+/* VFS methods for special files */
56323+static struct inode_operations special_file_i_ops = {
56324+ .permission = reiser4_permission_common,
56325+ .setattr = reiser4_setattr_common,
56326+ .getattr = reiser4_getattr_common
56327+};
56328+
56329+/* VFS methods for directories */
56330+static struct inode_operations directory_i_ops = {
56331+ .create = reiser4_create_common,
56332+ .lookup = reiser4_lookup_common,
56333+ .link = reiser4_link_common,
56334+ .unlink = reiser4_unlink_common,
56335+ .symlink = reiser4_symlink_common,
56336+ .mkdir = reiser4_mkdir_common,
56337+ .rmdir = reiser4_unlink_common,
56338+ .mknod = reiser4_mknod_common,
56339+ .rename = reiser4_rename_common,
56340+ .permission = reiser4_permission_common,
56341+ .setattr = reiser4_setattr_common,
56342+ .getattr = reiser4_getattr_common
56343+};
56344+static struct file_operations directory_f_ops = {
56345+ .llseek = reiser4_llseek_dir_common,
56346+ .read = generic_read_dir,
56347+ .readdir = reiser4_readdir_common,
56348+ .release = reiser4_release_dir_common,
56349+ .fsync = reiser4_sync_common
56350+};
56351+static struct address_space_operations directory_a_ops = {
56352+ .writepage = bugop,
56353+ .sync_page = bugop,
56354+ .writepages = dummyop,
56355+ .set_page_dirty = bugop,
56356+ .readpages = bugop,
56357+ .prepare_write = bugop,
56358+ .commit_write = bugop,
56359+ .bmap = bugop,
56360+ .invalidatepage = bugop,
56361+ .releasepage = bugop
56362+};
56363+
56364+/*
56365+ * Definitions of object plugins.
56366+ */
56367+
56368+file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
56369+ [UNIX_FILE_PLUGIN_ID] = {
56370+ .h = {
56371+ .type_id = REISER4_FILE_PLUGIN_TYPE,
56372+ .id = UNIX_FILE_PLUGIN_ID,
56373+ .groups = (1 << REISER4_REGULAR_FILE),
56374+ .pops = &file_plugin_ops,
56375+ .label = "reg",
56376+ .desc = "regular file",
56377+ .linkage = {NULL, NULL},
56378+ },
56379+ /*
56380+ * invariant vfs ops
56381+ */
56382+ .inode_ops = &regular_file_i_ops,
56383+ .file_ops = &regular_file_f_ops,
56384+ .as_ops = &regular_file_a_ops,
56385+ /*
56386+ * private i_ops
56387+ */
56388+ .setattr = setattr_unix_file,
56389+ .open = open_unix_file,
56390+ .read = read_unix_file,
56391+ .write = write_unix_file,
56392+ .ioctl = ioctl_unix_file,
56393+ .mmap = mmap_unix_file,
56394+ .release = release_unix_file,
56395+ /*
56396+ * private f_ops
56397+ */
56398+ .readpage = readpage_unix_file,
56399+ .readpages = readpages_unix_file,
56400+ .writepages = writepages_unix_file,
56401+ .prepare_write = prepare_write_unix_file,
56402+ .commit_write = commit_write_unix_file,
56403+ /*
56404+ * private a_ops
56405+ */
56406+ .bmap = bmap_unix_file,
56407+ /*
56408+ * other private methods
56409+ */
56410+ .write_sd_by_inode = write_sd_by_inode_common,
56411+ .flow_by_inode = flow_by_inode_unix_file,
56412+ .key_by_inode = key_by_inode_and_offset_common,
56413+ .set_plug_in_inode = set_plug_in_inode_common,
56414+ .adjust_to_parent = adjust_to_parent_common,
56415+ .create_object = reiser4_create_object_common,
56416+ .delete_object = delete_object_unix_file,
56417+ .add_link = reiser4_add_link_common,
56418+ .rem_link = reiser4_rem_link_common,
56419+ .owns_item = owns_item_unix_file,
56420+ .can_add_link = can_add_link_common,
56421+ .detach = dummyop,
56422+ .bind = dummyop,
56423+ .safelink = safelink_common,
56424+ .estimate = {
56425+ .create = estimate_create_common,
56426+ .update = estimate_update_common,
56427+ .unlink = estimate_unlink_common
56428+ },
56429+ .init_inode_data = init_inode_data_unix_file,
56430+ .cut_tree_worker = cut_tree_worker_common,
56431+ .wire = {
56432+ .write = wire_write_common,
56433+ .read = wire_read_common,
56434+ .get = wire_get_common,
56435+ .size = wire_size_common,
56436+ .done = wire_done_common
56437+ }
56438+ },
56439+ [DIRECTORY_FILE_PLUGIN_ID] = {
56440+ .h = {
56441+ .type_id = REISER4_FILE_PLUGIN_TYPE,
56442+ .id = DIRECTORY_FILE_PLUGIN_ID,
56443+ .groups = (1 << REISER4_DIRECTORY_FILE),
56444+ .pops = &file_plugin_ops,
56445+ .label = "dir",
56446+ .desc = "directory",
56447+ .linkage = {NULL, NULL}
56448+ },
56449+ .inode_ops = &null_i_ops,
56450+ .file_ops = &null_f_ops,
56451+ .as_ops = &null_a_ops,
56452+
56453+ .write_sd_by_inode = write_sd_by_inode_common,
56454+ .flow_by_inode = bugop,
56455+ .key_by_inode = bugop,
56456+ .set_plug_in_inode = set_plug_in_inode_common,
56457+ .adjust_to_parent = adjust_to_parent_common_dir,
56458+ .create_object = reiser4_create_object_common,
56459+ .delete_object = reiser4_delete_dir_common,
56460+ .add_link = reiser4_add_link_common,
56461+ .rem_link = rem_link_common_dir,
56462+ .owns_item = owns_item_common_dir,
56463+ .can_add_link = can_add_link_common,
56464+ .can_rem_link = can_rem_link_common_dir,
56465+ .detach = reiser4_detach_common_dir,
56466+ .bind = reiser4_bind_common_dir,
56467+ .safelink = safelink_common,
56468+ .estimate = {
56469+ .create = estimate_create_common_dir,
56470+ .update = estimate_update_common,
56471+ .unlink = estimate_unlink_common_dir
56472+ },
56473+ .wire = {
56474+ .write = wire_write_common,
56475+ .read = wire_read_common,
56476+ .get = wire_get_common,
56477+ .size = wire_size_common,
56478+ .done = wire_done_common
56479+ },
56480+ .init_inode_data = init_inode_ordering,
56481+ .cut_tree_worker = cut_tree_worker_common,
56482+ },
56483+ [SYMLINK_FILE_PLUGIN_ID] = {
56484+ .h = {
56485+ .type_id = REISER4_FILE_PLUGIN_TYPE,
56486+ .id = SYMLINK_FILE_PLUGIN_ID,
56487+ .groups = (1 << REISER4_SYMLINK_FILE),
56488+ .pops = &file_plugin_ops,
56489+ .label = "symlink",
56490+ .desc = "symbolic link",
56491+ .linkage = {NULL,NULL}
56492+ },
56493+ .inode_ops = &symlink_file_i_ops,
56494+		/* inode->i_fop of a symlink is initialized
56495+		   to NULL in setup_inode_ops */
56496+ .file_ops = &null_f_ops,
56497+ .as_ops = &null_a_ops,
56498+
56499+ .write_sd_by_inode = write_sd_by_inode_common,
56500+ .set_plug_in_inode = set_plug_in_inode_common,
56501+ .adjust_to_parent = adjust_to_parent_common,
56502+ .create_object = reiser4_create_symlink,
56503+ .delete_object = reiser4_delete_object_common,
56504+ .add_link = reiser4_add_link_common,
56505+ .rem_link = reiser4_rem_link_common,
56506+ .can_add_link = can_add_link_common,
56507+ .detach = dummyop,
56508+ .bind = dummyop,
56509+ .safelink = safelink_common,
56510+ .estimate = {
56511+ .create = estimate_create_common,
56512+ .update = estimate_update_common,
56513+ .unlink = estimate_unlink_common
56514+ },
56515+ .init_inode_data = init_inode_ordering,
56516+ .cut_tree_worker = cut_tree_worker_common,
56517+ .destroy_inode = destroy_inode_symlink,
56518+ .wire = {
56519+ .write = wire_write_common,
56520+ .read = wire_read_common,
56521+ .get = wire_get_common,
56522+ .size = wire_size_common,
56523+ .done = wire_done_common
56524+ }
56525+ },
56526+ [SPECIAL_FILE_PLUGIN_ID] = {
56527+ .h = {
56528+ .type_id = REISER4_FILE_PLUGIN_TYPE,
56529+ .id = SPECIAL_FILE_PLUGIN_ID,
56530+ .groups = (1 << REISER4_SPECIAL_FILE),
56531+ .pops = &file_plugin_ops,
56532+ .label = "special",
56533+ .desc =
56534+ "special: fifo, device or socket",
56535+ .linkage = {NULL, NULL}
56536+ },
56537+ .inode_ops = &special_file_i_ops,
56538+ /* file_ops of special files (sockets, block, char, fifo) are
56539+ initialized by init_special_inode. */
56540+ .file_ops = &null_f_ops,
56541+ .as_ops = &null_a_ops,
56542+
56543+ .write_sd_by_inode = write_sd_by_inode_common,
56544+ .set_plug_in_inode = set_plug_in_inode_common,
56545+ .adjust_to_parent = adjust_to_parent_common,
56546+ .create_object = reiser4_create_object_common,
56547+ .delete_object = reiser4_delete_object_common,
56548+ .add_link = reiser4_add_link_common,
56549+ .rem_link = reiser4_rem_link_common,
56550+ .owns_item = owns_item_common,
56551+ .can_add_link = can_add_link_common,
56552+ .detach = dummyop,
56553+ .bind = dummyop,
56554+ .safelink = safelink_common,
56555+ .estimate = {
56556+ .create = estimate_create_common,
56557+ .update = estimate_update_common,
56558+ .unlink = estimate_unlink_common
56559+ },
56560+ .init_inode_data = init_inode_ordering,
56561+ .cut_tree_worker = cut_tree_worker_common,
56562+ .wire = {
56563+ .write = wire_write_common,
56564+ .read = wire_read_common,
56565+ .get = wire_get_common,
56566+ .size = wire_size_common,
56567+ .done = wire_done_common
56568+ }
56569+ },
56570+ [CRYPTCOMPRESS_FILE_PLUGIN_ID] = {
56571+ .h = {
56572+ .type_id = REISER4_FILE_PLUGIN_TYPE,
56573+ .id = CRYPTCOMPRESS_FILE_PLUGIN_ID,
56574+ .groups = (1 << REISER4_REGULAR_FILE),
56575+ .pops = &file_plugin_ops,
56576+ .label = "cryptcompress",
56577+ .desc = "cryptcompress file",
56578+ .linkage = {NULL, NULL}
56579+ },
56580+ .inode_ops = &regular_file_i_ops,
56581+ .file_ops = &regular_file_f_ops,
56582+ .as_ops = &regular_file_a_ops,
56583+
56584+ .setattr = setattr_cryptcompress,
56585+ .open = open_cryptcompress,
56586+ .read = read_cryptcompress,
56587+ .write = write_cryptcompress,
56588+ .ioctl = ioctl_cryptcompress,
56589+ .mmap = mmap_cryptcompress,
56590+ .release = release_cryptcompress,
56591+
56592+ .readpage = readpage_cryptcompress,
56593+ .readpages = readpages_cryptcompress,
56594+ .writepages = writepages_cryptcompress,
56595+ .prepare_write = prepare_write_cryptcompress,
56596+ .commit_write = commit_write_cryptcompress,
56597+
56598+ .bmap = bmap_cryptcompress,
56599+
56600+ .write_sd_by_inode = write_sd_by_inode_common,
56601+ .flow_by_inode = flow_by_inode_cryptcompress,
56602+ .key_by_inode = key_by_inode_cryptcompress,
56603+ .set_plug_in_inode = set_plug_in_inode_common,
56604+ .adjust_to_parent = adjust_to_parent_cryptcompress,
56605+ .create_object = create_object_cryptcompress,
56606+ .delete_object = delete_object_cryptcompress,
56607+ .add_link = reiser4_add_link_common,
56608+ .rem_link = reiser4_rem_link_common,
56609+ .owns_item = owns_item_common,
56610+ .can_add_link = can_add_link_common,
56611+ .detach = dummyop,
56612+ .bind = dummyop,
56613+ .safelink = safelink_common,
56614+ .estimate = {
56615+ .create = estimate_create_common,
56616+ .update = estimate_update_common,
56617+ .unlink = estimate_unlink_common
56618+ },
56619+ .init_inode_data = init_inode_data_cryptcompress,
56620+ .cut_tree_worker = cut_tree_worker_cryptcompress,
56621+ .destroy_inode = destroy_inode_cryptcompress,
56622+ .wire = {
56623+ .write = wire_write_common,
56624+ .read = wire_read_common,
56625+ .get = wire_get_common,
56626+ .size = wire_size_common,
56627+ .done = wire_done_common
56628+ }
56629+ }
56630+};
56631+
56632+static int change_dir(struct inode *inode,
56633+ reiser4_plugin * plugin,
56634+ pset_member memb)
56635+{
56636+ /* cannot change dir plugin of already existing object */
56637+ return RETERR(-EINVAL);
56638+}
56639+
56640+static reiser4_plugin_ops dir_plugin_ops = {
56641+ .change = change_dir
56642+};
56643+
56644+/*
56645+ * definition of directory plugins
56646+ */
56647+
56648+dir_plugin dir_plugins[LAST_DIR_ID] = {
56649+ /* standard hashed directory plugin */
56650+ [HASHED_DIR_PLUGIN_ID] = {
56651+ .h = {
56652+ .type_id = REISER4_DIR_PLUGIN_TYPE,
56653+ .id = HASHED_DIR_PLUGIN_ID,
56654+ .pops = &dir_plugin_ops,
56655+ .label = "dir",
56656+ .desc = "hashed directory",
56657+ .linkage = {NULL, NULL}
56658+ },
56659+ .inode_ops = &directory_i_ops,
56660+ .file_ops = &directory_f_ops,
56661+ .as_ops = &directory_a_ops,
56662+
56663+ .get_parent = get_parent_common,
56664+ .is_name_acceptable = is_name_acceptable_common,
56665+ .build_entry_key = build_entry_key_hashed,
56666+ .build_readdir_key = build_readdir_key_common,
56667+ .add_entry = reiser4_add_entry_common,
56668+ .rem_entry = reiser4_rem_entry_common,
56669+ .init = reiser4_dir_init_common,
56670+ .done = reiser4_dir_done_common,
56671+ .attach = reiser4_attach_common,
56672+ .detach = reiser4_detach_common,
56673+ .estimate = {
56674+ .add_entry = estimate_add_entry_common,
56675+ .rem_entry = estimate_rem_entry_common,
56676+ .unlink = dir_estimate_unlink_common
56677+ }
56678+ },
56679+ /* hashed directory for which seekdir/telldir are guaranteed to
56680+ * work. Brain-damage. */
56681+ [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
56682+ .h = {
56683+ .type_id = REISER4_DIR_PLUGIN_TYPE,
56684+ .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
56685+ .pops = &dir_plugin_ops,
56686+ .label = "dir32",
56687+ .desc = "directory hashed with 31 bit hash",
56688+ .linkage = {NULL, NULL}
56689+ },
56690+ .inode_ops = &directory_i_ops,
56691+ .file_ops = &directory_f_ops,
56692+ .as_ops = &directory_a_ops,
56693+
56694+ .get_parent = get_parent_common,
56695+ .is_name_acceptable = is_name_acceptable_common,
56696+ .build_entry_key = build_entry_key_seekable,
56697+ .build_readdir_key = build_readdir_key_common,
56698+ .add_entry = reiser4_add_entry_common,
56699+ .rem_entry = reiser4_rem_entry_common,
56700+ .init = reiser4_dir_init_common,
56701+ .done = reiser4_dir_done_common,
56702+ .attach = reiser4_attach_common,
56703+ .detach = reiser4_detach_common,
56704+ .estimate = {
56705+ .add_entry = estimate_add_entry_common,
56706+ .rem_entry = estimate_rem_entry_common,
56707+ .unlink = dir_estimate_unlink_common
56708+ }
56709+ }
56710+};
56711+
56712+/* Make Linus happy.
56713+ Local variables:
56714+ c-indentation-style: "K&R"
56715+ mode-name: "LC"
56716+ c-basic-offset: 8
56717+ tab-width: 8
56718+ fill-column: 120
56719+ End:
56720+*/
56721diff -urN linux-2.6.24.orig/fs/reiser4/plugin/object.h linux-2.6.24/fs/reiser4/plugin/object.h
56722--- linux-2.6.24.orig/fs/reiser4/plugin/object.h 1970-01-01 03:00:00.000000000 +0300
56723+++ linux-2.6.24/fs/reiser4/plugin/object.h 2008-01-25 11:39:07.044235509 +0300
56724@@ -0,0 +1,121 @@
56725+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
56726+ * reiser4/README */
56727+
56728+/* Declaration of object plugin functions. */
56729+
56730+#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ )
56731+#define __FS_REISER4_PLUGIN_OBJECT_H__
56732+
56733+#include "../type_safe_hash.h"
56734+
56735+/* common implementations of inode operations */
56736+int reiser4_create_common(struct inode *parent, struct dentry *dentry,
56737+ int mode, struct nameidata *);
56738+struct dentry * reiser4_lookup_common(struct inode *parent,
56739+ struct dentry *dentry,
56740+ struct nameidata *nameidata);
56741+int reiser4_link_common(struct dentry *existing, struct inode *parent,
56742+ struct dentry *newname);
56743+int reiser4_unlink_common(struct inode *parent, struct dentry *victim);
56744+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
56745+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
56746+ const char *linkname);
56747+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
56748+ int mode, dev_t rdev);
56749+int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name,
56750+ struct inode *new_dir, struct dentry *new_name);
56751+void *reiser4_follow_link_common(struct dentry *, struct nameidata *data);
56752+int reiser4_permission_common(struct inode *, int mask,
56753+ struct nameidata *nameidata);
56754+int reiser4_setattr_common(struct dentry *, struct iattr *);
56755+int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *,
56756+ struct kstat *);
56757+
56758+/* common implementations of file operations */
56759+loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin);
56760+int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
56761+int reiser4_release_dir_common(struct inode *, struct file *);
56762+int reiser4_sync_common(struct file *, struct dentry *, int datasync);
56763+
56764+/* common implementations of address space operations */
56765+int prepare_write_common(struct file *, struct page *, unsigned from,
56766+ unsigned to);
56767+
56768+/* file plugin operations: common implementations */
56769+int write_sd_by_inode_common(struct inode *);
56770+int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
56771+int set_plug_in_inode_common(struct inode *object, struct inode *parent,
56772+ reiser4_object_create_data *);
56773+int adjust_to_parent_common(struct inode *object, struct inode *parent,
56774+ struct inode *root);
56775+int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
56776+ struct inode *root);
56777+int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
56778+ struct inode *root);
56779+int reiser4_create_object_common(struct inode *object, struct inode *parent,
56780+ reiser4_object_create_data *);
56781+int reiser4_delete_object_common(struct inode *);
56782+int reiser4_delete_dir_common(struct inode *);
56783+int reiser4_add_link_common(struct inode *object, struct inode *parent);
56784+int reiser4_rem_link_common(struct inode *object, struct inode *parent);
56785+int rem_link_common_dir(struct inode *object, struct inode *parent);
56786+int owns_item_common(const struct inode *, const coord_t *);
56787+int owns_item_common_dir(const struct inode *, const coord_t *);
56788+int can_add_link_common(const struct inode *);
56789+int can_rem_link_common_dir(const struct inode *);
56790+int reiser4_detach_common_dir(struct inode *child, struct inode *parent);
56791+int reiser4_bind_common_dir(struct inode *child, struct inode *parent);
56792+int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
56793+reiser4_block_nr estimate_create_common(const struct inode *);
56794+reiser4_block_nr estimate_create_common_dir(const struct inode *);
56795+reiser4_block_nr estimate_update_common(const struct inode *);
56796+reiser4_block_nr estimate_unlink_common(const struct inode *,
56797+ const struct inode *);
56798+reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
56799+ const struct inode *);
56800+char *wire_write_common(struct inode *, char *start);
56801+char *wire_read_common(char *addr, reiser4_object_on_wire *);
56802+struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
56803+int wire_size_common(struct inode *);
56804+void wire_done_common(reiser4_object_on_wire *);
56805+
56806+/* dir plugin operations: common implementations */
56807+struct dentry *get_parent_common(struct inode *child);
56808+int is_name_acceptable_common(const struct inode *, const char *name, int len);
56809+void build_entry_key_common(const struct inode *,
56810+ const struct qstr *qname, reiser4_key *);
56811+int build_readdir_key_common(struct file *dir, reiser4_key *);
56812+int reiser4_add_entry_common(struct inode *object, struct dentry *where,
56813+ reiser4_object_create_data *, reiser4_dir_entry_desc *);
56814+int reiser4_rem_entry_common(struct inode *object, struct dentry *where,
56815+ reiser4_dir_entry_desc *);
56816+int reiser4_dir_init_common(struct inode *object, struct inode *parent,
56817+ reiser4_object_create_data *);
56818+int reiser4_dir_done_common(struct inode *);
56819+int reiser4_attach_common(struct inode *child, struct inode *parent);
56820+int reiser4_detach_common(struct inode *object, struct inode *parent);
56821+reiser4_block_nr estimate_add_entry_common(const struct inode *);
56822+reiser4_block_nr estimate_rem_entry_common(const struct inode *);
56823+reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
56824+ const struct inode *);
56825+
56826+/* these are essential parts of the common implementations; they exist to
56827+ make customized implementations easier */
56828+int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
56829+
56830+/* merely useful functions */
56831+int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *,
56832+ const reiser4_key *, int silent);
56833+
56834+/* __FS_REISER4_PLUGIN_OBJECT_H__ */
56835+#endif
56836+
56837+/* Make Linus happy.
56838+ Local variables:
56839+ c-indentation-style: "K&R"
56840+ mode-name: "LC"
56841+ c-basic-offset: 8
56842+ tab-width: 8
56843+ fill-column: 120
56844+ End:
56845+*/
56846diff -urN linux-2.6.24.orig/fs/reiser4/plugin/plugin.c linux-2.6.24/fs/reiser4/plugin/plugin.c
56847--- linux-2.6.24.orig/fs/reiser4/plugin/plugin.c 1970-01-01 03:00:00.000000000 +0300
56848+++ linux-2.6.24/fs/reiser4/plugin/plugin.c 2008-01-25 11:39:07.048236540 +0300
56849@@ -0,0 +1,559 @@
56850+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
56851+ * reiser4/README */
56852+
56853+/* Basic plugin infrastructure, lookup etc. */
56854+
56855+/* PLUGINS:
56856+
56857+ Plugins are internal Reiser4 "modules" or "objects" used to increase
56858+ extensibility and allow external users to easily adapt reiser4 to
56859+ their needs.
56860+
56861+ Plugins are classified into several disjoint "types". Plugins
56862+   belonging to a particular plugin type are termed "instances" of
56863+ this type. Existing types are listed by enum reiser4_plugin_type
56864+ (see plugin/plugin_header.h)
56865+
56866+NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
56867+
56868+   An object (file) plugin determines how a given file-system object
56869+   serves standard VFS requests for read, write, seek, mmap etc. Instances
56870+   of file plugins are: regular file, directory, symlink. Another example
56871+   of a file plugin is an audit plugin, which optionally records accesses
56872+   to the underlying object and forwards requests to it.
56873+
56874+ Hash plugins compute hashes used by reiser4 to store and locate
56875+ files within directories. Instances of hash plugin type are: r5,
56876+ tea, rupasov.
56877+
56878+   Tail plugins (or, more precisely, tail policy plugins) determine
56879+   when the last part of a file should be stored in a formatted item.
56880+
56881+ Scope and lookup:
56882+
56883+   Each plugin type and each plugin has a label, such that the pair
56884+   ( type_label, plugin_label ) is unique. This pair is a globally
56885+   persistent and user-visible plugin identifier. Internally, the kernel
56886+   maintains plugins and plugin types in arrays, using an index into
56887+   those arrays as the plugin and plugin type identifiers. The
56888+   file-system, in turn, also maintains a persistent "dictionary" which
56889+   maps a plugin label to the numerical identifier that is stored in
56890+   file-system objects. That is, we store the offset into the plugin array
56891+   for that plugin type as the plugin id in the stat data of the filesystem object.
56892+
56893+   The internal kernel plugin type identifier (index in the plugins[]
56894+   array) is of type reiser4_plugin_type. The set of available plugin
56895+   types is currently static, but dynamic loading doesn't seem to pose
56896+   insurmountable problems.
56897+
56898+ Within each type plugins are addressed by the identifiers of type
56899+ reiser4_plugin_id (indices in reiser4_plugin_type_data.builtin[]).
56900+ Such identifiers are only required to be unique within one type,
56901+ not globally.
56902+
56903+   Thus, a plugin in memory is uniquely identified by the pair (type_id,
56904+   id).
56905+
56906+ Usage:
56907+
56908+   There exists only one copy of each plugin instance, but this single
56909+   copy can be associated with many entities (file-system objects,
56910+   items, nodes, transactions, file-descriptors etc.). An entity to
56911+   which a plugin of a given type is attached is termed (due to the
56912+   lack of imagination) a "subject" of this plugin type and, by abuse
56913+   of terminology, a subject of the particular instance of this type
56914+   to which it's currently attached. For example, an inode is a subject
56915+   of the object plugin type. An inode representing a directory is a
56916+   subject of the directory plugin, of the hash plugin type, and of some
56917+   particular instance of the hash plugin type. An inode representing
56918+   a regular file is a subject of the "regular file" plugin, of the tail-policy plugin type, etc.
56919+
56920+   With each subject the plugin possibly stores some state. For example,
56921+   the state of a directory plugin (an instance of the object plugin type)
56922+   is a pointer to a hash plugin (if directories always use hashing, that is).
56923+
56924+ Interface:
56925+
56926+   In addition to a scalar identifier, each plugin type and each plugin
56927+   proper has a "label" (a short string) and a "description" (a longer
56928+   descriptive string). Labels and descriptions of plugin types are
56929+   hard-coded into the plugins[] array, declared and defined in
56930+   plugin.c. The label and description of a plugin are stored in the
56931+   .label and .desc fields of reiser4_plugin_header respectively. It's
56932+   possible to locate a plugin by this pair of labels.
56933+
56934+ Features (not implemented):
56935+
56936+ . user-level plugin manipulations:
56937+ + reiser4("filename/..file_plugin<='audit'");
56938+ + write(open("filename/..file_plugin"), "audit", 8);
56939+
56940+  . user-level utilities lsplug and chplug to manipulate plugins.
56941+    These utilities are not of primary priority, and possibly they
56942+    will not be working in v4.0
56943+
56944+ NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount
56945+ option, do you agree? I don't think that specifying it at mount time,
56946+ and then changing it with each mount, is a good model for usage.
56947+
56948+ . mount option "plug" to set-up plugins of root-directory.
56949+ "plug=foo:bar" will set "bar" as default plugin of type "foo".
56950+
56951+ Limitations:
56952+
56953+    . each plugin type has to provide at least one builtin
56954+    plugin. This is a technical limitation and it can be lifted in the
56955+    future.
56956+
56957+ TODO:
56958+
56959+  New plugin types/plugins:
56960+ Things we should be able to separately choose to inherit:
56961+
56962+ security plugins
56963+
56964+ stat data
56965+
56966+ file bodies
56967+
56968+ file plugins
56969+
56970+ dir plugins
56971+
56972+ . perm:acl
56973+
56974+  . audi---audit plugin intercepting and possibly logging all
56975+    accesses to an object. Requires putting stub functions in
56976+    file_operations instead of generic_file_*.
56977+
56978+NIKITA-FIXME-HANS: why make overflows a plugin?
56979+ . over---handle hash overflows
56980+
56981+  . sqnt---handle different access patterns and instrument read-ahead
56982+
56983+NIKITA-FIXME-HANS: describe the line below in more detail.
56984+
56985+ . hier---handle inheritance of plugins along file-system hierarchy
56986+
56987+ Different kinds of inheritance: on creation vs. on access.
56988+ Compatible/incompatible plugins.
56989+ Inheritance for multi-linked files.
56990+ Layered plugins.
56991+ Notion of plugin context is abandoned.
56992+
56993+Each file is associated
56994+ with one plugin, and dependent plugins (hash, etc.) are stored as
56995+ main plugin state. Now, if we have plugins used for regular files
56996+ but not for directories, how would such plugins be inherited?
56997+ . always store them with directories also
56998+
56999+NIKITA-FIXME-HANS: Do the line above. It is not exclusive of doing
57000+the line below which is also useful.
57001+
57002+ . use inheritance hierarchy, independent of file-system namespace
57003+*/
57004+
57005+#include "../debug.h"
57006+#include "../dformat.h"
57007+#include "plugin_header.h"
57008+#include "item/static_stat.h"
57009+#include "node/node.h"
57010+#include "security/perm.h"
57011+#include "space/space_allocator.h"
57012+#include "disk_format/disk_format.h"
57013+#include "plugin.h"
57014+#include "../reiser4.h"
57015+#include "../jnode.h"
57016+#include "../inode.h"
57017+
57018+#include <linux/fs.h> /* for struct super_block */
57019+
57020+/*
57021+ * init_plugins - initialize plugin sub-system.
57022+ * Call this once on reiser4 startup.
57023+ *
57024+ * This is part of reiser4 module initialization. For each plugin of
57025+ * each type, the init method is called and each plugin is put into the
57026+ * list of plugins of its type.
57027+ */
57028+int init_plugins(void)
57029+{
57030+ reiser4_plugin_type type_id;
57031+
57032+ for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
57033+ struct reiser4_plugin_type_data *ptype;
57034+ int i;
57035+
57036+ ptype = &plugins[type_id];
57037+ assert("nikita-3508", ptype->label != NULL);
57038+ assert("nikita-3509", ptype->type_id == type_id);
57039+
57040+ INIT_LIST_HEAD(&ptype->plugins_list);
57041+/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */
57042+ for (i = 0; i < ptype->builtin_num; ++i) {
57043+ reiser4_plugin *plugin;
57044+
57045+ plugin = plugin_at(ptype, i);
57046+
57047+ if (plugin->h.label == NULL)
57048+ /* uninitialized slot encountered */
57049+ continue;
57050+ assert("nikita-3445", plugin->h.type_id == type_id);
57051+ plugin->h.id = i;
57052+ if (plugin->h.pops != NULL &&
57053+ plugin->h.pops->init != NULL) {
57054+ int result;
57055+
57056+ result = plugin->h.pops->init(plugin);
57057+ if (result != 0)
57058+ return result;
57059+ }
57060+ INIT_LIST_HEAD(&plugin->h.linkage);
57061+ list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
57062+ }
57063+ }
57064+ return 0;
57065+}
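/*
 * Illustrative call-site sketch, not part of the patch: init_plugins() is
 * meant to be invoked once from reiser4 module initialization, failing the
 * module load if any plugin's ->init() hook fails. The function name below
 * is hypothetical.
 */
static int __init reiser4_module_init_sketch(void)
{
	int result;

	result = init_plugins();
	if (result != 0)
		return result;	/* some plugin's ->init() hook failed */
	/* ... the rest of reiser4 initialization would follow here ... */
	return 0;
}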
57066+
57067+/* true if plugin type id is valid */
57068+int is_plugin_type_valid(reiser4_plugin_type type)
57069+{
57070+ /* "type" is unsigned, so no comparison with 0 is
57071+ necessary */
57072+ return (type < REISER4_PLUGIN_TYPES);
57073+}
57074+
57075+/* true if plugin id is valid */
57076+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id)
57077+{
57078+ assert("nikita-1653", is_plugin_type_valid(type));
57079+ return id < plugins[type].builtin_num;
57080+}
57081+
57082+/* return plugin by its @type and @id.
57083+
57084+   Both arguments are checked for validity: this is supposed to be called
57085+   with ids passed in from user-level.
57086+
57087+NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
57088+user space, and passed to the filesystem by use of method files? Your
57089+comment really confused me on the first reading....
57090+
57091+*/
57092+reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type
57093+ * unchecked */,
57094+ reiser4_plugin_id id /* plugin id,
57095+ * unchecked */)
57096+{
57097+ if (is_plugin_type_valid(type)) {
57098+ if (is_plugin_id_valid(type, id))
57099+ return plugin_at(&plugins[type], id);
57100+ else
57101+ /* id out of bounds */
57102+ warning("nikita-2913",
57103+ "Invalid plugin id: [%i:%i]", type, id);
57104+ } else
57105+ /* type_id out of bounds */
57106+ warning("nikita-2914", "Invalid type_id: %i", type);
57107+ return NULL;
57108+}
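/*
 * Usage sketch (hypothetical caller): because both the type and the id may
 * come from user space or from disk, the returned pointer must be checked
 * for NULL before use.
 */
static int check_unsafe_hash_id(reiser4_plugin_id id)
{
	reiser4_plugin *plug;

	plug = plugin_by_unsafe_id(REISER4_HASH_PLUGIN_TYPE, id);
	if (plug == NULL)
		return RETERR(-EINVAL);	/* invalid type or id was rejected */
	return 0;
}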
57109+
57110+/**
57111+ * save_plugin_id - store plugin id in disk format
57112+ * @plugin: plugin to convert
57113+ * @area: where to store result
57114+ *
57115+ * Puts id of @plugin in little endian format to address @area.
57116+ */
57117+int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
57118+ d16 *area /* where to store result */ )
57119+{
57120+ assert("nikita-1261", plugin != NULL);
57121+ assert("nikita-1262", area != NULL);
57122+
57123+ put_unaligned(cpu_to_le16(plugin->h.id), area);
57124+ return 0;
57125+}
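/*
 * Counterpart sketch, an assumption rather than code from this patch:
 * reading a plugin id back from its on-disk little endian representation
 * would mirror save_plugin_id() above.
 */
static inline reiser4_plugin_id load_plugin_id_sketch(const d16 *area)
{
	return le16_to_cpu(get_unaligned(area));
}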
57126+
57127+/* list of all plugins of given type */
57128+struct list_head *get_plugin_list(reiser4_plugin_type type)
57129+{
57130+ assert("nikita-1056", is_plugin_type_valid(type));
57131+ return &plugins[type].plugins_list;
57132+}
57133+
57134+static void update_pset_mask(reiser4_inode * info, pset_member memb)
57135+{
57136+ struct dentry *rootdir;
57137+ reiser4_inode *root;
57138+
57139+ assert("edward-1443", memb != PSET_FILE);
57140+
57141+ rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
57142+ if (rootdir != NULL) {
57143+ root = reiser4_inode_data(rootdir->d_inode);
57144+		/*
57145+		 * if the inode's plugin differs from the root's default, or we
57146+		 * are changing a plugin of the root directory, update plugin_mask
57147+		 */
57148+ if (aset_get(info->pset, memb) !=
57149+ aset_get(root->pset, memb) ||
57150+ info == root)
57151+ info->plugin_mask |= (1 << memb);
57152+ else
57153+ info->plugin_mask &= ~(1 << memb);
57154+ }
57155+}
57156+
57157+/* Get the specified plugin set member from the parent,
57158+   or from fs-defaults (if no parent is given), and
57159+   install the result into the pset of @self */
57160+int grab_plugin_pset(struct inode *self,
57161+ struct inode *ancestor,
57162+ pset_member memb)
57163+{
57164+ reiser4_plugin *plug;
57165+ reiser4_inode *info;
57166+ int result = 0;
57167+
57168+ /* Do not grab if initialised already. */
57169+ info = reiser4_inode_data(self);
57170+ if (aset_get(info->pset, memb) != NULL)
57171+ return 0;
57172+ if (ancestor) {
57173+ reiser4_inode *parent;
57174+
57175+ parent = reiser4_inode_data(ancestor);
57176+ plug = aset_get(parent->hset, memb) ? :
57177+ aset_get(parent->pset, memb);
57178+ }
57179+ else
57180+ plug = get_default_plugin(memb);
57181+
57182+ result = set_plugin(&info->pset, memb, plug);
57183+ if (result == 0) {
57184+ if (!ancestor || self->i_sb->s_root->d_inode != self)
57185+ update_pset_mask(info, memb);
57186+ }
57187+ return result;
57188+}
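/*
 * Usage sketch (hypothetical call site): when a new object is created, each
 * needed pset member can be inherited from the parent directory this way;
 * passing a NULL ancestor falls back to the fs-wide defaults.
 */
static int inherit_hash_plugin_sketch(struct inode *object, struct inode *parent)
{
	return grab_plugin_pset(object, parent, PSET_HASH);
}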
57189+
57190+/* Take missing pset members from root inode */
57191+int finish_pset(struct inode *inode)
57192+{
57193+ reiser4_plugin *plug;
57194+ reiser4_inode *root;
57195+ reiser4_inode *info;
57196+ pset_member memb;
57197+ int result = 0;
57198+
57199+ root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
57200+ info = reiser4_inode_data(inode);
57201+
57202+ assert("edward-1455", root != NULL);
57203+ assert("edward-1456", info != NULL);
57204+
57205+ /* file and directory plugins are already initialized. */
57206+ for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) {
57207+
57208+ /* Do not grab if initialised already. */
57209+ if (aset_get(info->pset, memb) != NULL)
57210+ continue;
57211+
57212+ plug = aset_get(root->pset, memb);
57213+ result = set_plugin(&info->pset, memb, plug);
57214+ if (result != 0)
57215+ break;
57216+ }
57217+ if (result != 0) {
57218+ warning("nikita-3447",
57219+ "Cannot set up plugins for %lli",
57220+ (unsigned long long)
57221+ get_inode_oid(inode));
57222+ }
57223+ return result;
57224+}
57225+
57226+int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin * plug)
57227+{
57228+ reiser4_inode *info;
57229+ int result = 0;
57230+
57231+ if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) {
57232+ /* Changing pset in the root object. */
57233+ return RETERR(-EINVAL);
57234+ }
57235+
57236+ info = reiser4_inode_data(self);
57237+ if (plug->h.pops != NULL && plug->h.pops->change != NULL)
57238+ result = plug->h.pops->change(self, plug, memb);
57239+ else
57240+ result = aset_set_unsafe(&info->pset, memb, plug);
57241+ if (result == 0) {
57242+ __u16 oldmask = info->plugin_mask;
57243+
57244+ update_pset_mask(info, memb);
57245+ if (oldmask != info->plugin_mask)
57246+ reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN);
57247+ }
57248+ return result;
57249+}
57250+
57251+struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
57252+ /* C90 initializers */
57253+ [REISER4_FILE_PLUGIN_TYPE] = {
57254+ .type_id = REISER4_FILE_PLUGIN_TYPE,
57255+ .label = "file",
57256+ .desc = "Object plugins",
57257+ .builtin_num = sizeof_array(file_plugins),
57258+ .builtin = file_plugins,
57259+ .plugins_list = {NULL, NULL},
57260+ .size = sizeof(file_plugin)
57261+ },
57262+ [REISER4_DIR_PLUGIN_TYPE] = {
57263+ .type_id = REISER4_DIR_PLUGIN_TYPE,
57264+ .label = "dir",
57265+ .desc = "Directory plugins",
57266+ .builtin_num = sizeof_array(dir_plugins),
57267+ .builtin = dir_plugins,
57268+ .plugins_list = {NULL, NULL},
57269+ .size = sizeof(dir_plugin)
57270+ },
57271+ [REISER4_HASH_PLUGIN_TYPE] = {
57272+ .type_id = REISER4_HASH_PLUGIN_TYPE,
57273+ .label = "hash",
57274+ .desc = "Directory hashes",
57275+ .builtin_num = sizeof_array(hash_plugins),
57276+ .builtin = hash_plugins,
57277+ .plugins_list = {NULL, NULL},
57278+ .size = sizeof(hash_plugin)
57279+ },
57280+ [REISER4_FIBRATION_PLUGIN_TYPE] = {
57281+ .type_id =
57282+ REISER4_FIBRATION_PLUGIN_TYPE,
57283+ .label = "fibration",
57284+ .desc = "Directory fibrations",
57285+ .builtin_num = sizeof_array(fibration_plugins),
57286+ .builtin = fibration_plugins,
57287+ .plugins_list = {NULL, NULL},
57288+ .size = sizeof(fibration_plugin)
57289+ },
57290+ [REISER4_CIPHER_PLUGIN_TYPE] = {
57291+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
57292+ .label = "cipher",
57293+ .desc = "Cipher plugins",
57294+ .builtin_num = sizeof_array(cipher_plugins),
57295+ .builtin = cipher_plugins,
57296+ .plugins_list = {NULL, NULL},
57297+ .size = sizeof(cipher_plugin)
57298+ },
57299+ [REISER4_DIGEST_PLUGIN_TYPE] = {
57300+ .type_id = REISER4_DIGEST_PLUGIN_TYPE,
57301+ .label = "digest",
57302+ .desc = "Digest plugins",
57303+ .builtin_num = sizeof_array(digest_plugins),
57304+ .builtin = digest_plugins,
57305+ .plugins_list = {NULL, NULL},
57306+ .size = sizeof(digest_plugin)
57307+ },
57308+ [REISER4_COMPRESSION_PLUGIN_TYPE] = {
57309+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
57310+ .label = "compression",
57311+ .desc = "Compression plugins",
57312+ .builtin_num = sizeof_array(compression_plugins),
57313+ .builtin = compression_plugins,
57314+ .plugins_list = {NULL, NULL},
57315+ .size = sizeof(compression_plugin)
57316+ },
57317+ [REISER4_FORMATTING_PLUGIN_TYPE] = {
57318+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
57319+ .label = "formatting",
57320+ .desc = "Tail inlining policies",
57321+ .builtin_num = sizeof_array(formatting_plugins),
57322+ .builtin = formatting_plugins,
57323+ .plugins_list = {NULL, NULL},
57324+ .size = sizeof(formatting_plugin)
57325+ },
57326+ [REISER4_PERM_PLUGIN_TYPE] = {
57327+ .type_id = REISER4_PERM_PLUGIN_TYPE,
57328+ .label = "perm",
57329+ .desc = "Permission checks",
57330+ .builtin_num = sizeof_array(perm_plugins),
57331+ .builtin = perm_plugins,
57332+ .plugins_list = {NULL, NULL},
57333+ .size = sizeof(perm_plugin)
57334+ },
57335+ [REISER4_ITEM_PLUGIN_TYPE] = {
57336+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
57337+ .label = "item",
57338+ .desc = "Item handlers",
57339+ .builtin_num = sizeof_array(item_plugins),
57340+ .builtin = item_plugins,
57341+ .plugins_list = {NULL, NULL},
57342+ .size = sizeof(item_plugin)
57343+ },
57344+ [REISER4_NODE_PLUGIN_TYPE] = {
57345+ .type_id = REISER4_NODE_PLUGIN_TYPE,
57346+ .label = "node",
57347+ .desc = "node layout handlers",
57348+ .builtin_num = sizeof_array(node_plugins),
57349+ .builtin = node_plugins,
57350+ .plugins_list = {NULL, NULL},
57351+ .size = sizeof(node_plugin)
57352+ },
57353+ [REISER4_SD_EXT_PLUGIN_TYPE] = {
57354+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
57355+ .label = "sd_ext",
57356+ .desc = "Parts of stat-data",
57357+ .builtin_num = sizeof_array(sd_ext_plugins),
57358+ .builtin = sd_ext_plugins,
57359+ .plugins_list = {NULL, NULL},
57360+ .size = sizeof(sd_ext_plugin)
57361+ },
57362+ [REISER4_FORMAT_PLUGIN_TYPE] = {
57363+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
57364+ .label = "disk_layout",
57365+ .desc = "defines filesystem on disk layout",
57366+ .builtin_num = sizeof_array(format_plugins),
57367+ .builtin = format_plugins,
57368+ .plugins_list = {NULL, NULL},
57369+ .size = sizeof(disk_format_plugin)
57370+ },
57371+ [REISER4_JNODE_PLUGIN_TYPE] = {
57372+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
57373+ .label = "jnode",
57374+ .desc = "defines kind of jnode",
57375+ .builtin_num = sizeof_array(jnode_plugins),
57376+ .builtin = jnode_plugins,
57377+ .plugins_list = {NULL, NULL},
57378+ .size = sizeof(jnode_plugin)
57379+ },
57380+ [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
57381+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
57382+ .label = "compression_mode",
57383+ .desc = "Defines compression mode",
57384+ .builtin_num = sizeof_array(compression_mode_plugins),
57385+ .builtin = compression_mode_plugins,
57386+ .plugins_list = {NULL, NULL},
57387+ .size = sizeof(compression_mode_plugin)
57388+ },
57389+ [REISER4_CLUSTER_PLUGIN_TYPE] = {
57390+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
57391+ .label = "cluster",
57392+ .desc = "Defines cluster size",
57393+ .builtin_num = sizeof_array(cluster_plugins),
57394+ .builtin = cluster_plugins,
57395+ .plugins_list = {NULL, NULL},
57396+ .size = sizeof(cluster_plugin)
57397+ }
57398+};
57399+
57400+/*
57401+ * Local variables:
57402+ * c-indentation-style: "K&R"
57403+ * mode-name: "LC"
57404+ * c-basic-offset: 8
57405+ * tab-width: 8
57406+ * fill-column: 120
57407+ * End:
57408+ */
57409diff -urN linux-2.6.24.orig/fs/reiser4/plugin/plugin.h linux-2.6.24/fs/reiser4/plugin/plugin.h
57410--- linux-2.6.24.orig/fs/reiser4/plugin/plugin.h 1970-01-01 03:00:00.000000000 +0300
57411+++ linux-2.6.24/fs/reiser4/plugin/plugin.h 2008-01-25 11:39:07.052237570 +0300
57412@@ -0,0 +1,937 @@
57413+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
57414+
57415+/* Basic plugin data-types.
57416+ see fs/reiser4/plugin/plugin.c for details */
57417+
57418+#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ )
57419+#define __FS_REISER4_PLUGIN_TYPES_H__
57420+
57421+#include "../forward.h"
57422+#include "../debug.h"
57423+#include "../dformat.h"
57424+#include "../key.h"
57425+#include "compress/compress.h"
57426+#include "crypto/cipher.h"
57427+#include "plugin_header.h"
57428+#include "item/static_stat.h"
57429+#include "item/internal.h"
57430+#include "item/sde.h"
57431+#include "item/cde.h"
57432+#include "item/item.h"
57433+#include "node/node.h"
57434+#include "node/node40.h"
57435+#include "security/perm.h"
57436+#include "fibration.h"
57437+
57438+#include "space/bitmap.h"
57439+#include "space/space_allocator.h"
57440+
57441+#include "disk_format/disk_format40.h"
57442+#include "disk_format/disk_format.h"
57443+
57444+#include <linux/fs.h> /* for struct super_block, address_space */
57445+#include <linux/mm.h> /* for struct page */
57446+#include <linux/buffer_head.h> /* for struct buffer_head */
57447+#include <linux/dcache.h> /* for struct dentry */
57448+#include <linux/types.h>
57449+#include <linux/crypto.h>
57450+
57451+typedef struct reiser4_object_on_wire reiser4_object_on_wire;
57452+
57453+/*
57454+ * File plugin. Defines the set of methods that file plugins implement, some
57455+ * of which are optional.
57456+ *
57457+ * A file plugin offers the caller an interface for IO (writing to and/or
57458+ * reading from) what the caller sees as one sequence of bytes. An IO to it
57459+ * may affect more than one physical sequence of bytes, or no physical
57460+ * sequence of bytes; it may affect sequences of bytes offered by other file
57461+ * plugins to the semantic layer; and the file plugin may invoke other
57462+ * plugins and delegate work to them. But its interface is structured to
57463+ * offer the caller the ability to read and/or write what the caller sees
57464+ * as a single sequence of bytes.
57465+ *
57466+ * The file plugin must present a sequence of bytes to the caller, but it
57467+ * does not necessarily have to store one, nor does it necessarily have to
57468+ * support efficient tree traversal to any offset in the sequence of bytes
57469+ * (tail and extent items, whose keys contain offsets, do however provide
57470+ * efficient non-sequential lookup of any offset in the sequence of bytes).
57471+ *
57472+ * Directory plugins provide methods for selecting file plugins by resolving a
57473+ * name for them.
57474+ *
57475+ * The functionality other filesystems call an attribute, and rigidly tie
57476+ * together, we decompose into orthogonal selectable features of files. Using
57477+ * the terminology we will define next, an attribute is a perhaps constrained,
57478+ * perhaps static length, file whose parent has a uni-count-intra-link to it,
57479+ * which might be grandparent-major-packed, and whose parent has a deletion
57480+ * method that deletes it.
57481+ *
57482+ * File plugins can implement constraints.
57483+ *
57484+ * Files can be of variable length (e.g. regular unix files), or of static
57485+ * length (e.g. static sized attributes).
57486+ *
57487+ * An object may have many sequences of bytes, and many file plugins, but it
57488+ * has exactly one objectid. It is usually desirable that an object has a
57489+ * deletion method which deletes every item with that objectid. Items cannot
57490+ * in general be found by just their objectids. This means that an object must
57491+ * have either a method built into its deletion plugin method for knowing what
57492+ * items need to be deleted, or links stored with the object that provide the
57493+ * plugin with a method for finding those items. Deleting a file within an
57494+ * object may or may not have the effect of deleting the entire object,
57495+ * depending on the file plugin's deletion method.
57496+ *
57497+ * LINK TAXONOMY:
57498+ *
57499+ * Many objects have a reference count, and when the reference count reaches 0
57500+ * the object's deletion method is invoked. Some links embody a reference
57501+ * count increase ("countlinks"), and others do not ("nocountlinks").
57502+ *
57503+ * Some links are bi-directional links ("bilinks"), and some are
57504+ * uni-directional("unilinks").
57505+ *
57506+ * Some links are between parts of the same object ("intralinks"), and some are
57507+ * between different objects ("interlinks").
57508+ *
57509+ * PACKING TAXONOMY:
57510+ *
57511+ * Some items of an object are stored with a major packing locality based on
57512+ * their object's objectid (e.g. unix directory items in plan A), and these are
57513+ * called "self-major-packed".
57514+ *
57515+ * Some items of an object are stored with a major packing locality based on
57516+ * their semantic parent object's objectid (e.g. unix file bodies in plan A),
57517+ * and these are called "parent-major-packed".
57518+ *
57519+ * Some items of an object are stored with a major packing locality based on
57520+ * their semantic grandparent, and these are called "grandparent-major-packed".
57521+ * Now carefully notice that we run into trouble with key length if we have to
57522+ * store an 8 byte major+minor grandparent based packing locality, an 8 byte
57523+ * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
57524+ * a 24 byte key. One of these fields must be sacrificed if an item is to be
57525+ * grandparent-major-packed, and which to sacrifice is left to the item author
57526+ * choosing to make the item grandparent-major-packed. You cannot make tail
57527+ * items and extent items grandparent-major-packed, though you could make them
57528+ * self-major-packed (usually they are parent-major-packed).
57529+ *
57530+ * In the case of ACLs (which are composed of fixed length ACEs which consist
57531+ * of {subject-type, subject, and permission bitmask} triples), it makes sense
57532+ * to not have an offset field in the ACE item key, and to allow duplicate keys
57533+ * for ACEs. Thus, the set of ACEs for a given file is found by looking for a
57534+ * key consisting of the objectid of the grandparent (thus grouping all ACLs in
57535+ * a directory together), the minor packing locality of ACE, the objectid of
57536+ * the file, and 0.
57537+ *
57538+ * IO involves moving data from one location to another, which means that two
57539+ * locations must be specified, source and destination.
57540+ *
57541+ * This source and destination can be in the filesystem, or they can be a
57542+ * pointer in the user process address space plus a byte count.
57543+ *
57544+ * If both source and destination are in the filesystem, then at least one of
57545+ * them must be representable as a pure stream of bytes (which we call a flow,
57546+ * and define as a struct containing a key, a data pointer, and a length).
57547+ * This may mean converting one of them into a flow. We provide a generic
57548+ * cast_into_flow() method, which will work for any plugin supporting
57549+ * read_flow(), though it is inefficiently implemented in that it temporarily
57550+ * stores the flow in a buffer (Question: what to do with huge flows that
57551+ * cannot fit into memory? Answer: we must not convert them all at once.)
57552+ *
57553+ * Performing a write requires resolving the write request into a flow defining
57554+ * the source, and a method that performs the write, and a key that defines
57555+ * where in the tree the write is to go.
57556+ *
57557+ * Performing a read requires resolving the read request into a flow defining
57558+ * the target, and a method that performs the read, and a key that defines
57559+ * where in the tree the read is to come from.
57560+ *
57561+ * There will exist file plugins which have no pluginid stored on the disk for
57562+ * them, and which are only invoked by other plugins.
57563+ */
57564+
57565+/* This should be incremented with each new contributed
57566+ pair (plugin type, plugin id).
57567+ NOTE: Make sure there is a release of reiser4progs
57568+ with the corresponding version number */
57569+#define PLUGIN_LIBRARY_VERSION 0
57570+
57571+ /* enumeration of fields within plugin_set */
57572+typedef enum {
57573+ PSET_FILE,
57574+ PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements:
57575+ * inode.c:read_inode() depends on this. */
57576+ PSET_PERM,
57577+ PSET_FORMATTING,
57578+ PSET_HASH,
57579+ PSET_FIBRATION,
57580+ PSET_SD,
57581+ PSET_DIR_ITEM,
57582+ PSET_CIPHER,
57583+ PSET_DIGEST,
57584+ PSET_COMPRESSION,
57585+ PSET_COMPRESSION_MODE,
57586+ PSET_CLUSTER,
57587+ PSET_CREATE,
57588+ PSET_LAST
57589+} pset_member;
57590+
57591+/* builtin file-plugins */
57592+typedef enum {
57593+ /* regular file */
57594+ UNIX_FILE_PLUGIN_ID,
57595+ /* directory */
57596+ DIRECTORY_FILE_PLUGIN_ID,
57597+ /* symlink */
57598+ SYMLINK_FILE_PLUGIN_ID,
57599+ /* for objects completely handled by the VFS: fifos, devices,
57600+ sockets */
57601+ SPECIAL_FILE_PLUGIN_ID,
57602+ /* regular cryptcompress file */
57603+ CRYPTCOMPRESS_FILE_PLUGIN_ID,
57604+ /* number of file plugins. Used as size of arrays to hold
57605+ file plugins. */
57606+ LAST_FILE_PLUGIN_ID
57607+} reiser4_file_id;
57608+
57609+typedef struct file_plugin {
57610+
57611+ /* generic fields */
57612+ plugin_header h;
57613+
57614+	/* VFS methods.
57615+	 * These must be invariant with respect to plugin conversion.
57616+	 * This can be achieved by using "common" methods, which
57617+	 * are the same for all plugins that participate in
57618+	 * conversion, or by using "generic" or "careful" methods,
57619+	 * which provide automatic redirection to the proper private
57620+	 * plugin methods ("careful" is the same as "generic",
57621+	 * but with protection of pset and other disk structures
57622+	 * from being rebuilt during conversion).
57623+	 */
57624+ struct inode_operations * inode_ops;
57625+ struct file_operations * file_ops;
57626+ struct address_space_operations * as_ops;
57627+ /**
57628+ * Private methods. These are optional. If used they will allow you
57629+ * to minimize the amount of code needed to implement a deviation
57630+ * from some other method that also uses them.
57631+ */
57632+ /*
57633+ * private inode_ops
57634+ */
57635+ int (*setattr)(struct dentry *, struct iattr *);
57636+ /*
57637+ * private file_ops
57638+ */
57639+ /* do whatever is necessary to do when object is opened */
57640+ int (*open) (struct inode * inode, struct file * file);
57641+ ssize_t (*read) (struct file *, char __user *buf, size_t read_amount,
57642+ loff_t *off);
57643+	/* write as many bytes as possible from the nominated @write_amount
57644+	 * before plugin scheduling occurs. Save the scheduling state
57645+	 * in @cont */
57646+ ssize_t (*write) (struct file *, const char __user *buf,
57647+ size_t write_amount, loff_t * off,
57648+ struct psched_context * cont);
57649+ int (*ioctl) (struct inode *inode, struct file *filp,
57650+ unsigned int cmd, unsigned long arg);
57651+ int (*mmap) (struct file *, struct vm_area_struct *);
57652+ int (*release) (struct inode *, struct file *);
57653+ /*
57654+ * private a_ops
57655+ */
57656+ int (*readpage) (struct file *file, struct page *page);
57657+ int (*readpages)(struct file *file, struct address_space *mapping,
57658+ struct list_head *pages, unsigned nr_pages);
57659+ int (*writepages)(struct address_space *mapping,
57660+ struct writeback_control *wbc);
57661+ int (*prepare_write)(struct file *file, struct page *page,
57662+ unsigned from, unsigned to);
57663+ int (*commit_write)(struct file *file, struct page *page,
57664+ unsigned from, unsigned to);
57665+ sector_t (*bmap) (struct address_space * mapping, sector_t lblock);
57666+ /* other private methods */
57667+ /* save inode cached stat-data onto disk. It was called
57668+ reiserfs_update_sd() in 3.x */
57669+ int (*write_sd_by_inode) (struct inode *);
57670+ /*
57671+ * Construct flow into @flow according to user-supplied data.
57672+ *
57673+	 * This is used by the read/write methods to construct a flow to
57674+	 * write/read. ->flow_by_inode() is a plugin method, rather than a
57675+	 * single global implementation, because the key in a flow used by a
57676+	 * plugin may depend on the data in @buf.
57677+ *
57678+ * NIKITA-FIXME-HANS: please create statistics on what functions are
57679+ * dereferenced how often for the mongo benchmark. You can supervise
57680+ * Elena doing this for you if that helps. Email me the list of the
57681+ * top 10, with their counts, and an estimate of the total number of
57682+ * CPU cycles spent dereferencing as a percentage of CPU cycles spent
57683+ * processing (non-idle processing). If the total percent is, say,
57684+ * less than 1%, it will make our coding discussions much easier, and
57685+ * keep me from questioning whether functions like the below are too
57686+ * frequently called to be dereferenced. If the total percent is more
57687+ * than 1%, perhaps private methods should be listed in a "required"
57688+ * comment at the top of each plugin (with stern language about how if
57689+ * the comment is missing it will not be accepted by the maintainer),
57690+ * and implemented using macros not dereferenced functions. How about
57691+ * replacing this whole private methods part of the struct with a
57692+ * thorough documentation of what the standard helper functions are for
57693+ * use in constructing plugins? I think users have been asking for
57694+ * that, though not in so many words.
57695+ */
57696+ int (*flow_by_inode) (struct inode *, const char __user *buf,
57697+ int user, loff_t size,
57698+ loff_t off, rw_op op, flow_t *);
57699+ /*
57700+ * Return the key used to retrieve an offset of a file. It is used by
57701+ * default implementation of ->flow_by_inode() method
57702+ * (common_build_flow()) and, among other things, to get to the extent
57703+ * from jnode of unformatted node.
57704+ */
57705+ int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
57706+
57707+ /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */
57708+ /*
57709+ * set the plugin for a file. Called during file creation in creat()
57710+ * but not reiser4() unless an inode already exists for the file.
57711+ */
57712+ int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
57713+ reiser4_object_create_data *);
57714+
57715+ /* NIKITA-FIXME-HANS: comment and name seem to say different things,
57716+ * are you setting up the object itself also or just adjusting the
57717+ * parent?.... */
57718+ /* set up plugins for new @object created in @parent. @root is root
57719+ directory. */
57720+ int (*adjust_to_parent) (struct inode *object, struct inode *parent,
57721+ struct inode *root);
57722+ /*
57723+	 * this does whatever is necessary when an object is created. For
57724+	 * instance, for unix files a stat data item is inserted. It is supposed
57725+	 * to be called by the create method of struct inode_operations.
57726+ */
57727+ int (*create_object) (struct inode *object, struct inode *parent,
57728+ reiser4_object_create_data *);
57729+ /*
57730+ * this method should check REISER4_NO_SD and set REISER4_NO_SD on
57731+ * success. Deletion of an object usually includes removal of items
57732+ * building file body (for directories this is removal of "." and "..")
57733+ * and removal of stat-data item.
57734+ */
57735+ int (*delete_object) (struct inode *);
57736+
57737+ /* add link from @parent to @object */
57738+ int (*add_link) (struct inode *object, struct inode *parent);
57739+
57740+ /* remove link from @parent to @object */
57741+ int (*rem_link) (struct inode *object, struct inode *parent);
57742+
57743+ /*
57744+ * return true if item addressed by @coord belongs to @inode. This is
57745+ * used by read/write to properly slice flow into items in presence of
57746+ * multiple key assignment policies, because items of a file are not
57747+ * necessarily contiguous in a key space, for example, in a plan-b.
57748+ */
57749+ int (*owns_item) (const struct inode *, const coord_t *);
57750+
57751+	/* checks whether yet another hard link to this object can be
57752+	   added */
57753+ int (*can_add_link) (const struct inode *);
57754+
57755+ /* checks whether hard links to this object can be removed */
57756+ int (*can_rem_link) (const struct inode *);
57757+
57758+	/* currently non-empty only for DIRECTORY_FILE_PLUGIN_ID. It calls
57759+	   detach of the directory plugin to remove ".." */
57760+ int (*detach) (struct inode * child, struct inode * parent);
57761+
57762+	/* called when @child has just been looked up in @parent. It is
57763+	   non-empty only for DIRECTORY_FILE_PLUGIN_ID, where it calls attach
57764+	   of the directory plugin */
57765+ int (*bind) (struct inode * child, struct inode * parent);
57766+
57767+ /* process safe-link during mount */
57768+ int (*safelink) (struct inode * object, reiser4_safe_link_t link,
57769+ __u64 value);
57770+
57771+	/* A set of estimate methods for file operations */
57772+ struct {
57773+ reiser4_block_nr(*create) (const struct inode *);
57774+ reiser4_block_nr(*update) (const struct inode *);
57775+ reiser4_block_nr(*unlink) (const struct inode *,
57776+ const struct inode *);
57777+ } estimate;
57778+
57779+ /*
57780+	 * the reiser4-specific part of the inode has a union of
57781+	 * plugin-specific structures. This method is called when an inode is
57782+	 * read (read_inode) and when a file is created (common_create_child),
57783+	 * so that the file plugin can initialize its inode data
57784+ */
57785+ void (*init_inode_data) (struct inode *, reiser4_object_create_data *,
57786+ int);
57787+
57788+ /*
57789+ * This method performs progressive deletion of items and whole nodes
57790+ * from right to left.
57791+ *
57792+ * @tap: the point deletion process begins from,
57793+ * @from_key: the beginning of the deleted key range,
57794+ * @to_key: the end of the deleted key range,
57795+ * @smallest_removed: the smallest removed key,
57796+ *
57797+	 * @return: 0 on success, error code otherwise; -E_REPEAT means that a long
57798+	 * cut_tree operation was interrupted to allow an atom commit.
57799+ */
57800+ int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
57801+ const reiser4_key * to_key,
57802+ reiser4_key * smallest_removed, struct inode *,
57803+ int, int *);
57804+
57805+ /* called from ->destroy_inode() */
57806+ void (*destroy_inode) (struct inode *);
57807+
57808+ /*
57809+	 * methods to serialize object identity. This is used, for example, by
57810+ * reiser4_{en,de}code_fh().
57811+ */
57812+ struct {
57813+ /* store object's identity at @area */
57814+ char *(*write) (struct inode * inode, char *area);
57815+ /* parse object from wire to the @obj */
57816+ char *(*read) (char *area, reiser4_object_on_wire * obj);
57817+ /* given object identity in @obj, find or create its dentry */
57818+ struct dentry *(*get) (struct super_block * s,
57819+ reiser4_object_on_wire * obj);
57820+ /* how many bytes ->wire.write() consumes */
57821+ int (*size) (struct inode * inode);
57822+		/* finish with object identity */
57823+ void (*done) (reiser4_object_on_wire * obj);
57824+ } wire;
57825+} file_plugin;
57826+
57827+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
57828+
57829+struct reiser4_object_on_wire {
57830+ file_plugin *plugin;
57831+ union {
57832+ struct {
57833+ obj_key_id key_id;
57834+ } std;
57835+ void *generic;
57836+ } u;
57837+};
57838+
57839+/* builtin dir-plugins */
57840+typedef enum {
57841+ HASHED_DIR_PLUGIN_ID,
57842+ SEEKABLE_HASHED_DIR_PLUGIN_ID,
57843+ LAST_DIR_ID
57844+} reiser4_dir_id;
57845+
57846+typedef struct dir_plugin {
57847+ /* generic fields */
57848+ plugin_header h;
57849+
57850+ struct inode_operations * inode_ops;
57851+ struct file_operations * file_ops;
57852+ struct address_space_operations * as_ops;
57853+
57854+ /*
57855+ * private methods: These are optional. If used they will allow you to
57856+ * minimize the amount of code needed to implement a deviation from
57857+ * some other method that uses them. You could logically argue that
57858+ * they should be a separate type of plugin.
57859+ */
57860+
57861+ struct dentry *(*get_parent) (struct inode * childdir);
57862+
57863+ /*
57864+	 * check whether "name" is an acceptable name to be inserted into this
57865+	 * object. Optionally implemented by directory-like objects. Can check
57866+	 * for maximal length, reserved symbols, etc.
57867+ */
57868+ int (*is_name_acceptable) (const struct inode * inode, const char *name,
57869+ int len);
57870+
57871+	void (*build_entry_key) (const struct inode * dir /* directory the
57872+	                                                   * entry is (or will
57873+	                                                   * be) in. */ ,
57874+ const struct qstr * name /* name of file
57875+ * referenced by this
57876+ * entry */ ,
57877+ reiser4_key * result /* resulting key of
57878+ * directory entry */ );
57879+ int (*build_readdir_key) (struct file * dir, reiser4_key * result);
57880+ int (*add_entry) (struct inode * object, struct dentry * where,
57881+ reiser4_object_create_data * data,
57882+ reiser4_dir_entry_desc * entry);
57883+ int (*rem_entry) (struct inode * object, struct dentry * where,
57884+ reiser4_dir_entry_desc * entry);
57885+
57886+ /*
57887+ * initialize directory structure for newly created object. For normal
57888+ * unix directories, insert dot and dotdot.
57889+ */
57890+ int (*init) (struct inode * object, struct inode * parent,
57891+ reiser4_object_create_data * data);
57892+
57893+ /* destroy directory */
57894+ int (*done) (struct inode * child);
57895+
57896+ /* called when @subdir was just looked up in the @dir */
57897+ int (*attach) (struct inode * subdir, struct inode * dir);
57898+ int (*detach) (struct inode * subdir, struct inode * dir);
57899+
57900+ struct {
57901+ reiser4_block_nr(*add_entry) (const struct inode *);
57902+ reiser4_block_nr(*rem_entry) (const struct inode *);
57903+ reiser4_block_nr(*unlink) (const struct inode *,
57904+ const struct inode *);
57905+ } estimate;
57906+} dir_plugin;
57907+
57908+extern dir_plugin dir_plugins[LAST_DIR_ID];
57909+
57910+typedef struct formatting_plugin {
57911+ /* generic fields */
57912+ plugin_header h;
57913+ /* returns non-zero iff file's tail has to be stored
57914+ in a direct item. */
57915+ int (*have_tail) (const struct inode * inode, loff_t size);
57916+} formatting_plugin;
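/* Minimal instance sketch: a ->have_tail() policy that never stores the
   file's tail in a direct item, matching what NEVER_TAILS_FORMATTING_ID
   (declared below) presumably does. Illustrative only; the real builtins
   are defined elsewhere in this patch. */
static int have_tail_never_sketch(const struct inode *inode, loff_t size)
{
	(void)inode;
	(void)size;
	return 0;	/* non-zero would mean: store the tail in a direct item */
}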
57917+
57918+typedef struct hash_plugin {
57919+ /* generic fields */
57920+ plugin_header h;
57921+ /* computes hash of the given name */
57922+ __u64(*hash) (const unsigned char *name, int len);
57923+} hash_plugin;
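/* Toy instance sketch with the required signature, not one of the real
   builtins listed further below: a trivial multiplicative string hash. */
static __u64 toy_name_hash_sketch(const unsigned char *name, int len)
{
	__u64 hash = 5381;
	int i;

	for (i = 0; i < len; ++i)
		hash = (hash << 5) + hash + name[i];	/* hash * 33 + c */
	return hash;
}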
57924+
57925+typedef struct cipher_plugin {
57926+ /* generic fields */
57927+ plugin_header h;
57928+ struct crypto_blkcipher * (*alloc) (void);
57929+ void (*free) (struct crypto_blkcipher * tfm);
57930+	/* Offset translator. For each offset this returns (k * offset), where
57931+	   k (k >= 1) is the expansion factor of the cipher algorithm.
57932+	   For all symmetric algorithms k == 1. For asymmetric algorithms
57933+	   (which inflate data) the offset translation guarantees that all of
57934+	   a disk cluster's units will have keys smaller than the next cluster's.
57935+	*/
57936+ loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src);
57937+	/* Cipher algorithms can accept data only in chunks of the cipher block
57938+	   size. This method aligns any flow up to the cipher block size before
57939+	   we pass it to the cipher algorithm. To align means to append padding
57940+	   in a format specific to the cipher algorithm */
57941+ int (*align_stream) (__u8 * tail, int clust_size, int blocksize);
57942+ /* low-level key manager (check, install, etc..) */
57943+ int (*setkey) (struct crypto_tfm * tfm, const __u8 * key,
57944+ unsigned int keylen);
57945+ /* main text processing procedures */
57946+ void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
57947+ void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
57948+} cipher_plugin;
57949+
57950+typedef struct digest_plugin {
57951+ /* generic fields */
57952+ plugin_header h;
57953+ /* fingerprint size in bytes */
57954+ int fipsize;
57955+ struct crypto_hash * (*alloc) (void);
57956+ void (*free) (struct crypto_hash * tfm);
57957+} digest_plugin;
57958+
57959+typedef struct compression_plugin {
57960+ /* generic fields */
57961+ plugin_header h;
57962+ int (*init) (void);
57963+	/* the maximum number of bytes by which the size of the "compressed"
57964+	 * data can exceed the size of the uncompressed data. */
57965+ int (*overrun) (unsigned src_len);
57966+ coa_t(*alloc) (tfm_action act);
57967+ void (*free) (coa_t coa, tfm_action act);
57968+ /* minimal size of the flow we still try to compress */
57969+ int (*min_size_deflate) (void);
57970+ __u32(*checksum) (char *data, __u32 length);
57971+ /* main transform procedures */
57972+ void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len,
57973+ __u8 * dst_first, unsigned *dst_len);
57974+ void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len,
57975+ __u8 * dst_first, unsigned *dst_len);
57976+} compression_plugin;
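/* Transform sketch, assuming the calling convention implied by the
   signatures above: a pass-through "compressor" would copy the input and
   report the output length via *dst_len. The real builtins live in
   plugin/compress/compress.c (see the extern declaration below). */
static void copy_transform_sketch(coa_t coa, __u8 *src_first, unsigned src_len,
				  __u8 *dst_first, unsigned *dst_len)
{
	(void)coa;	/* a plain copy needs no per-transform work area */
	memcpy(dst_first, src_first, src_len);
	*dst_len = src_len;
}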
57977+
57978+typedef struct compression_mode_plugin {
57979+ /* generic fields */
57980+ plugin_header h;
57981+ /* this is called when estimating compressibility
57982+ of a logical cluster by its content */
57983+ int (*should_deflate) (struct inode * inode, cloff_t index);
57984+ /* this is called when results of compression should be saved */
57985+ int (*accept_hook) (struct inode * inode, cloff_t index);
57986+ /* this is called when results of compression should be discarded */
57987+ int (*discard_hook) (struct inode * inode, cloff_t index);
57988+} compression_mode_plugin;
57989+
57990+typedef struct cluster_plugin {
57991+ /* generic fields */
57992+ plugin_header h;
57993+ int shift;
57994+} cluster_plugin;
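/* Helper sketch, under the assumption that ->shift encodes log2 of the
   cluster size in bytes (consistent with the CLUSTER_64K..CLUSTER_4K ids
   declared further below): */
static inline size_t cluster_size_sketch(const cluster_plugin *cplug)
{
	return (size_t)1 << cplug->shift;
}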
57995+
57996+typedef struct sd_ext_plugin {
57997+ /* generic fields */
57998+ plugin_header h;
57999+ int (*present) (struct inode * inode, char **area, int *len);
58000+ int (*absent) (struct inode * inode);
58001+ int (*save_len) (struct inode * inode);
58002+ int (*save) (struct inode * inode, char **area);
58003+ /* alignment requirement for this stat-data part */
58004+ int alignment;
58005+} sd_ext_plugin;
58006+
58007+/* this plugin contains methods to allocate an objectid for newly created
58008+   files, to deallocate the objectid when a file is removed, and to report
58009+   the number of used and free objectids */
58010+typedef struct oid_allocator_plugin {
58011+ /* generic fields */
58012+ plugin_header h;
58013+ int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
58014+ __u64 oids);
58015+ /* used to report statfs->f_files */
58016+ __u64(*oids_used) (reiser4_oid_allocator * map);
58017+ /* get next oid to use */
58018+ __u64(*next_oid) (reiser4_oid_allocator * map);
58019+ /* used to report statfs->f_ffree */
58020+ __u64(*oids_free) (reiser4_oid_allocator * map);
58021+ /* allocate new objectid */
58022+ int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
58023+ /* release objectid */
58024+ int (*release_oid) (reiser4_oid_allocator * map, oid_t);
58025+ /* how many pages to reserve in transaction for allocation of new
58026+ objectid */
58027+ int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
58028+ /* how many pages to reserve in transaction for freeing of an
58029+ objectid */
58030+ int (*oid_reserve_release) (reiser4_oid_allocator * map);
58031+ void (*print_info) (const char *, reiser4_oid_allocator *);
58032+} oid_allocator_plugin;
58033+
58034+/* disk layout plugin: this specifies the locations of the super block,
58035+   journal, bitmaps (if there are any), etc */
58036+typedef struct disk_format_plugin {
58037+ /* generic fields */
58038+ plugin_header h;
58039+ /* replay journal, initialize super_info_data, etc */
58040+ int (*init_format) (struct super_block *, void *data);
58041+
58042+ /* key of root directory stat data */
58043+ const reiser4_key *(*root_dir_key) (const struct super_block *);
58044+
58045+ int (*release) (struct super_block *);
58046+ jnode *(*log_super) (struct super_block *);
58047+ int (*check_open) (const struct inode * object);
58048+ int (*version_update) (struct super_block *);
58049+} disk_format_plugin;
58050+
58051+struct jnode_plugin {
58052+ /* generic fields */
58053+ plugin_header h;
58054+ int (*init) (jnode * node);
58055+ int (*parse) (jnode * node);
58056+ struct address_space *(*mapping) (const jnode * node);
58057+ unsigned long (*index) (const jnode * node);
58058+ jnode *(*clone) (jnode * node);
58059+};
58060+
58061+/* plugin instance.                                                         */
58062+/*                                                                          */
58063+/* This is a "wrapper" union for all types of plugins. Most of the code    */
58064+/* uses plugins of a particular type (file_plugin, dir_plugin, etc.)       */
58065+/* rather than operating on pointers to reiser4_plugin. This union is only */
58066+/* used in some generic code in plugin/plugin.c that operates on all       */
58067+/* plugins. Technically speaking, the purpose of this union is to add type */
58068+/* safety to said generic code: each plugin type (file_plugin, for         */
58069+/* example) contains plugin_header as its first member. This first member  */
58070+/* is located at the same place in memory as the .h member of              */
58071+/* reiser4_plugin. Generic code obtains a pointer to reiser4_plugin and    */
58072+/* looks in .h, which is the header of the plugin type located in the      */
58073+/* union. This allows type-casts to be avoided.                            */
58074+union reiser4_plugin {
58075+ /* generic fields */
58076+ plugin_header h;
58077+ /* file plugin */
58078+ file_plugin file;
58079+ /* directory plugin */
58080+ dir_plugin dir;
58081+ /* hash plugin, used by directory plugin */
58082+ hash_plugin hash;
58083+ /* fibration plugin used by directory plugin */
58084+ fibration_plugin fibration;
58085+ /* cipher transform plugin, used by file plugin */
58086+ cipher_plugin cipher;
58087+ /* digest transform plugin, used by file plugin */
58088+ digest_plugin digest;
58089+ /* compression transform plugin, used by file plugin */
58090+ compression_plugin compression;
58091+ /* tail plugin, used by file plugin */
58092+ formatting_plugin formatting;
58093+ /* permission plugin */
58094+ perm_plugin perm;
58095+ /* node plugin */
58096+ node_plugin node;
58097+ /* item plugin */
58098+ item_plugin item;
58099+ /* stat-data extension plugin */
58100+ sd_ext_plugin sd_ext;
58101+ /* disk layout plugin */
58102+ disk_format_plugin format;
58103+ /* object id allocator plugin */
58104+ oid_allocator_plugin oid_allocator;
58105+ /* plugin for different jnode types */
58106+ jnode_plugin jnode;
58107+ /* compression mode plugin, used by object plugin */
58108+ compression_mode_plugin compression_mode;
58109+ /* cluster plugin, used by object plugin */
58110+ cluster_plugin clust;
58111+ /* place-holder for new plugin types that can be registered
58112+ dynamically, and used by other dynamically loaded plugins. */
58113+ void *generic;
58114+};
58115+
58116+struct reiser4_plugin_ops {
58117+ /* called when plugin is initialized */
58118+ int (*init) (reiser4_plugin * plugin);
58119+ /* called when plugin is unloaded */
58120+ int (*done) (reiser4_plugin * plugin);
58121+ /* load given plugin from disk */
58122+ int (*load) (struct inode * inode,
58123+ reiser4_plugin * plugin, char **area, int *len);
58124+	/* how much space is required to store this plugin's state
58125+	   in stat-data */
58126+ int (*save_len) (struct inode * inode, reiser4_plugin * plugin);
58127+ /* save persistent plugin-data to disk */
58128+ int (*save) (struct inode * inode, reiser4_plugin * plugin,
58129+ char **area);
58130+ /* alignment requirement for on-disk state of this plugin
58131+ in number of bytes */
58132+ int alignment;
58133+	/* install itself into the given inode. This can return an error
58134+	   (e.g., you cannot change the hash of a non-empty directory). */
58135+ int (*change) (struct inode * inode, reiser4_plugin * plugin,
58136+ pset_member memb);
58137+	/* inherit itself from @parent into the given inode. This can also
58138+	   return an error. */
58139+ int (*inherit) (struct inode * inode, struct inode * parent,
58140+ reiser4_plugin * plugin);
58141+};
58142+
58143+/* functions implemented in fs/reiser4/plugin/plugin.c */
58144+
58145+/* stores plugin reference in reiser4-specific part of inode */
58146+extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
58147+extern int init_plugins(void);
58148+
58149+/* builtin plugins */
58150+
58151+/* builtin hash-plugins */
58152+
58153+typedef enum {
58154+ RUPASOV_HASH_ID,
58155+ R5_HASH_ID,
58156+ TEA_HASH_ID,
58157+ FNV1_HASH_ID,
58158+ DEGENERATE_HASH_ID,
58159+ LAST_HASH_ID
58160+} reiser4_hash_id;
58161+
58162+/* builtin cipher plugins */
58163+
58164+typedef enum {
58165+ NONE_CIPHER_ID,
58166+ LAST_CIPHER_ID
58167+} reiser4_cipher_id;
58168+
58169+/* builtin digest plugins */
58170+
58171+typedef enum {
58172+ SHA256_32_DIGEST_ID,
58173+ LAST_DIGEST_ID
58174+} reiser4_digest_id;
58175+
58176+/* builtin compression mode plugins */
58177+typedef enum {
58178+ NONE_COMPRESSION_MODE_ID,
58179+ LATTD_COMPRESSION_MODE_ID,
58180+ ULTIM_COMPRESSION_MODE_ID,
58181+ FORCE_COMPRESSION_MODE_ID,
58182+ CONVX_COMPRESSION_MODE_ID,
58183+ LAST_COMPRESSION_MODE_ID
58184+} reiser4_compression_mode_id;
58185+
58186+/* builtin cluster plugins */
58187+typedef enum {
58188+ CLUSTER_64K_ID,
58189+ CLUSTER_32K_ID,
58190+ CLUSTER_16K_ID,
58191+ CLUSTER_8K_ID,
58192+ CLUSTER_4K_ID,
58193+ LAST_CLUSTER_ID
58194+} reiser4_cluster_id;
58195+
58196+/* builtin tail-plugins */
58197+
58198+typedef enum {
58199+ NEVER_TAILS_FORMATTING_ID,
58200+ ALWAYS_TAILS_FORMATTING_ID,
58201+ SMALL_FILE_FORMATTING_ID,
58202+ LAST_TAIL_FORMATTING_ID
58203+} reiser4_formatting_id;
58204+
58205+/* data type used to pack parameters that we pass to vfs object creation
58206+ function create_object() */
58207+struct reiser4_object_create_data {
58208+ /* plugin to control created object */
58209+ reiser4_file_id id;
58210+ /* mode of regular file, directory or special file */
58211+/* what happens if some other sort of perm plugin is in use? */
58212+ int mode;
58213+ /* rdev of special file */
58214+ dev_t rdev;
58215+ /* symlink target */
58216+ const char *name;
58217+ /* add here something for non-standard objects you invent, like
58218+ query for interpolation file etc. */
58219+
58220+ struct reiser4_crypto_info * crypto;
58221+
58222+ struct inode *parent;
58223+ struct dentry *dentry;
58224+};
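/* Illustrative initializer sketch (hypothetical call site): packing the
   parameters for creation of a regular unix file. Fields not set here
   (rdev, name, crypto) are left zeroed by the memset. */
static void fill_create_data_sketch(struct reiser4_object_create_data *data,
				    struct inode *parent, struct dentry *dentry)
{
	memset(data, 0, sizeof(*data));
	data->id = UNIX_FILE_PLUGIN_ID;
	data->mode = S_IFREG | 0644;
	data->parent = parent;
	data->dentry = dentry;
}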
58225+
58226+/* description of directory entry being created/destroyed/sought for
58227+
58228+   It is passed down to the directory plugin and further to the
58229+   directory item plugin methods. Creation of a new directory entry is
58230+   done in several stages: first we search for an entry with the same
58231+   name, then create a new one. reiser4_dir_entry_desc is used to store
58232+   information collected at one stage of this process and required later:
58233+   the key of the item that we want to insert/delete and a pointer to the
58234+   object that will be bound by the new directory entry. Probably some
58235+   more fields will be added here.
58236+
58237+*/
58238+struct reiser4_dir_entry_desc {
58239+ /* key of directory entry */
58240+ reiser4_key key;
58241+ /* object bound by this entry. */
58242+ struct inode *obj;
58243+};
58244+
58245+#define MAX_PLUGIN_TYPE_LABEL_LEN 32
58246+#define MAX_PLUGIN_PLUG_LABEL_LEN 32
58247+
58248+#define PLUGIN_BY_ID(TYPE,ID,FIELD) \
58249+static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \
58250+{ \
58251+ reiser4_plugin *plugin = plugin_by_id ( ID, id ); \
58252+ return plugin ? & plugin -> FIELD : NULL; \
58253+} \
58254+static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \
58255+{ \
58256+ reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \
58257+ return plugin ? & plugin -> FIELD : NULL; \
58258+} \
58259+static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \
58260+{ \
58261+ reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \
58262+ return plugin ? & plugin -> FIELD : NULL; \
58263+} \
58264+static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \
58265+{ \
58266+ return ( reiser4_plugin * ) plugin; \
58267+} \
58268+static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \
58269+{ \
58270+ return TYPE ## _to_plugin (plugin) -> h.id; \
58271+} \
58272+typedef struct { int foo; } TYPE ## _plugin_dummy
58273+
58274+PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
58275+PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
58276+PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
58277+PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
58278+PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
58279+PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
58280+PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
58281+PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
58282+PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
58283+PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
58284+PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
58285+PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
58286+PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
58287+PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
58288+PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
58289+ compression_mode);
58290+PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
58291+
58292+extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
58293+
58294+extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
58295+
58296+#define for_all_plugins(ptype, plugin) \
58297+for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \
58298+ get_plugin_list(ptype) != &plugin->h.linkage; \
58299+ plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
58300+
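/* Illustrative sketch: the PLUGIN_BY_ID expansions above generate typed
 * lookup helpers, and for_all_plugins() iterates over every registered
 * plugin of a given type. A caller might use them like this:
 *
 *	hash_plugin *hplug = hash_plugin_by_id(R5_HASH_ID);
 *	reiser4_plugin *plugin;
 *
 *	for_all_plugins(REISER4_HASH_PLUGIN_TYPE, plugin)
 *		printk("hash plugin: %s\n", plugin->h.label);
 */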
58301+
58302+extern int grab_plugin_pset(struct inode *self, struct inode *ancestor, pset_member memb);
58303+extern int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin *plug);
58304+extern int finish_pset(struct inode *inode);
58305+
58306+/* defined in fs/reiser4/plugin/object.c */
58307+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
58308+/* defined in fs/reiser4/plugin/object.c */
58309+extern dir_plugin dir_plugins[LAST_DIR_ID];
58310+/* defined in fs/reiser4/plugin/item/static_stat.c */
58311+extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
58312+/* defined in fs/reiser4/plugin/hash.c */
58313+extern hash_plugin hash_plugins[LAST_HASH_ID];
58314+/* defined in fs/reiser4/plugin/fibration.c */
58315+extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
58316+/* defined in fs/reiser4/plugin/crypt.c */
58317+extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
58318+/* defined in fs/reiser4/plugin/digest.c */
58319+extern digest_plugin digest_plugins[LAST_DIGEST_ID];
58320+/* defined in fs/reiser4/plugin/compress/compress.c */
58321+extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
58322+/* defined in fs/reiser4/plugin/compress/compression_mode.c */
58323+extern compression_mode_plugin
58324+compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
58325+/* defined in fs/reiser4/plugin/cluster.c */
58326+extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
58327+/* defined in fs/reiser4/plugin/tail.c */
58328+extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
58329+/* defined in fs/reiser4/plugin/security/security.c */
58330+extern perm_plugin perm_plugins[LAST_PERM_ID];
58331+/* defined in fs/reiser4/plugin/item/item.c */
58332+extern item_plugin item_plugins[LAST_ITEM_ID];
58333+/* defined in fs/reiser4/plugin/node/node.c */
58334+extern node_plugin node_plugins[LAST_NODE_ID];
58335+/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
58336+extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
58337+
58338+/* __FS_REISER4_PLUGIN_TYPES_H__ */
58339+#endif
58340+
58341+/* Make Linus happy.
58342+ Local variables:
58343+ c-indentation-style: "K&R"
58344+ mode-name: "LC"
58345+ c-basic-offset: 8
58346+ tab-width: 8
58347+ fill-column: 120
58348+ End:
58349+*/
58350diff -urN linux-2.6.24.orig/fs/reiser4/plugin/plugin_header.h linux-2.6.24/fs/reiser4/plugin/plugin_header.h
58351--- linux-2.6.24.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 03:00:00.000000000 +0300
58352+++ linux-2.6.24/fs/reiser4/plugin/plugin_header.h 2008-01-25 11:39:07.052237570 +0300
58353@@ -0,0 +1,155 @@
58354+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58355+
58356+/* plugin header. Data structures required by all plugin types. */
58357+
58358+#if !defined( __PLUGIN_HEADER_H__ )
58359+#define __PLUGIN_HEADER_H__
58360+
58361+/* plugin data-types and constants */
58362+
58363+#include "../debug.h"
58364+#include "../dformat.h"
58365+
58366+/* Every plugin type can be considered as a class of virtual objects
58367+ {(type, i) | i = 0, 1, ...}, which has one of the following categories
58368+ of virtualization:
58369+ A - no virtualization;
58370+ F - per-file virtualization;
58371+ S - per-superblock virtualization;
58372+ FIXME-EDWARD: Define every such category */
58373+
58374+/* Supported plugin types: (id, (virtualization category), short description) */
58375+typedef enum {
58376+	REISER4_FILE_PLUGIN_TYPE, /* (F) service VFS entry-points */
58377+	REISER4_DIR_PLUGIN_TYPE, /* (F) service VFS entry-points */
58378+ REISER4_ITEM_PLUGIN_TYPE, /* (F) manage items */
58379+ REISER4_NODE_PLUGIN_TYPE, /* (S) manage formatted nodes */
58380+ REISER4_HASH_PLUGIN_TYPE, /* (F) compute hash */
58381+ REISER4_FIBRATION_PLUGIN_TYPE, /* (F) directory fibrations */
58382+ REISER4_FORMATTING_PLUGIN_TYPE, /* (F) tail-packing policy */
58383+ REISER4_PERM_PLUGIN_TYPE, /* stub (vacancy) */
58384+ REISER4_SD_EXT_PLUGIN_TYPE, /* (A) stat-data extensions */
58385+ REISER4_FORMAT_PLUGIN_TYPE, /* (S) specify disk format */
58386+ REISER4_JNODE_PLUGIN_TYPE, /* (A) in-memory node headers */
58387+ REISER4_CIPHER_PLUGIN_TYPE, /* (F) cipher transform algs */
58388+ REISER4_DIGEST_PLUGIN_TYPE, /* (F) digest transform algs */
58389+ REISER4_COMPRESSION_PLUGIN_TYPE, /* (F) compression tfm algs */
58390+ REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* (F) compression heuristic */
58391+ REISER4_CLUSTER_PLUGIN_TYPE, /* (F) size of logical cluster */
58392+ REISER4_PLUGIN_TYPES
58393+} reiser4_plugin_type;
58394+
58395+/* Supported plugin groups */
58396+typedef enum {
58397+ REISER4_DIRECTORY_FILE,
58398+ REISER4_REGULAR_FILE,
58399+ REISER4_SYMLINK_FILE,
58400+ REISER4_SPECIAL_FILE,
58401+} file_plugin_group;
58402+
58403+struct reiser4_plugin_ops;
58404+/* generic plugin operations, supported by each
58405+ plugin type. */
58406+typedef struct reiser4_plugin_ops reiser4_plugin_ops;
58407+
58408+/* the common part of all plugin instances. */
58409+typedef struct plugin_header {
58410+ /* plugin type */
58411+ reiser4_plugin_type type_id;
58412+ /* id of this plugin */
58413+ reiser4_plugin_id id;
58414+ /* bitmask of groups the plugin belongs to. */
58415+ reiser4_plugin_groups groups;
58416+ /* plugin operations */
58417+ reiser4_plugin_ops *pops;
58418+/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */
58419+ /* short label of this plugin */
58420+ const char *label;
58421+	/* descriptive string */
58422+ const char *desc;
58423+ /* list linkage */
58424+ struct list_head linkage;
58425+} plugin_header;
58426+
58427+#define plugin_of_group(plug, group) (plug->h.groups & (1 << group))
58428+
58429+/* PRIVATE INTERFACES */
58430+/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */
58431+/* plugin type representation. */
58432+struct reiser4_plugin_type_data {
58433+ /* internal plugin type identifier. Should coincide with
58434+ index of this item in plugins[] array. */
58435+ reiser4_plugin_type type_id;
58436+ /* short symbolic label of this plugin type. Should be no longer
58437+ than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
58438+ const char *label;
58439+ /* plugin type description longer than .label */
58440+ const char *desc;
58441+
58442+/* NIKITA-FIXME-HANS: define built-in */
58443+ /* number of built-in plugin instances of this type */
58444+ int builtin_num;
58445+ /* array of built-in plugins */
58446+ void *builtin;
58447+ struct list_head plugins_list;
58448+ size_t size;
58449+};
58450+
58451+extern struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
58452+
58453+int is_plugin_type_valid(reiser4_plugin_type type);
58454+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
58455+
58456+static inline reiser4_plugin *plugin_at(struct reiser4_plugin_type_data * ptype,
58457+ int i)
58458+{
58459+ char *builtin;
58460+
58461+ builtin = ptype->builtin;
58462+ return (reiser4_plugin *) (builtin + i * ptype->size);
58463+}
58464+
58465+/* return plugin by its @type_id and @id */
58466+static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
58467+ reiser4_plugin_id id)
58468+{
58469+ assert("nikita-1651", is_plugin_type_valid(type));
58470+ assert("nikita-1652", is_plugin_id_valid(type, id));
58471+ return plugin_at(&plugins[type], id);
58472+}
58473+
58474+extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
58475+ reiser4_plugin_id id);
58476+
58477+/**
58478+ * plugin_by_disk_id - get reiser4_plugin
58479+ * @type_id: plugin type id
58480+ * @plugin_id: plugin id in disk format
58481+ *
58482+ * Returns reiser4_plugin by plugin type id and plugin id.
58483+ */
58484+static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
58485+ reiser4_plugin_type type_id,
58486+ __le16 *plugin_id)
58487+{
58488+ /*
58489+ * what we should do properly is to maintain within each file-system a
58490+ * dictionary that maps on-disk plugin ids to "universal" ids. This
58491+	 * dictionary will be resolved at mount time, so that this function
58492+ * will perform just one additional array lookup.
58493+ */
58494+ return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
58495+}
58496+
58497+/* __PLUGIN_HEADER_H__ */
58498+#endif
58499+
58500+/*
58501+ * Local variables:
58502+ * c-indentation-style: "K&R"
58503+ * mode-name: "LC"
58504+ * c-basic-offset: 8
58505+ * tab-width: 8
58506+ * fill-column: 79
58507+ * End:
58508+ */
58509diff -urN linux-2.6.24.orig/fs/reiser4/plugin/plugin_set.c linux-2.6.24/fs/reiser4/plugin/plugin_set.c
58510--- linux-2.6.24.orig/fs/reiser4/plugin/plugin_set.c 1970-01-01 03:00:00.000000000 +0300
58511+++ linux-2.6.24/fs/reiser4/plugin/plugin_set.c 2008-01-25 11:39:07.052237570 +0300
58512@@ -0,0 +1,379 @@
58513+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58514+ * reiser4/README */
58515+/* This file contains Reiser4 plugin set operations */
58516+
58517+/* plugin sets
58518+ *
58519+ * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
58520+ * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
58521+ * assigned (inherited, deduced from mode bits, etc.) at creation time. This
58522+ * set of plugins (the so-called pset) is described by the structure
58523+ * plugin_set (see plugin/plugin_set.h), which contains pointers to all
58524+ * required plugins.
58525+ *
58526+ * Children can inherit some pset members from their parent, but sometimes
58527+ * it is useful to specify members different from the parent's. Since an
58528+ * object's pset cannot be easily changed without fatal consequences, we use
58529+ * for this purpose another special plugin table (the so-called hset, or heir
58530+ * set) described by the same structure.
58531+ *
58532+ * An inode only stores pointers to its pset and hset. Different inodes with
58533+ * the same set of pset (hset) members point to the same pset (hset). This is
58534+ * achieved by storing psets and hsets in a global hash table. Races are avoided
58535+ * by the simple (and so far efficient) solution of never recycling psets, even when the last inode pointing to one is destroyed.
58536+ */
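/* Example (sketch, assuming the R5 hash plugin is registered): because psets
 * are interned in the hash table, two psets that end up with identical
 * members collapse to one shared plugin_set instance:
 *
 *	plugin_set *a = plugin_set_get_empty();
 *	plugin_set *b = plugin_set_get_empty();
 *	reiser4_plugin *h = plugin_by_id(REISER4_HASH_PLUGIN_TYPE, R5_HASH_ID);
 *
 *	set_plugin(&a, PSET_HASH, h);
 *	set_plugin(&b, PSET_HASH, h);
 *	(now a == b: both point at the same interned plugin_set)
 */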
58537+
58538+#include "../debug.h"
58539+#include "../super.h"
58540+#include "plugin_set.h"
58541+
58542+#include <linux/slab.h>
58543+#include <linux/stddef.h>
58544+
58545+/* slab for plugin sets */
58546+static struct kmem_cache *plugin_set_slab;
58547+
58548+static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
58549+ [0 ... 7] = SPIN_LOCK_UNLOCKED
58550+};
58551+
58552+/* hash table support */
58553+
58554+#define PS_TABLE_SIZE (32)
58555+
58556+static inline plugin_set *cast_to(const unsigned long *a)
58557+{
58558+ return container_of(a, plugin_set, hashval);
58559+}
58560+
58561+static inline int pseq(const unsigned long *a1, const unsigned long *a2)
58562+{
58563+ plugin_set *set1;
58564+ plugin_set *set2;
58565+
58566+ /* make sure fields are not missed in the code below */
58567+ cassert(sizeof *set1 ==
58568+ sizeof set1->hashval +
58569+ sizeof set1->link +
58570+ sizeof set1->file +
58571+ sizeof set1->dir +
58572+ sizeof set1->perm +
58573+ sizeof set1->formatting +
58574+ sizeof set1->hash +
58575+ sizeof set1->fibration +
58576+ sizeof set1->sd +
58577+ sizeof set1->dir_item +
58578+ sizeof set1->cipher +
58579+ sizeof set1->digest +
58580+ sizeof set1->compression +
58581+ sizeof set1->compression_mode +
58582+ sizeof set1->cluster +
58583+ sizeof set1->create);
58584+
58585+ set1 = cast_to(a1);
58586+ set2 = cast_to(a2);
58587+ return
58588+ set1->hashval == set2->hashval &&
58589+ set1->file == set2->file &&
58590+ set1->dir == set2->dir &&
58591+ set1->perm == set2->perm &&
58592+ set1->formatting == set2->formatting &&
58593+ set1->hash == set2->hash &&
58594+ set1->fibration == set2->fibration &&
58595+ set1->sd == set2->sd &&
58596+ set1->dir_item == set2->dir_item &&
58597+ set1->cipher == set2->cipher &&
58598+ set1->digest == set2->digest &&
58599+ set1->compression == set2->compression &&
58600+ set1->compression_mode == set2->compression_mode &&
58601+ set1->cluster == set2->cluster &&
58602+ set1->create == set2->create;
58603+}
58604+
58605+#define HASH_FIELD(hash, set, field) \
58606+({ \
58607+ (hash) += (unsigned long)(set)->field >> 2; \
58608+})
58609+
58610+static inline unsigned long calculate_hash(const plugin_set * set)
58611+{
58612+ unsigned long result;
58613+
58614+ result = 0;
58615+ HASH_FIELD(result, set, file);
58616+ HASH_FIELD(result, set, dir);
58617+ HASH_FIELD(result, set, perm);
58618+ HASH_FIELD(result, set, formatting);
58619+ HASH_FIELD(result, set, hash);
58620+ HASH_FIELD(result, set, fibration);
58621+ HASH_FIELD(result, set, sd);
58622+ HASH_FIELD(result, set, dir_item);
58623+ HASH_FIELD(result, set, cipher);
58624+ HASH_FIELD(result, set, digest);
58625+ HASH_FIELD(result, set, compression);
58626+ HASH_FIELD(result, set, compression_mode);
58627+ HASH_FIELD(result, set, cluster);
58628+ HASH_FIELD(result, set, create);
58629+ return result & (PS_TABLE_SIZE - 1);
58630+}
58631+
58632+static inline unsigned long
58633+pshash(ps_hash_table * table, const unsigned long *a)
58634+{
58635+ return *a;
58636+}
58637+
58638+/* The hash table definition */
58639+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
58640+#define KFREE(ptr, size) kfree(ptr)
58641+TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
58642+ pseq);
58643+#undef KFREE
58644+#undef KMALLOC
58645+
58646+static ps_hash_table ps_table;
58647+static plugin_set empty_set = {
58648+ .hashval = 0,
58649+ .file = NULL,
58650+ .dir = NULL,
58651+ .perm = NULL,
58652+ .formatting = NULL,
58653+ .hash = NULL,
58654+ .fibration = NULL,
58655+ .sd = NULL,
58656+ .dir_item = NULL,
58657+ .cipher = NULL,
58658+ .digest = NULL,
58659+ .compression = NULL,
58660+ .compression_mode = NULL,
58661+ .cluster = NULL,
58662+ .create = NULL,
58663+ .link = {NULL}
58664+};
58665+
58666+plugin_set *plugin_set_get_empty(void)
58667+{
58668+ return &empty_set;
58669+}
58670+
58671+void plugin_set_put(plugin_set * set)
58672+{
58673+}
58674+
58675+static inline unsigned long *pset_field(plugin_set * set, int offset)
58676+{
58677+ return (unsigned long *)(((char *)set) + offset);
58678+}
58679+
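/* plugin_set_field() below replaces one member of *@set while keeping psets
 * interned: it builds a local replica with the member at @offset set to
 * @val, looks the replica up in the global hash table, and either reuses an
 * existing twin or inserts a freshly allocated copy under the per-bucket
 * spinlock. */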
58680+static int plugin_set_field(plugin_set ** set, const unsigned long val,
58681+ const int offset)
58682+{
58683+ unsigned long *spot;
58684+ spinlock_t *lock;
58685+ plugin_set replica;
58686+ plugin_set *twin;
58687+ plugin_set *psal;
58688+ plugin_set *orig;
58689+
58690+ assert("nikita-2902", set != NULL);
58691+ assert("nikita-2904", *set != NULL);
58692+
58693+ spot = pset_field(*set, offset);
58694+ if (unlikely(*spot == val))
58695+ return 0;
58696+
58697+ replica = *(orig = *set);
58698+ *pset_field(&replica, offset) = val;
58699+ replica.hashval = calculate_hash(&replica);
58700+ rcu_read_lock();
58701+ twin = ps_hash_find(&ps_table, &replica.hashval);
58702+ if (unlikely(twin == NULL)) {
58703+ rcu_read_unlock();
58704+ psal = kmem_cache_alloc(plugin_set_slab,
58705+ reiser4_ctx_gfp_mask_get());
58706+ if (psal == NULL)
58707+ return RETERR(-ENOMEM);
58708+ *psal = replica;
58709+ lock = &plugin_set_lock[replica.hashval & 7];
58710+ spin_lock(lock);
58711+ twin = ps_hash_find(&ps_table, &replica.hashval);
58712+ if (likely(twin == NULL)) {
58713+ *set = psal;
58714+ ps_hash_insert_rcu(&ps_table, psal);
58715+ } else {
58716+ *set = twin;
58717+ kmem_cache_free(plugin_set_slab, psal);
58718+ }
58719+ spin_unlock(lock);
58720+ } else {
58721+ rcu_read_unlock();
58722+ *set = twin;
58723+ }
58724+ return 0;
58725+}
58726+
58727+static struct {
58728+ int offset;
58729+ reiser4_plugin_groups groups;
58730+ reiser4_plugin_type type;
58731+} pset_descr[PSET_LAST] = {
58732+ [PSET_FILE] = {
58733+ .offset = offsetof(plugin_set, file),
58734+ .type = REISER4_FILE_PLUGIN_TYPE,
58735+ .groups = 0
58736+ },
58737+ [PSET_DIR] = {
58738+ .offset = offsetof(plugin_set, dir),
58739+ .type = REISER4_DIR_PLUGIN_TYPE,
58740+ .groups = 0
58741+ },
58742+ [PSET_PERM] = {
58743+ .offset = offsetof(plugin_set, perm),
58744+ .type = REISER4_PERM_PLUGIN_TYPE,
58745+ .groups = 0
58746+ },
58747+ [PSET_FORMATTING] = {
58748+ .offset = offsetof(plugin_set, formatting),
58749+ .type = REISER4_FORMATTING_PLUGIN_TYPE,
58750+ .groups = 0
58751+ },
58752+ [PSET_HASH] = {
58753+ .offset = offsetof(plugin_set, hash),
58754+ .type = REISER4_HASH_PLUGIN_TYPE,
58755+ .groups = 0
58756+ },
58757+ [PSET_FIBRATION] = {
58758+ .offset = offsetof(plugin_set, fibration),
58759+ .type = REISER4_FIBRATION_PLUGIN_TYPE,
58760+ .groups = 0
58761+ },
58762+ [PSET_SD] = {
58763+ .offset = offsetof(plugin_set, sd),
58764+ .type = REISER4_ITEM_PLUGIN_TYPE,
58765+ .groups = (1 << STAT_DATA_ITEM_TYPE)
58766+ },
58767+ [PSET_DIR_ITEM] = {
58768+ .offset = offsetof(plugin_set, dir_item),
58769+ .type = REISER4_ITEM_PLUGIN_TYPE,
58770+ .groups = (1 << DIR_ENTRY_ITEM_TYPE)
58771+ },
58772+ [PSET_CIPHER] = {
58773+ .offset = offsetof(plugin_set, cipher),
58774+ .type = REISER4_CIPHER_PLUGIN_TYPE,
58775+ .groups = 0
58776+ },
58777+ [PSET_DIGEST] = {
58778+ .offset = offsetof(plugin_set, digest),
58779+ .type = REISER4_DIGEST_PLUGIN_TYPE,
58780+ .groups = 0
58781+ },
58782+ [PSET_COMPRESSION] = {
58783+ .offset = offsetof(plugin_set, compression),
58784+ .type = REISER4_COMPRESSION_PLUGIN_TYPE,
58785+ .groups = 0
58786+ },
58787+ [PSET_COMPRESSION_MODE] = {
58788+ .offset = offsetof(plugin_set, compression_mode),
58789+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
58790+ .groups = 0
58791+ },
58792+ [PSET_CLUSTER] = {
58793+ .offset = offsetof(plugin_set, cluster),
58794+ .type = REISER4_CLUSTER_PLUGIN_TYPE,
58795+ .groups = 0
58796+ },
58797+ [PSET_CREATE] = {
58798+ .offset = offsetof(plugin_set, create),
58799+ .type = REISER4_FILE_PLUGIN_TYPE,
58800+ .groups = (1 << REISER4_REGULAR_FILE)
58801+ }
58802+};
58803+
58804+#define DEFINE_PSET_OPS(PREFIX) \
58805+ reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \
58806+{ \
58807+ if (memb > PSET_LAST) \
58808+ return REISER4_PLUGIN_TYPES; \
58809+ return pset_descr[memb].type; \
58810+} \
58811+ \
58812+int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb, \
58813+ reiser4_plugin * plugin) \
58814+{ \
58815+ assert("nikita-3492", set != NULL); \
58816+ assert("nikita-3493", *set != NULL); \
58817+ assert("nikita-3494", plugin != NULL); \
58818+ assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \
58819+ assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \
58820+ \
58821+ if (pset_descr[memb].groups) \
58822+ if (!(pset_descr[memb].groups & plugin->h.groups)) \
58823+ return -EINVAL; \
58824+ \
58825+ return plugin_set_field(set, \
58826+ (unsigned long)plugin, pset_descr[memb].offset); \
58827+} \
58828+ \
58829+reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \
58830+{ \
58831+ assert("nikita-3497", set != NULL); \
58832+ assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \
58833+ \
58834+ return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \
58835+}
58836+
58837+DEFINE_PSET_OPS(aset);
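/* The DEFINE_PSET_OPS(aset) expansion above generates
 * aset_member_to_type_unsafe(), aset_set_unsafe() and aset_get(), matching
 * the declarations in plugin_set.h. */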
58838+
58839+int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin) {
58840+ return plugin_set_field(set,
58841+ (unsigned long)plugin, pset_descr[memb].offset);
58842+}
58843+
58844+/**
58845+ * init_plugin_set - create plugin set cache and hash table
58846+ *
58847+ * Initializes slab cache of plugin_set-s and their hash table. It is part of
58848+ * reiser4 module initialization.
58849+ */
58850+int init_plugin_set(void)
58851+{
58852+ int result;
58853+
58854+ result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
58855+ if (result == 0) {
58856+ plugin_set_slab = kmem_cache_create("plugin_set",
58857+ sizeof(plugin_set), 0,
58858+ SLAB_HWCACHE_ALIGN,
58859+ NULL);
58860+ if (plugin_set_slab == NULL)
58861+ result = RETERR(-ENOMEM);
58862+ }
58863+ return result;
58864+}
58865+
58866+/**
58867+ * done_plugin_set - delete plugin_set cache and plugin_set hash table
58868+ *
58869+ * This is called on reiser4 module unloading or system shutdown.
58870+ */
58871+void done_plugin_set(void)
58872+{
58873+ plugin_set *cur, *next;
58874+
58875+ for_all_in_htable(&ps_table, ps, cur, next) {
58876+ ps_hash_remove(&ps_table, cur);
58877+ kmem_cache_free(plugin_set_slab, cur);
58878+ }
58879+ destroy_reiser4_cache(&plugin_set_slab);
58880+ ps_hash_done(&ps_table);
58881+}
58882+
58883+/*
58884+ * Local variables:
58885+ * c-indentation-style: "K&R"
58886+ * mode-name: "LC"
58887+ * c-basic-offset: 8
58888+ * tab-width: 8
58889+ * fill-column: 120
58890+ * End:
58891+ */
58892diff -urN linux-2.6.24.orig/fs/reiser4/plugin/plugin_set.h linux-2.6.24/fs/reiser4/plugin/plugin_set.h
58893--- linux-2.6.24.orig/fs/reiser4/plugin/plugin_set.h 1970-01-01 03:00:00.000000000 +0300
58894+++ linux-2.6.24/fs/reiser4/plugin/plugin_set.h 2008-01-25 11:39:07.056238601 +0300
58895@@ -0,0 +1,77 @@
58896+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58897+
58898+/* Reiser4 plugin set definition.
58899+ See fs/reiser4/plugin/plugin_set.c for details */
58900+
58901+#if !defined( __PLUGIN_SET_H__ )
58902+#define __PLUGIN_SET_H__
58903+
58904+#include "../type_safe_hash.h"
58905+#include "plugin.h"
58906+
58907+#include <linux/rcupdate.h>
58908+
58909+struct plugin_set;
58910+typedef struct plugin_set plugin_set;
58911+
58912+TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
58913+
58914+struct plugin_set {
58915+ unsigned long hashval;
58916+ /* plugin of file */
58917+ file_plugin *file;
58918+ /* plugin of dir */
58919+ dir_plugin *dir;
58920+ /* perm plugin for this file */
58921+ perm_plugin *perm;
58922+ /* tail policy plugin. Only meaningful for regular files */
58923+ formatting_plugin *formatting;
58924+ /* hash plugin. Only meaningful for directories. */
58925+ hash_plugin *hash;
58926+ /* fibration plugin. Only meaningful for directories. */
58927+ fibration_plugin *fibration;
58928+ /* plugin of stat-data */
58929+ item_plugin *sd;
58930+ /* plugin of items a directory is built of */
58931+ item_plugin *dir_item;
58932+ /* cipher plugin */
58933+ cipher_plugin *cipher;
58934+ /* digest plugin */
58935+ digest_plugin *digest;
58936+ /* compression plugin */
58937+ compression_plugin *compression;
58938+ /* compression mode plugin */
58939+ compression_mode_plugin *compression_mode;
58940+ /* cluster plugin */
58941+ cluster_plugin *cluster;
58942+ /* this specifies file plugin of regular children.
58943+ only meaningful for directories */
58944+ file_plugin *create;
58945+ ps_hash_link link;
58946+};
58947+
58948+extern plugin_set *plugin_set_get_empty(void);
58949+extern void plugin_set_put(plugin_set * set);
58950+
58951+extern int init_plugin_set(void);
58952+extern void done_plugin_set(void);
58953+
58954+extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb);
58955+extern int set_plugin(plugin_set ** set, pset_member memb,
58956+ reiser4_plugin * plugin);
58957+extern int aset_set_unsafe(plugin_set ** set, pset_member memb,
58958+ reiser4_plugin * plugin);
58959+extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
58960+
58961+/* __PLUGIN_SET_H__ */
58962+#endif
58963+
58964+/* Make Linus happy.
58965+ Local variables:
58966+ c-indentation-style: "K&R"
58967+ mode-name: "LC"
58968+ c-basic-offset: 8
58969+ tab-width: 8
58970+ fill-column: 120
58971+ End:
58972+*/
58973diff -urN linux-2.6.24.orig/fs/reiser4/plugin/security/Makefile linux-2.6.24/fs/reiser4/plugin/security/Makefile
58974--- linux-2.6.24.orig/fs/reiser4/plugin/security/Makefile 1970-01-01 03:00:00.000000000 +0300
58975+++ linux-2.6.24/fs/reiser4/plugin/security/Makefile 2008-01-25 11:39:07.056238601 +0300
58976@@ -0,0 +1,4 @@
58977+obj-$(CONFIG_REISER4_FS) += security_plugins.o
58978+
58979+security_plugins-objs := \
58980+ perm.o
58981diff -urN linux-2.6.24.orig/fs/reiser4/plugin/security/perm.c linux-2.6.24/fs/reiser4/plugin/security/perm.c
58982--- linux-2.6.24.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 03:00:00.000000000 +0300
58983+++ linux-2.6.24/fs/reiser4/plugin/security/perm.c 2008-01-25 11:39:07.056238601 +0300
58984@@ -0,0 +1,33 @@
58985+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58986+
58987+/*
58988+ * This file contains implementation of permission plugins.
58989+ * See the comments in perm.h
58990+ */
58991+
58992+#include "../plugin.h"
58993+#include "../plugin_header.h"
58994+#include "../../debug.h"
58995+
58996+perm_plugin perm_plugins[LAST_PERM_ID] = {
58997+ [NULL_PERM_ID] = {
58998+ .h = {
58999+ .type_id = REISER4_PERM_PLUGIN_TYPE,
59000+ .id = NULL_PERM_ID,
59001+ .pops = NULL,
59002+ .label = "null",
59003+ .desc = "stub permission plugin",
59004+ .linkage = {NULL, NULL}
59005+ }
59006+ }
59007+};
59008+
59009+/*
59010+ * Local variables:
59011+ * c-indentation-style: "K&R"
59012+ * mode-name: "LC"
59013+ * c-basic-offset: 8
59014+ * tab-width: 8
59015+ * fill-column: 79
59016+ * End:
59017+ */
59018diff -urN linux-2.6.24.orig/fs/reiser4/plugin/security/perm.h linux-2.6.24/fs/reiser4/plugin/security/perm.h
59019--- linux-2.6.24.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 03:00:00.000000000 +0300
59020+++ linux-2.6.24/fs/reiser4/plugin/security/perm.h 2008-01-25 11:39:07.060239631 +0300
59021@@ -0,0 +1,38 @@
59022+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59023+
59024+/* Perm (short for "permissions") plugins common stuff. */
59025+
59026+#if !defined( __REISER4_PERM_H__ )
59027+#define __REISER4_PERM_H__
59028+
59029+#include "../../forward.h"
59030+#include "../plugin_header.h"
59031+
59032+#include <linux/types.h>
59033+
59034+/* Definition of permission plugin */
59035+/* NIKITA-FIXME-HANS: define what this is targeted for.
59036+ It does not seem to be intended for use with sys_reiser4. Explain. */
59037+
59038+/* NOTE-EDWARD: This seems to be intended for the deprecated sys_reiser4.
59039+ Consider it a temporary "seam" and a reserved pset member.
59040+ If you have something useful to add, then rename this plugin and add it here */
59041+typedef struct perm_plugin {
59042+ /* generic plugin fields */
59043+ plugin_header h;
59044+} perm_plugin;
59045+
59046+typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
59047+
59048+/* __REISER4_PERM_H__ */
59049+#endif
59050+
59051+/* Make Linus happy.
59052+ Local variables:
59053+ c-indentation-style: "K&R"
59054+ mode-name: "LC"
59055+ c-basic-offset: 8
59056+ tab-width: 8
59057+ fill-column: 120
59058+ End:
59059+*/
59060diff -urN linux-2.6.24.orig/fs/reiser4/plugin/space/bitmap.c linux-2.6.24/fs/reiser4/plugin/space/bitmap.c
59061--- linux-2.6.24.orig/fs/reiser4/plugin/space/bitmap.c 1970-01-01 03:00:00.000000000 +0300
59062+++ linux-2.6.24/fs/reiser4/plugin/space/bitmap.c 2008-01-25 11:39:07.064240661 +0300
59063@@ -0,0 +1,1585 @@
59064+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59065+
59066+#include "../../debug.h"
59067+#include "../../dformat.h"
59068+#include "../../txnmgr.h"
59069+#include "../../jnode.h"
59070+#include "../../block_alloc.h"
59071+#include "../../tree.h"
59072+#include "../../super.h"
59073+#include "../plugin.h"
59074+#include "space_allocator.h"
59075+#include "bitmap.h"
59076+
59077+#include <linux/types.h>
59078+#include <linux/fs.h> /* for struct super_block */
59079+#include <linux/mutex.h>
59080+#include <asm/div64.h>
59081+
59082+/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
59083+ * blocks
59084+
59085+ A useful optimization of reiser4 bitmap handling would be dynamic
59086+ loading/unloading of bitmap blocks, in contrast to v3.x where all bitmap
59087+ blocks are loaded at mount time.
59088+
59089+ To implement bitmap block unloading we need to count bitmap block usage
59090+ and detect currently unused blocks, allowing them to be unloaded. This is
59091+ not a simple task, since we allow several threads to modify one bitmap
59092+ block simultaneously.
59093+
59094+ Briefly, the following scheme is proposed: we maintain a counter in a
59095+ special variable associated with each bitmap block, counting block
59096+ alloc/dealloc operations on that bitmap block. With the deferred block
59097+ deallocation feature of reiser4, all those operations will be represented
59098+ in atom dirty/deleted lists as jnodes for freshly allocated or deleted
59099+ nodes.
59100+
59101+ So, we increment the usage counter for each node allocated or deleted,
59102+ and decrement it once at atom commit for each node from the atom's
59103+ dirty/deleted lists. Of course, deletion of a freshly allocated node and
59104+ node reuse from the atom's deleted list (if we do so) should also
59105+ decrement the bitmap usage counter.
59106+
59107+ This scheme seems workable, but such reference counting is not easy to
59108+ debug. I think we should agree with Hans and not implement it in v4.0.
59109+ Current code implements "on-demand" bitmap block loading only.
59110+
59111+ For simplicity, all bitmap nodes (both commit and working bitmap blocks)
59112+ are either loaded into memory at mount time or loaded on first access;
59113+ the "dont_load_bitmap" mount option controls whether bitmap nodes are
59114+ loaded at mount time. Dynamic unloading of bitmap nodes is currently not
59115+ supported. */
59116+
59117+#define CHECKSUM_SIZE 4
59118+
59119+#define BYTES_PER_LONG (sizeof(long))
59120+
59121+#if BITS_PER_LONG == 64
59122+# define LONG_INT_SHIFT (6)
59123+#else
59124+# define LONG_INT_SHIFT (5)
59125+#endif
59126+
59127+#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
59128+
59129+typedef unsigned long ulong_t;
59130+
59131+#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE)
59132+#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3)
59133+
59134+/* Block allocation/deallocation are done through special bitmap objects which
59135+ are allocated in an array at fs mount. */
59136+struct bitmap_node {
59137+ struct mutex mutex; /* long term lock object */
59138+
59139+ jnode *wjnode; /* j-nodes for WORKING ... */
59140+ jnode *cjnode; /* ... and COMMIT bitmap blocks */
59141+
59142+ bmap_off_t first_zero_bit; /* for skip_busy option implementation */
59143+
59144+ atomic_t loaded; /* a flag which shows that bnode is loaded
59145+ * already */
59146+};
59147+
59148+static inline char *bnode_working_data(struct bitmap_node *bnode)
59149+{
59150+ char *data;
59151+
59152+ data = jdata(bnode->wjnode);
59153+ assert("zam-429", data != NULL);
59154+
59155+ return data + CHECKSUM_SIZE;
59156+}
59157+
59158+static inline char *bnode_commit_data(const struct bitmap_node *bnode)
59159+{
59160+ char *data;
59161+
59162+ data = jdata(bnode->cjnode);
59163+ assert("zam-430", data != NULL);
59164+
59165+ return data + CHECKSUM_SIZE;
59166+}
59167+
59168+static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
59169+{
59170+ char *data;
59171+
59172+ data = jdata(bnode->cjnode);
59173+ assert("vpf-261", data != NULL);
59174+
59175+ return le32_to_cpu(get_unaligned((d32 *)data));
59176+}
59177+
59178+static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
59179+{
59180+ char *data;
59181+
59182+ data = jdata(bnode->cjnode);
59183+ assert("vpf-261", data != NULL);
59184+
59185+ put_unaligned(cpu_to_le32(crc), (d32 *)data);
59186+}
59187+
59188+/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
59189+ * written the code, does this added abstraction still have */
59190+/* ANSWER(Zam): No, the abstraction is in the level above (the exact place is the
59191+ * reiser4_space_allocator structure) */
59192+/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
59193+/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
59194+ * someday?". What are they about? If there is a reason to have a union, it should
59195+ * be a union, if not, it should not be a union. "..might be someday" means no
59196+ * reason. */
59197+struct bitmap_allocator_data {
59198+ /* an array for bitmap blocks direct access */
59199+ struct bitmap_node *bitmap;
59200+};
59201+
59202+#define get_barray(super) \
59203+(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
59204+
59205+#define get_bnode(super, i) (get_barray(super) + i)
59206+
59207+/* allocate and initialize jnode with JNODE_BITMAP type */
59208+static jnode *bnew(void)
59209+{
59210+ jnode *jal = jalloc();
59211+
59212+ if (jal)
59213+ jnode_init(jal, current_tree, JNODE_BITMAP);
59214+
59215+ return jal;
59216+}
59217+
59218+/* this file contains:
59219+ - bitmap based implementation of space allocation plugin
59220+ - all the helper functions like set bit, find_first_zero_bit, etc */
59221+
59222+/* Audited by: green(2002.06.12) */
59223+static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
59224+{
59225+ ulong_t mask = 1UL << start_bit;
59226+ int i = start_bit;
59227+
59228+ while ((word & mask) != 0) {
59229+ mask <<= 1;
59230+ if (++i >= BITS_PER_LONG)
59231+ break;
59232+ }
59233+
59234+ return i;
59235+}
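/* Example (sketch): for word == 0xb (binary 1011) and start_bit == 0 the
 * loop skips the set bits at positions 0 and 1 and returns 2; if no zero bit
 * exists at or above @start_bit, BITS_PER_LONG is returned. */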
59236+
59237+#include <linux/bitops.h>
59238+
59239+#if BITS_PER_LONG == 64
59240+
59241+#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
59242+#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
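/* The ext2_*_bit helpers expect an unsigned-long-aligned base address:
 * BASE() rounds @addr down to the previous long boundary, and OFF() turns
 * the discarded byte misalignment into a bit offset, so the wrappers below
 * work on arbitrarily aligned bitmap data. */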
59243+
59244+static inline void reiser4_set_bit(int nr, void *addr)
59245+{
59246+ ext2_set_bit(nr + OFF(addr), BASE(addr));
59247+}
59248+
59249+static inline void reiser4_clear_bit(int nr, void *addr)
59250+{
59251+ ext2_clear_bit(nr + OFF(addr), BASE(addr));
59252+}
59253+
59254+static inline int reiser4_test_bit(int nr, void *addr)
59255+{
59256+ return ext2_test_bit(nr + OFF(addr), BASE(addr));
59257+}
59258+static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
59259+ int offset)
59260+{
59261+ int off = OFF(addr);
59262+
59263+ return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
59264+ offset + off) - off;
59265+}
59266+
59267+#else
59268+
59269+#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr)
59270+#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
59271+#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr)
59272+
59273+#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
59274+ext2_find_next_zero_bit(addr, maxoffset, offset)
59275+#endif
59276+
59277+/* Search for a set bit in the bit array [@start_offset, @max_offset); offsets
59278+ * are counted from @addr. Return the offset of the first set bit if one is
59279+ * found, @max_offset otherwise. */
59280+static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
59281+ bmap_off_t start_offset)
59282+{
59283+ ulong_t *base = addr;
59284+	/* start_offset is in bits, convert it to a word offset within the bitmap. */
59285+	int word_nr = start_offset >> LONG_INT_SHIFT;
59286+	/* bit number within the word. */
59287+ int bit_nr = start_offset & LONG_INT_MASK;
59288+ int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
59289+
59290+ assert("zam-387", max_offset != 0);
59291+
59292+ /* Unaligned @start_offset case. */
59293+ if (bit_nr != 0) {
59294+ bmap_nr_t nr;
59295+
59296+ nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
59297+
59298+ if (nr < BITS_PER_LONG)
59299+ return (word_nr << LONG_INT_SHIFT) + nr;
59300+
59301+ ++word_nr;
59302+ }
59303+
59304+	/* Fast scan through aligned words. */
59305+ while (word_nr <= max_word_nr) {
59306+ if (base[word_nr] != 0) {
59307+ return (word_nr << LONG_INT_SHIFT)
59308+ + find_next_zero_bit_in_word(~(base[word_nr]), 0);
59309+ }
59310+
59311+ ++word_nr;
59312+ }
59313+
59314+ return max_offset;
59315+}
59316+
59317+#if BITS_PER_LONG == 64
59318+
59319+static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
59320+ bmap_off_t start_offset)
59321+{
59322+ bmap_off_t off = OFF(addr);
59323+
59324+ return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
59325+ start_offset + off) - off;
59326+}
59327+
59328+#else
59329+#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
59330+ __reiser4_find_next_set_bit(addr, max_offset, start_offset)
59331+#endif
59332+
59333+/* search backward from @start_bit for a set bit in a single word. */
59334+static int find_last_set_bit_in_word(ulong_t word, int start_bit)
59335+{
59336+ ulong_t bit_mask;
59337+ int nr = start_bit;
59338+
59339+ assert("zam-965", start_bit < BITS_PER_LONG);
59340+ assert("zam-966", start_bit >= 0);
59341+
59342+ bit_mask = (1UL << nr);
59343+
59344+ while (bit_mask != 0) {
59345+ if (bit_mask & word)
59346+ return nr;
59347+ bit_mask >>= 1;
59348+ nr--;
59349+ }
59350+ return BITS_PER_LONG;
59351+}
59352+
59353+/* Search bitmap for a set bit in backward direction from the end to the
59354+ * beginning of given region
59355+ *
59356+ * @result: result offset of the last set bit
59357+ * @addr: base memory address,
59358+ * @low_off: low end of the search region, edge bit included into the region,
59359+ * @high_off: high end of the search region, edge bit included into the region,
59360+ *
59361+ * @return: 0 - set bit was found, -1 otherwise.
59362+ */
59363+static int
59364+reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
59365+ bmap_off_t high_off)
59366+{
59367+ ulong_t *base = addr;
59368+ int last_word;
59369+ int first_word;
59370+ int last_bit;
59371+ int nr;
59372+
59373+ assert("zam-962", high_off >= low_off);
59374+
59375+ last_word = high_off >> LONG_INT_SHIFT;
59376+ last_bit = high_off & LONG_INT_MASK;
59377+ first_word = low_off >> LONG_INT_SHIFT;
59378+
59379+ if (last_bit < BITS_PER_LONG) {
59380+ nr = find_last_set_bit_in_word(base[last_word], last_bit);
59381+ if (nr < BITS_PER_LONG) {
59382+ *result = (last_word << LONG_INT_SHIFT) + nr;
59383+ return 0;
59384+ }
59385+ --last_word;
59386+ }
59387+ while (last_word >= first_word) {
59388+ if (base[last_word] != 0x0) {
59389+ last_bit =
59390+ find_last_set_bit_in_word(base[last_word],
59391+ BITS_PER_LONG - 1);
59392+ assert("zam-972", last_bit < BITS_PER_LONG);
59393+ *result = (last_word << LONG_INT_SHIFT) + last_bit;
59394+ return 0;
59395+ }
59396+ --last_word;
59397+ }
59398+
59399+ return -1; /* set bit not found */
59400+}
59401+
59402+/* Search bitmap for a clear bit in backward direction from the end to the
59403+ * beginning of given region */
59404+static int
59405+reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
59406+ bmap_off_t high_off)
59407+{
59408+ ulong_t *base = addr;
59409+ int last_word;
59410+ int first_word;
59411+ int last_bit;
59412+ int nr;
59413+
59414+ last_word = high_off >> LONG_INT_SHIFT;
59415+ last_bit = high_off & LONG_INT_MASK;
59416+ first_word = low_off >> LONG_INT_SHIFT;
59417+
59418+ if (last_bit < BITS_PER_LONG) {
59419+ nr = find_last_set_bit_in_word(~base[last_word], last_bit);
59420+ if (nr < BITS_PER_LONG) {
59421+ *result = (last_word << LONG_INT_SHIFT) + nr;
59422+ return 0;
59423+ }
59424+ --last_word;
59425+ }
59426+ while (last_word >= first_word) {
59427+ if (base[last_word] != (ulong_t) (-1)) {
59428+ *result = (last_word << LONG_INT_SHIFT) +
59429+ find_last_set_bit_in_word(~base[last_word],
59430+ BITS_PER_LONG - 1);
59431+ return 0;
59432+ }
59433+ --last_word;
59434+ }
59435+
59436+ return -1; /* zero bit not found */
59437+}
59438+
59439+/* Audited by: green(2002.06.12) */
59440+static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
59441+{
59442+ int first_byte;
59443+ int last_byte;
59444+
59445+ unsigned char first_byte_mask = 0xFF;
59446+ unsigned char last_byte_mask = 0xFF;
59447+
59448+ assert("zam-410", start < end);
59449+
59450+ first_byte = start >> 3;
59451+ last_byte = (end - 1) >> 3;
59452+
59453+ if (last_byte > first_byte + 1)
59454+ memset(addr + first_byte + 1, 0,
59455+ (size_t) (last_byte - first_byte - 1));
59456+
59457+ first_byte_mask >>= 8 - (start & 0x7);
59458+ last_byte_mask <<= ((end - 1) & 0x7) + 1;
59459+
59460+ if (first_byte == last_byte) {
59461+ addr[first_byte] &= (first_byte_mask | last_byte_mask);
59462+ } else {
59463+ addr[first_byte] &= first_byte_mask;
59464+ addr[last_byte] &= last_byte_mask;
59465+ }
59466+}
59467+
59468+/* Audited by: green(2002.06.12) */
59469+/* Set all bits in the range [@start, @end) of the byte array at @addr. */
59470+static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
59471+{
59472+ int first_byte;
59473+ int last_byte;
59474+
59475+ unsigned char first_byte_mask = 0xFF;
59476+ unsigned char last_byte_mask = 0xFF;
59477+
59478+ assert("zam-386", start < end);
59479+
59480+ first_byte = start >> 3;
59481+ last_byte = (end - 1) >> 3;
59482+
59483+ if (last_byte > first_byte + 1)
59484+ memset(addr + first_byte + 1, 0xFF,
59485+ (size_t) (last_byte - first_byte - 1));
59486+
59487+ first_byte_mask <<= start & 0x7;
59488+ last_byte_mask >>= 7 - ((end - 1) & 0x7);
59489+
59490+ if (first_byte == last_byte) {
59491+ addr[first_byte] |= (first_byte_mask & last_byte_mask);
59492+ } else {
59493+ addr[first_byte] |= first_byte_mask;
59494+ addr[last_byte] |= last_byte_mask;
59495+ }
59496+}
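/* Worked example (sketch): reiser4_set_bits(addr, 3, 10) yields first_byte
 * == 0, last_byte == 1, first_byte_mask == 0xF8, last_byte_mask == 0x03, so
 * bits 3..7 of addr[0] and bits 0..1 of addr[1] (bits 3..9 overall) are set;
 * @end itself is excluded. */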
59497+
59498+#define ADLER_BASE 65521
59499+#define ADLER_NMAX 5552
59500+
59501+/* Calculates the adler32 checksum for the data pointed to by `data`, of
59502+ length `len`. This function was originally taken from zlib, version 1.1.3,
59503+ July 9th, 1998.
59504+
59505+ Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
59506+
59507+ This software is provided 'as-is', without any express or implied
59508+ warranty. In no event will the authors be held liable for any damages
59509+ arising from the use of this software.
59510+
59511+ Permission is granted to anyone to use this software for any purpose,
59512+ including commercial applications, and to alter it and redistribute it
59513+ freely, subject to the following restrictions:
59514+
59515+ 1. The origin of this software must not be misrepresented; you must not
59516+ claim that you wrote the original software. If you use this software
59517+ in a product, an acknowledgment in the product documentation would be
59518+ appreciated but is not required.
59519+ 2. Altered source versions must be plainly marked as such, and must not be
59520+ misrepresented as being the original software.
59521+ 3. This notice may not be removed or altered from any source distribution.
59522+
59523+ Jean-loup Gailly Mark Adler
59524+ jloup@gzip.org madler@alumni.caltech.edu
59525+
59526+ The above comment applies only to the reiser4_adler32 function.
59527+*/
59528+
59529+__u32 reiser4_adler32(char *data, __u32 len)
59530+{
59531+ unsigned char *t = data;
59532+ __u32 s1 = 1;
59533+ __u32 s2 = 0;
59534+ int k;
59535+
59536+ while (len > 0) {
59537+ k = len < ADLER_NMAX ? len : ADLER_NMAX;
59538+ len -= k;
59539+
59540+ while (k--) {
59541+ s1 += *t++;
59542+ s2 += s1;
59543+ }
59544+
59545+ s1 %= ADLER_BASE;
59546+ s2 %= ADLER_BASE;
59547+ }
59548+ return (s2 << 16) | s1;
59549+}
59550+
59551+#define sb_by_bnode(bnode) \
59552+ ((struct super_block *)jnode_get_tree(bnode->wjnode)->super)
59553+
59554+static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
59555+{
59556+ return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
59557+}
59558+
59559+static int
59560+bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
59561+{
59562+ if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
59563+ bmap_nr_t bmap;
59564+
59565+ bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
59566+
59567+ warning("vpf-263",
59568+ "Checksum for the bitmap block %llu is incorrect",
59569+ bmap);
59570+
59571+ return RETERR(-EIO);
59572+ }
59573+
59574+ return 0;
59575+}
59576+
59577+#define REISER4_CHECK_BMAP_CRC (0)
59578+
59579+#if REISER4_CHECK_BMAP_CRC
59580+static int bnode_check_crc(const struct bitmap_node *bnode)
59581+{
59582+ return bnode_check_adler32(bnode,
59583+ bmap_size(sb_by_bnode(bnode)->s_blocksize));
59584+}
59585+
59586+/* REISER4_CHECK_BMAP_CRC */
59587+#else
59588+
59589+#define bnode_check_crc(bnode) (0)
59590+
59591+/* REISER4_CHECK_BMAP_CRC */
59592+#endif
59593+
59594+/* Recalculates the adler32 checksum after a change of only 1 byte.
59595+ adler - previous adler checksum
59596+ old_data, data - old and new byte values.
59597+ tail == (chunk - offset), where chunk is the length the checksum was
59598+ calculated for, and offset is the offset of the changed byte within it.
59599+ This function can be used to optimise checksum calculation.
59600+*/
59601+
59602+static __u32
59603+adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
59604+ __u32 tail)
59605+{
59606+ __u32 delta = data - old_data + 2 * ADLER_BASE;
59607+ __u32 s1 = adler & 0xffff;
59608+ __u32 s2 = (adler >> 16) & 0xffff;
59609+
59610+ s1 = (delta + s1) % ADLER_BASE;
59611+ s2 = (delta * tail + s2) % ADLER_BASE;
59612+
59613+ return (s2 << 16) | s1;
59614+}
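/* Minimal sketch (hypothetical helper, for illustration only): verify
 * adler32_recalc() against a full recomputation after changing one byte at
 * position @pos in a buffer of length @len. */
static inline int adler32_recalc_matches(char *buf, __u32 len, __u32 pos,
					 unsigned char new_byte)
{
	__u32 before = reiser4_adler32(buf, len);
	unsigned char old_byte = buf[pos];

	buf[pos] = new_byte;
	/* tail == len - pos: bytes from the changed one to the end */
	return adler32_recalc(before, old_byte, new_byte, len - pos) ==
	       reiser4_adler32(buf, len);
}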
59615+
59616+#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
59617+
59618+/**
59619+ * get_nr_bmap - calculate number of bitmap blocks
59620+ * @super: super block with initialized blocksize and block count
59621+ *
59622+ * Calculates the number of bitmap blocks of a filesystem which uses bitmaps
59623+ * to maintain free disk space. It assumes that each bitmap addresses the
59624+ * same number of blocks, which is calculated by the bmap_bit_count macro
59625+ * defined above. The number of blocks in the filesystem has to be already
59626+ * initialized in the reiser4 private data of the super block, so that it can
59627+ * be obtained via reiser4_block_count(). Unfortunately, the number of blocks
59628+ * addressed by a bitmap is not a power of 2, because 4 bytes are used for
59629+ * the checksum. Therefore, we have to use a special function to divide and
59630+ * modulo 64-bit filesystem block counters.
59631+ *
59632+ * Example: suppose a filesystem has 32768 blocks and the blocksize is 4096.
59633+ * Each bitmap block addresses (4096 - 4) * 8 = 32736 blocks. The number of
59634+ * bitmaps to address all 32768 blocks is (32768 - 1) / 32736 + 1 = 2.
59635+ */
59636+static bmap_nr_t get_nr_bmap(const struct super_block *super)
59637+{
59638+ u64 quotient;
59639+
59640+ assert("zam-393", reiser4_block_count(super) != 0);
59641+
59642+ quotient = reiser4_block_count(super) - 1;
59643+ do_div(quotient, bmap_bit_count(super->s_blocksize));
59644+ return quotient + 1;
59645+}
59646+
59647+/**
59648+ * parse_blocknr - calculate bitmap number and offset in it by block number
59649+ * @block: pointer to block number to calculate location in bitmap of
59650+ * @bmap: pointer where to store bitmap block number
59651+ * @offset: pointer where to store offset within bitmap block
59652+ *
59653+ * Calculates location of bit which is responsible for allocation/freeing of
59654+ * block @*block. That location is represented by bitmap block number and offset
59655+ * within that bitmap block.
59656+ */
59657+static void
59658+parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
59659+ bmap_off_t *offset)
59660+{
59661+ struct super_block *super = get_current_context()->super;
59662+ u64 quotient = *block;
59663+
59664+ *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
59665+ *bmap = quotient;
59666+
59667+ assert("zam-433", *bmap < get_nr_bmap(super));
59668+ assert("", *offset < bmap_bit_count(super->s_blocksize));
59669+}
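/* Numeric example (sketch): with a 4096-byte blocksize each bitmap covers
 * bmap_bit_count(4096) == (4096 - 4) * 8 == 32736 blocks, so block 40000
 * parses to bmap == 1 and offset == 40000 - 32736 == 7264. */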
59670+
59671+#if REISER4_DEBUG
59672+/* Audited by: green(2002.06.12) */
59673+static void
59674+check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
59675+{
59676+ struct super_block *sb = reiser4_get_current_sb();
59677+
59678+ assert("zam-436", sb != NULL);
59679+
59680+ assert("zam-455", start != NULL);
59681+ assert("zam-437", *start != 0);
59682+ assert("zam-541", !reiser4_blocknr_is_fake(start));
59683+ assert("zam-441", *start < reiser4_block_count(sb));
59684+
59685+ if (len != NULL) {
59686+ assert("zam-438", *len != 0);
59687+ assert("zam-442", *start + *len <= reiser4_block_count(sb));
59688+ }
59689+}
59690+
59691+static void check_bnode_loaded(const struct bitmap_node *bnode)
59692+{
59693+ assert("zam-485", bnode != NULL);
59694+ assert("zam-483", jnode_page(bnode->wjnode) != NULL);
59695+ assert("zam-484", jnode_page(bnode->cjnode) != NULL);
59696+ assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
59697+ assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
59698+}
59699+
59700+#else
59701+
59702+# define check_block_range(start, len) do { /* nothing */} while(0)
59703+# define check_bnode_loaded(bnode) do { /* nothing */} while(0)
59704+
59705+#endif
59706+
59707+/* modify bnode->first_zero_bit (if we free bits before); bnode should be
59708+ spin-locked */
59709+static inline void
59710+adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
59711+{
59712+ if (offset < bnode->first_zero_bit)
59713+ bnode->first_zero_bit = offset;
59714+}
59715+
59716+/* return a physical disk address for logical bitmap number @bmap */
59717+/* FIXME-VS: this is somehow related to disk layout? */
59718+/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
59719+ * per block allocation so that performance is not affected. Probably this
59720+ * whole file should be considered part of the disk layout plugin, and other
59721+ * disk layouts can use other defines and efficiency will not be significantly
59722+ * affected. */
59723+
59724+#define REISER4_FIRST_BITMAP_BLOCK \
59725+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
59726+
59727+/* Audited by: green(2002.06.12) */
59728+static void
59729+get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
59730+ reiser4_block_nr * bnr)
59731+{
59732+
59733+ assert("zam-390", bmap < get_nr_bmap(super));
59734+
59735+#ifdef CONFIG_REISER4_BADBLOCKS
59736+#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
59737+	/* Check if the diskmap has this already, first. */
59738+ if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
59739+ return; /* Found it in diskmap */
59740+#endif
59741+	/* FIXME_ZAM: pending discussion of disk layouts and disk format
59742+	 plugins, I implement a bitmap location scheme which is close to the
59743+	 scheme used in reiser 3.6 */
59744+ if (bmap == 0) {
59745+ *bnr = REISER4_FIRST_BITMAP_BLOCK;
59746+ } else {
59747+ *bnr = bmap * bmap_bit_count(super->s_blocksize);
59748+ }
59749+}
59750+
59751+/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
59752+/* Audited by: green(2002.06.12) */
59753+static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
59754+{
59755+ *bnr =
59756+ (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
59757+ REISER4_BITMAP_BLOCKS_STATUS_VALUE);
59758+}
59759+
59760+/* bnode structure initialization */
59761+static void
59762+init_bnode(struct bitmap_node *bnode,
59763+ struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
59764+{
59765+ memset(bnode, 0, sizeof(struct bitmap_node));
59766+
59767+ mutex_init(&bnode->mutex);
59768+ atomic_set(&bnode->loaded, 0);
59769+}
59770+
59771+static void release(jnode * node)
59772+{
59773+ jrelse(node);
59774+ JF_SET(node, JNODE_HEARD_BANSHEE);
59775+ jput(node);
59776+}
59777+
59778+/* This function is for internal bitmap.c use only, because it assumes that
59779+ the jnode is under full control of this thread */
59780+static void done_bnode(struct bitmap_node *bnode)
59781+{
59782+ if (bnode) {
59783+ atomic_set(&bnode->loaded, 0);
59784+ if (bnode->wjnode != NULL)
59785+ release(bnode->wjnode);
59786+ if (bnode->cjnode != NULL)
59787+ release(bnode->cjnode);
59788+ bnode->wjnode = bnode->cjnode = NULL;
59789+ }
59790+}
59791+
59792+/* Allocate wjnode/cjnode for @bnode and load the commit bitmap block. Called only by load_and_lock_bnode(). */
59793+static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
59794+ jnode **wjnode_ret)
59795+{
59796+ struct super_block *super;
59797+ jnode *cjnode;
59798+ jnode *wjnode;
59799+ bmap_nr_t bmap;
59800+ int ret;
59801+
59802+ super = reiser4_get_current_sb();
59803+
59804+ *wjnode_ret = wjnode = bnew();
59805+ if (wjnode == NULL) {
59806+ *cjnode_ret = NULL;
59807+ return RETERR(-ENOMEM);
59808+ }
59809+
59810+ *cjnode_ret = cjnode = bnew();
59811+ if (cjnode == NULL)
59812+ return RETERR(-ENOMEM);
59813+
59814+ bmap = bnode - get_bnode(super, 0);
59815+
59816+ get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
59817+ get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
59818+
59819+ jref(cjnode);
59820+ jref(wjnode);
59821+
59822+ /* load commit bitmap */
59823+ ret = jload_gfp(cjnode, GFP_NOFS, 1);
59824+
59825+ if (ret)
59826+ goto error;
59827+
59828+ /* allocate memory for working bitmap block. Note that for
59829+	 * bitmaps jinit_new() doesn't actually modify node content,
59830+ * so parallel calls to this are ok. */
59831+ ret = jinit_new(wjnode, GFP_NOFS);
59832+
59833+ if (ret != 0) {
59834+ jrelse(cjnode);
59835+ goto error;
59836+ }
59837+
59838+ return 0;
59839+
59840+ error:
59841+ jput(cjnode);
59842+ jput(wjnode);
59843+ *wjnode_ret = *cjnode_ret = NULL;
59844+ return ret;
59845+
59846+}
59847+
59848+/* Check the bnode data on read. */
59849+static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
59850+{
59851+ void *data;
59852+ int ret;
59853+
59854+ /* Check CRC */
59855+ ret = bnode_check_adler32(bnode, blksize);
59856+
59857+ if (ret) {
59858+ return ret;
59859+ }
59860+
59861+ data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
59862+
59863+ /* Check the very first bit -- it must be busy. */
59864+ if (!reiser4_test_bit(0, data)) {
59865+ warning("vpf-1362", "The allocator block %llu is not marked "
59866+ "as used.", (unsigned long long)bnode->cjnode->blocknr);
59867+
59868+ return -EINVAL;
59869+ }
59870+
59871+ return 0;
59872+}
59873+
59874+/* load bitmap blocks "on-demand" */
59875+static int load_and_lock_bnode(struct bitmap_node *bnode)
59876+{
59877+ int ret;
59878+
59879+ jnode *cjnode;
59880+ jnode *wjnode;
59881+
59882+ assert("nikita-3040", reiser4_schedulable());
59883+
59884+/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
59885+ * need to be atomic, right? Just leave a comment that if bitmaps were
59886+ * unloadable, this would need to be atomic. */
59887+ if (atomic_read(&bnode->loaded)) {
59888+ /* bitmap is already loaded, nothing to do */
59889+ check_bnode_loaded(bnode);
59890+ mutex_lock(&bnode->mutex);
59891+ assert("nikita-2827", atomic_read(&bnode->loaded));
59892+ return 0;
59893+ }
59894+
59895+ ret = prepare_bnode(bnode, &cjnode, &wjnode);
59896+ if (ret == 0) {
59897+ mutex_lock(&bnode->mutex);
59898+
59899+ if (!atomic_read(&bnode->loaded)) {
59900+ assert("nikita-2822", cjnode != NULL);
59901+ assert("nikita-2823", wjnode != NULL);
59902+ assert("nikita-2824", jnode_is_loaded(cjnode));
59903+ assert("nikita-2825", jnode_is_loaded(wjnode));
59904+
59905+ bnode->wjnode = wjnode;
59906+ bnode->cjnode = cjnode;
59907+
59908+ ret = check_struct_bnode(bnode, current_blocksize);
59909+ if (!ret) {
59910+ cjnode = wjnode = NULL;
59911+ atomic_set(&bnode->loaded, 1);
59912+ /* working bitmap is initialized by on-disk
59913+ * commit bitmap. This should be performed
59914+ * under mutex. */
59915+ memcpy(bnode_working_data(bnode),
59916+ bnode_commit_data(bnode),
59917+ bmap_size(current_blocksize));
59918+ } else
59919+ mutex_unlock(&bnode->mutex);
59920+ } else
59921+ /* race: someone already loaded bitmap while we were
59922+ * busy initializing data. */
59923+ check_bnode_loaded(bnode);
59924+ }
59925+
59926+ if (wjnode != NULL) {
59927+ release(wjnode);
59928+ bnode->wjnode = NULL;
59929+ }
59930+ if (cjnode != NULL) {
59931+ release(cjnode);
59932+ bnode->cjnode = NULL;
59933+ }
59934+
59935+ return ret;
59936+}
59937+
59938+static void release_and_unlock_bnode(struct bitmap_node *bnode)
59939+{
59940+ check_bnode_loaded(bnode);
59941+ mutex_unlock(&bnode->mutex);
59942+}
59943+
59944+/* This function does all block allocation work but only for one bitmap
59945+ block.*/
59946+/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
59947+   block responsibility zone boundaries. This made no sense in v3.6 but may
59948+   make sense in v4.x */
59949+/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
59950+static int
59951+search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
59952+ bmap_off_t max_offset, int min_len, int max_len)
59953+{
59954+ struct super_block *super = get_current_context()->super;
59955+ struct bitmap_node *bnode = get_bnode(super, bmap);
59956+
59957+ char *data;
59958+
59959+ bmap_off_t search_end;
59960+ bmap_off_t start;
59961+ bmap_off_t end;
59962+
59963+ int set_first_zero_bit = 0;
59964+
59965+ int ret;
59966+
59967+ assert("zam-364", min_len > 0);
59968+ assert("zam-365", max_len >= min_len);
59969+ assert("zam-366", *offset <= max_offset);
59970+
59971+ ret = load_and_lock_bnode(bnode);
59972+
59973+ if (ret)
59974+ return ret;
59975+
59976+ data = bnode_working_data(bnode);
59977+
59978+ start = *offset;
59979+
59980+ if (bnode->first_zero_bit >= start) {
59981+ start = bnode->first_zero_bit;
59982+ set_first_zero_bit = 1;
59983+ }
59984+
59985+ while (start + min_len < max_offset) {
59986+
59987+ start =
59988+ reiser4_find_next_zero_bit((long *)data, max_offset, start);
59989+ if (set_first_zero_bit) {
59990+ bnode->first_zero_bit = start;
59991+ set_first_zero_bit = 0;
59992+ }
59993+ if (start >= max_offset)
59994+ break;
59995+
59996+ search_end = LIMIT(start + max_len, max_offset);
59997+ end =
59998+ reiser4_find_next_set_bit((long *)data, search_end, start);
59999+ if (end >= start + min_len) {
60000+			/* we can't trust the find_next_set_bit result if no
60001+			   set bit was found; the result may be bigger than
60002+			   max_offset */
60003+ if (end > search_end)
60004+ end = search_end;
60005+
60006+ ret = end - start;
60007+ *offset = start;
60008+
60009+ reiser4_set_bits(data, start, end);
60010+
60011+ /* FIXME: we may advance first_zero_bit if [start,
60012+ end] region overlaps the first_zero_bit point */
60013+
60014+ break;
60015+ }
60016+
60017+ start = end + 1;
60018+ }
60019+
60020+ release_and_unlock_bnode(bnode);
60021+
60022+ return ret;
60023+}
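/* An illustrative, self-contained sketch of the forward first-fit scan
 * above (simplified assumptions: a plain byte-array bitmap, no locking,
 * no bnode cache, and no bit-setting on success; the helper name is
 * hypothetical, not a reiser4 primitive): */
static int example_first_fit(const unsigned char *map, int nbits,
			     int min_len, int max_len, int *found)
{
	int start = 0;

	while (start + min_len <= nbits) {
		int end;

		/* advance to the next free (zero) bit */
		while (start < nbits && (map[start >> 3] & (1 << (start & 7))))
			start++;
		if (start + min_len > nbits)
			break;
		/* grow the free run, but never past max_len bits */
		end = start;
		while (end < nbits && end - start < max_len &&
		       !(map[end >> 3] & (1 << (end & 7))))
			end++;
		if (end - start >= min_len) {
			*found = start;		/* like *offset = start above */
			return end - start;	/* like ret = end - start */
		}
		start = end + 1;
	}
	return 0;	/* no suitable run in this bitmap */
}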
60024+
60025+static int
60026+search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
60027+ bmap_off_t end_offset, int min_len, int max_len)
60028+{
60029+ struct super_block *super = get_current_context()->super;
60030+ struct bitmap_node *bnode = get_bnode(super, bmap);
60031+ char *data;
60032+ bmap_off_t start;
60033+ int ret;
60034+
60035+ assert("zam-958", min_len > 0);
60036+ assert("zam-959", max_len >= min_len);
60037+ assert("zam-960", *start_offset >= end_offset);
60038+
60039+ ret = load_and_lock_bnode(bnode);
60040+ if (ret)
60041+ return ret;
60042+
60043+ data = bnode_working_data(bnode);
60044+ start = *start_offset;
60045+
60046+ while (1) {
60047+ bmap_off_t end, search_end;
60048+
60049+ /* Find the beginning of the zero filled region */
60050+ if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
60051+ break;
60052+		/* Are there at least `min_len' bits from `start' down to
60053+		 * `end_offset'? */
60054+ if (start < end_offset + min_len - 1)
60055+ break;
60056+
60057+ /* Do not search to `end_offset' if we need to find less than
60058+ * `max_len' zero bits. */
60059+ if (end_offset + max_len - 1 < start)
60060+ search_end = start - max_len + 1;
60061+ else
60062+ search_end = end_offset;
60063+
60064+ if (reiser4_find_last_set_bit(&end, data, search_end, start))
60065+ end = search_end;
60066+ else
60067+ end++;
60068+
60069+ if (end + min_len <= start + 1) {
60070+ if (end < search_end)
60071+ end = search_end;
60072+ ret = start - end + 1;
60073+ *start_offset = end; /* `end' is lowest offset */
60074+ assert("zam-987",
60075+ reiser4_find_next_set_bit(data, start + 1,
60076+ end) >= start + 1);
60077+ reiser4_set_bits(data, end, start + 1);
60078+ break;
60079+ }
60080+
60081+ if (end <= end_offset)
60082+ /* left search boundary reached. */
60083+ break;
60084+ start = end - 1;
60085+ }
60086+
60087+ release_and_unlock_bnode(bnode);
60088+ return ret;
60089+}
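/* Worked example of the backward scan above: searching down from
 * *start_offset == 100 toward end_offset == 0 with min_len == 1 and
 * max_len == 8.  If the last zero bit at or below 100 is 97, then
 * search_end = 97 - 8 + 1 = 90; if the last set bit within [90, 97]
 * is 92, end becomes 93 and the free window [93, 97] holds 5 bits,
 * so bits 93..97 are marked used, *start_offset becomes 93, and 5 is
 * returned. */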
60090+
60091+/* allocate contiguous range of blocks in bitmap */
60092+static int bitmap_alloc_forward(reiser4_block_nr * start,
60093+ const reiser4_block_nr * end, int min_len,
60094+ int max_len)
60095+{
60096+ bmap_nr_t bmap, end_bmap;
60097+ bmap_off_t offset, end_offset;
60098+ int len;
60099+
60100+ reiser4_block_nr tmp;
60101+
60102+ struct super_block *super = get_current_context()->super;
60103+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
60104+
60105+ parse_blocknr(start, &bmap, &offset);
60106+
60107+ tmp = *end - 1;
60108+ parse_blocknr(&tmp, &end_bmap, &end_offset);
60109+ ++end_offset;
60110+
60111+ assert("zam-358", end_bmap >= bmap);
60112+ assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
60113+
60114+ for (; bmap < end_bmap; bmap++, offset = 0) {
60115+ len =
60116+ search_one_bitmap_forward(bmap, &offset, max_offset,
60117+ min_len, max_len);
60118+ if (len != 0)
60119+ goto out;
60120+ }
60121+
60122+ len =
60123+ search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
60124+ max_len);
60125+ out:
60126+ *start = bmap * max_offset + offset;
60127+ return len;
60128+}
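/* The (bmap, offset) pairs used above come from the plain radix split
 * blocknr = bmap * max_offset + offset, where max_offset is the number
 * of bits one bitmap block holds.  A minimal sketch (hypothetical
 * helper, not the actual parse_blocknr()): */
static inline void example_parse_blocknr(__u64 blocknr, __u64 bits_per_bmap,
					 __u64 *bmap, __u32 *offset)
{
	*bmap = blocknr / bits_per_bmap;	/* index of the bitmap block */
	*offset = blocknr % bits_per_bmap;	/* bit within that block */
}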
60129+
60130+/* allocate contiguous range of blocks in bitmap (from @start to @end in
60131+ * backward direction) */
60132+static int bitmap_alloc_backward(reiser4_block_nr * start,
60133+ const reiser4_block_nr * end, int min_len,
60134+ int max_len)
60135+{
60136+ bmap_nr_t bmap, end_bmap;
60137+ bmap_off_t offset, end_offset;
60138+ int len;
60139+ struct super_block *super = get_current_context()->super;
60140+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
60141+
60142+ parse_blocknr(start, &bmap, &offset);
60143+ parse_blocknr(end, &end_bmap, &end_offset);
60144+
60145+ assert("zam-961", end_bmap <= bmap);
60146+ assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
60147+
60148+ for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
60149+ len =
60150+ search_one_bitmap_backward(bmap, &offset, 0, min_len,
60151+ max_len);
60152+ if (len != 0)
60153+ goto out;
60154+ }
60155+
60156+ len =
60157+ search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
60158+ max_len);
60159+ out:
60160+ *start = bmap * max_offset + offset;
60161+ return len;
60162+}
60163+
60164+/* plugin->u.space_allocator.alloc_blocks() */
60165+static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
60166+ reiser4_block_nr *start, reiser4_block_nr *len)
60167+{
60168+ struct super_block *super = get_current_context()->super;
60169+ int actual_len;
60170+
60171+ reiser4_block_nr search_start;
60172+ reiser4_block_nr search_end;
60173+
60174+ assert("zam-398", super != NULL);
60175+ assert("zam-412", hint != NULL);
60176+ assert("zam-397", hint->blk <= reiser4_block_count(super));
60177+
60178+ if (hint->max_dist == 0)
60179+ search_end = reiser4_block_count(super);
60180+ else
60181+ search_end =
60182+ LIMIT(hint->blk + hint->max_dist,
60183+ reiser4_block_count(super));
60184+
60185+	/* We use @hint->blk as the search start and search from it to the end
60186+	   of the disk, or within the given region if @hint->max_dist is not zero */
60187+ search_start = hint->blk;
60188+
60189+ actual_len =
60190+ bitmap_alloc_forward(&search_start, &search_end, 1, needed);
60191+
60192+	/* There is only one bitmap search if max_dist was specified or the first
60193+	   pass was from the beginning of the bitmap. We also do only one pass
60194+	   when scanning the bitmap in the backward direction. */
60195+ if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
60196+ /* next step is a scanning from 0 to search_start */
60197+ search_end = search_start;
60198+ search_start = 0;
60199+ actual_len =
60200+ bitmap_alloc_forward(&search_start, &search_end, 1, needed);
60201+ }
60202+ if (actual_len == 0)
60203+ return RETERR(-ENOSPC);
60204+ if (actual_len < 0)
60205+ return RETERR(actual_len);
60206+ *len = actual_len;
60207+ *start = search_start;
60208+ return 0;
60209+}
60210+
60211+static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
60212+ reiser4_block_nr * start,
60213+ reiser4_block_nr * len)
60214+{
60215+ reiser4_block_nr search_start;
60216+ reiser4_block_nr search_end;
60217+ int actual_len;
60218+
60219+ ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
60220+
60221+ assert("zam-969", super != NULL);
60222+ assert("zam-970", hint != NULL);
60223+ assert("zam-971", hint->blk <= reiser4_block_count(super));
60224+
60225+ search_start = hint->blk;
60226+ if (hint->max_dist == 0 || search_start <= hint->max_dist)
60227+ search_end = 0;
60228+ else
60229+ search_end = search_start - hint->max_dist;
60230+
60231+ actual_len =
60232+ bitmap_alloc_backward(&search_start, &search_end, 1, needed);
60233+ if (actual_len == 0)
60234+ return RETERR(-ENOSPC);
60235+ if (actual_len < 0)
60236+ return RETERR(actual_len);
60237+ *len = actual_len;
60238+ *start = search_start;
60239+ return 0;
60240+}
60241+
60242+/* plugin->u.space_allocator.alloc_blocks() */
60243+int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
60244+ reiser4_blocknr_hint * hint, int needed,
60245+ reiser4_block_nr * start, reiser4_block_nr * len)
60246+{
60247+ if (hint->backward)
60248+ return alloc_blocks_backward(hint, needed, start, len);
60249+ return alloc_blocks_forward(hint, needed, start, len);
60250+}
60251+
60252+/* plugin->u.space_allocator.dealloc_blocks(). */
60253+/* It just frees blocks in the WORKING BITMAP. Usually deletion of formatted and
60254+   unformatted nodes is deferred until transaction commit. However, deallocation
60255+   of temporary objects like wandered blocks and transaction commit records
60256+   requires immediate node deletion from the WORKING BITMAP. */
60257+void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
60258+ reiser4_block_nr start, reiser4_block_nr len)
60259+{
60260+ struct super_block *super = reiser4_get_current_sb();
60261+
60262+ bmap_nr_t bmap;
60263+ bmap_off_t offset;
60264+
60265+ struct bitmap_node *bnode;
60266+ int ret;
60267+
60268+ assert("zam-468", len != 0);
60269+ check_block_range(&start, &len);
60270+
60271+ parse_blocknr(&start, &bmap, &offset);
60272+
60273+ assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
60274+
60275+ bnode = get_bnode(super, bmap);
60276+
60277+ assert("zam-470", bnode != NULL);
60278+
60279+ ret = load_and_lock_bnode(bnode);
60280+ assert("zam-481", ret == 0);
60281+
60282+ reiser4_clear_bits(bnode_working_data(bnode), offset,
60283+ (bmap_off_t) (offset + len));
60284+
60285+ adjust_first_zero_bit(bnode, offset);
60286+
60287+ release_and_unlock_bnode(bnode);
60288+}
60289+
60290+/* plugin->u.space_allocator.check_blocks(). */
60291+void reiser4_check_blocks_bitmap(const reiser4_block_nr * start,
60292+ const reiser4_block_nr * len, int desired)
60293+{
60294+#if REISER4_DEBUG
60295+ struct super_block *super = reiser4_get_current_sb();
60296+
60297+ bmap_nr_t bmap;
60298+ bmap_off_t start_offset;
60299+ bmap_off_t end_offset;
60300+
60301+ struct bitmap_node *bnode;
60302+ int ret;
60303+
60304+ assert("zam-622", len != NULL);
60305+ check_block_range(start, len);
60306+ parse_blocknr(start, &bmap, &start_offset);
60307+
60308+ end_offset = start_offset + *len;
60309+ assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
60310+
60311+ bnode = get_bnode(super, bmap);
60312+
60313+ assert("nikita-2215", bnode != NULL);
60314+
60315+ ret = load_and_lock_bnode(bnode);
60316+ assert("zam-626", ret == 0);
60317+
60318+ assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
60319+
60320+ if (desired) {
60321+ assert("zam-623",
60322+ reiser4_find_next_zero_bit(bnode_working_data(bnode),
60323+ end_offset, start_offset)
60324+ >= end_offset);
60325+ } else {
60326+ assert("zam-624",
60327+ reiser4_find_next_set_bit(bnode_working_data(bnode),
60328+ end_offset, start_offset)
60329+ >= end_offset);
60330+ }
60331+
60332+ release_and_unlock_bnode(bnode);
60333+#endif
60334+}
60335+
60336+/* conditionally insert @node into the atom's overwrite set if it is not already there */
60337+static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
60338+{
60339+ assert("zam-546", atom != NULL);
60340+ assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
60341+ assert("zam-548", node != NULL);
60342+
60343+ spin_lock_atom(atom);
60344+ spin_lock_jnode(node);
60345+
60346+ if (node->atom == NULL) {
60347+ JF_SET(node, JNODE_OVRWR);
60348+ insert_into_atom_ovrwr_list(atom, node);
60349+ } else {
60350+ assert("zam-549", node->atom == atom);
60351+ }
60352+
60353+ spin_unlock_jnode(node);
60354+ spin_unlock_atom(atom);
60355+}
60356+
60357+/* an actor which applies the delete set to COMMIT bitmap pages and links
60358+   modified pages in a singly-linked list */
60359+static int
60360+apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
60361+ const reiser4_block_nr * len, void *data)
60362+{
60363+
60364+ bmap_nr_t bmap;
60365+ bmap_off_t offset;
60366+ int ret;
60367+
60368+ long long *blocks_freed_p = data;
60369+
60370+ struct bitmap_node *bnode;
60371+
60372+ struct super_block *sb = reiser4_get_current_sb();
60373+
60374+ check_block_range(start, len);
60375+
60376+ parse_blocknr(start, &bmap, &offset);
60377+
60378+ /* FIXME-ZAM: we assume that all block ranges are allocated by this
60379+ bitmap-based allocator and each block range can't go over a zone of
60380+ responsibility of one bitmap block; same assumption is used in
60381+ other journal hooks in bitmap code. */
60382+ bnode = get_bnode(sb, bmap);
60383+ assert("zam-448", bnode != NULL);
60384+
60385+	/* it is safe to unlock an atom which is in ASTAGE_PRE_COMMIT */
60386+ assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
60387+ ret = load_and_lock_bnode(bnode);
60388+ if (ret)
60389+ return ret;
60390+
60391+ /* put bnode into atom's overwrite set */
60392+ cond_add_to_overwrite_set(atom, bnode->cjnode);
60393+
60394+ data = bnode_commit_data(bnode);
60395+
60396+ ret = bnode_check_crc(bnode);
60397+ if (ret != 0)
60398+ return ret;
60399+
60400+ if (len != NULL) {
60401+ /* FIXME-ZAM: a check that all bits are set should be there */
60402+ assert("zam-443",
60403+ offset + *len <= bmap_bit_count(sb->s_blocksize));
60404+ reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
60405+
60406+ (*blocks_freed_p) += *len;
60407+ } else {
60408+ reiser4_clear_bit(offset, data);
60409+ (*blocks_freed_p)++;
60410+ }
60411+
60412+ bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
60413+
60414+ release_and_unlock_bnode(bnode);
60415+
60416+ return 0;
60417+}
60418+
60419+/* plugin->u.space_allocator.pre_commit_hook(). */
60420+/* It just applies transaction changes to the fs-wide COMMIT BITMAP, hoping the
60421+   rest is done by the transaction manager (allocating wandered locations for
60422+   COMMIT BITMAP blocks, copying COMMIT BITMAP block data). */
60423+/* Only one instance of this function can be running at any given time, because
60424+   only one transaction can be committed at a time; therefore it is safe to
60425+   access some global variables without any locking */
60426+
60427+int reiser4_pre_commit_hook_bitmap(void)
60428+{
60429+ struct super_block *super = reiser4_get_current_sb();
60430+ txn_atom *atom;
60431+
60432+ long long blocks_freed = 0;
60433+
60434+ atom = get_current_atom_locked();
60435+ assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
60436+ spin_unlock_atom(atom);
60437+
60438+	{ /* scan the atom's captured list, find all freshly allocated nodes,
60439+	   * and mark the corresponding bits in the COMMIT BITMAP as used */
60440+ struct list_head *head = ATOM_CLEAN_LIST(atom);
60441+ jnode *node = list_entry(head->next, jnode, capture_link);
60442+
60443+ while (head != &node->capture_link) {
60444+ /* we detect freshly allocated jnodes */
60445+ if (JF_ISSET(node, JNODE_RELOC)) {
60446+ int ret;
60447+ bmap_nr_t bmap;
60448+
60449+ bmap_off_t offset;
60450+ bmap_off_t index;
60451+ struct bitmap_node *bn;
60452+ __u32 size = bmap_size(super->s_blocksize);
60453+ __u32 crc;
60454+ char byte;
60455+
60456+ assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
60457+ assert("zam-460",
60458+ !reiser4_blocknr_is_fake(&node->blocknr));
60459+
60460+ parse_blocknr(&node->blocknr, &bmap, &offset);
60461+ bn = get_bnode(super, bmap);
60462+
60463+ index = offset >> 3;
60464+ assert("vpf-276", index < size);
60465+
60466+				ret = bnode_check_crc(bn);
60467+ if (ret != 0)
60468+ return ret;
60469+
60470+ check_bnode_loaded(bn);
60471+ load_and_lock_bnode(bn);
60472+
60473+ byte = *(bnode_commit_data(bn) + index);
60474+ reiser4_set_bit(offset, bnode_commit_data(bn));
60475+
60476+ crc = adler32_recalc(bnode_commit_crc(bn), byte,
60477+ *(bnode_commit_data(bn) +
60478+ index),
60479+						     size - index);
60480+ bnode_set_commit_crc(bn, crc);
60481+
60482+ release_and_unlock_bnode(bn);
60483+
60484+ ret = bnode_check_crc(bn);
60485+ if (ret != 0)
60486+ return ret;
60487+
60488+				/* the correctness of this depends on how the new
60489+				   jnode is inserted into the clean list, because
60490+				   we are scanning that same list now. It is OK
60491+				   if insertion is done at the list front */
60492+ cond_add_to_overwrite_set(atom, bn->cjnode);
60493+ }
60494+
60495+ node = list_entry(node->capture_link.next, jnode, capture_link);
60496+ }
60497+ }
60498+
60499+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
60500+ &blocks_freed, 0);
60501+
60502+ blocks_freed -= atom->nr_blocks_allocated;
60503+
60504+ {
60505+ reiser4_super_info_data *sbinfo;
60506+
60507+ sbinfo = get_super_private(super);
60508+
60509+ spin_lock_reiser4_super(sbinfo);
60510+ sbinfo->blocks_free_committed += blocks_freed;
60511+ spin_unlock_reiser4_super(sbinfo);
60512+ }
60513+
60514+ return 0;
60515+}
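/* The adler32_recalc() call above relies on the standard incremental
 * property of Adler-32: replacing the byte at 0-based position i of an
 * n-byte buffer changes A by (new - old) and B by (n - i) * (new - old),
 * both modulo 65521 -- which is why `size - index' is passed as the
 * last argument.  A self-contained sketch of that update (hypothetical
 * helper; the real adler32_recalc() may differ in interface): */
static inline __u32 example_adler32_replace(__u32 adler, unsigned char obyte,
					    unsigned char nbyte, __u32 tail)
{
	const __u32 MOD = 65521;
	__u32 a = adler & 0xffff;
	__u32 b = (adler >> 16) & 0xffff;
	/* (new - old) mod MOD, kept non-negative */
	__u32 delta = (MOD + nbyte - obyte) % MOD;

	a = (a + delta) % MOD;
	b = (b + (__u32)(((__u64)(tail % MOD) * delta) % MOD)) % MOD;
	return (b << 16) | a;
}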
60516+
60517+/* plugin->u.space_allocator.init_allocator
60518+ constructor of reiser4_space_allocator object. It is called on fs mount */
60519+int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator,
60520+ struct super_block *super, void *arg)
60521+{
60522+ struct bitmap_allocator_data *data = NULL;
60523+ bmap_nr_t bitmap_blocks_nr;
60524+ bmap_nr_t i;
60525+
60526+ assert("nikita-3039", reiser4_schedulable());
60527+
60528+ /* getting memory for bitmap allocator private data holder */
60529+ data =
60530+ kmalloc(sizeof(struct bitmap_allocator_data),
60531+ reiser4_ctx_gfp_mask_get());
60532+
60533+ if (data == NULL)
60534+ return RETERR(-ENOMEM);
60535+
60536+ /* allocation and initialization for the array of bnodes */
60537+ bitmap_blocks_nr = get_nr_bmap(super);
60538+
60539+	/* FIXME-ZAM: it is not clear what to do with a huge number of bitmaps
60540+	   which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
60541+	   may I never meet someone who still uses the ia32 architecture when
60542+	   storage devices of that size enter the market, and wants to use ia32
60543+	   with that storage device, much less reiser4. ;-) -Hans). kmalloc is
60544+	   not possible and, probably, another dynamic data structure should
60545+	   replace a static array of bnodes. */
60546+ /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */
60547+ data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
60548+ if (data->bitmap == NULL) {
60549+ kfree(data);
60550+ return RETERR(-ENOMEM);
60551+ }
60552+
60553+ for (i = 0; i < bitmap_blocks_nr; i++)
60554+ init_bnode(data->bitmap + i, super, i);
60555+
60556+ allocator->u.generic = data;
60557+
60558+#if REISER4_DEBUG
60559+ get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
60560+#endif
60561+
60562+ /* Load all bitmap blocks at mount time. */
60563+ if (!test_bit
60564+ (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
60565+ __u64 start_time, elapsed_time;
60566+ struct bitmap_node *bnode;
60567+ int ret;
60568+
60569+ if (REISER4_DEBUG)
60570+ printk(KERN_INFO "loading reiser4 bitmap...");
60571+ start_time = jiffies;
60572+
60573+ for (i = 0; i < bitmap_blocks_nr; i++) {
60574+ bnode = data->bitmap + i;
60575+ ret = load_and_lock_bnode(bnode);
60576+ if (ret) {
60577+ reiser4_destroy_allocator_bitmap(allocator,
60578+ super);
60579+ return ret;
60580+ }
60581+ release_and_unlock_bnode(bnode);
60582+ }
60583+
60584+ elapsed_time = jiffies - start_time;
60585+ if (REISER4_DEBUG)
60586+ printk("...done (%llu jiffies)\n",
60587+ (unsigned long long)elapsed_time);
60588+ }
60589+
60590+ return 0;
60591+}
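/* For scale (assuming CHECKSUM_SIZE is the 4 bytes of an Adler-32
 * checksum): one 4096-byte bitmap block tracks (4096 - 4) * 8 = 32736
 * blocks, so a 1 TiB device of 4 KiB blocks (268435456 blocks) needs
 * 8201 bitmap blocks, and the vmalloc above reserves
 * 8201 * sizeof(struct bitmap_node) bytes. */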
60592+
60593+/* plugin->u.space_allocator.destroy_allocator
60594+ destructor. It is called on fs unmount */
60595+int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
60596+ struct super_block *super)
60597+{
60598+ bmap_nr_t bitmap_blocks_nr;
60599+ bmap_nr_t i;
60600+
60601+ struct bitmap_allocator_data *data = allocator->u.generic;
60602+
60603+ assert("zam-414", data != NULL);
60604+ assert("zam-376", data->bitmap != NULL);
60605+
60606+ bitmap_blocks_nr = get_nr_bmap(super);
60607+
60608+ for (i = 0; i < bitmap_blocks_nr; i++) {
60609+ struct bitmap_node *bnode = data->bitmap + i;
60610+
60611+ mutex_lock(&bnode->mutex);
60612+
60613+#if REISER4_DEBUG
60614+ if (atomic_read(&bnode->loaded)) {
60615+ jnode *wj = bnode->wjnode;
60616+ jnode *cj = bnode->cjnode;
60617+
60618+ assert("zam-480", jnode_page(cj) != NULL);
60619+ assert("zam-633", jnode_page(wj) != NULL);
60620+
60621+ assert("zam-634",
60622+			       memcmp(jdata(wj), jdata(cj),
60623+ bmap_size(super->s_blocksize)) == 0);
60624+
60625+ }
60626+#endif
60627+ done_bnode(bnode);
60628+ mutex_unlock(&bnode->mutex);
60629+ }
60630+
60631+ vfree(data->bitmap);
60632+ kfree(data);
60633+
60634+ allocator->u.generic = NULL;
60635+
60636+ return 0;
60637+}
60638+
60639+/*
60640+ * Local variables:
60641+ * c-indentation-style: "K&R"
60642+ * mode-name: "LC"
60643+ * c-basic-offset: 8
60644+ * tab-width: 8
60645+ * fill-column: 79
60646+ * scroll-step: 1
60647+ * End:
60648+ */
60649diff -urN linux-2.6.24.orig/fs/reiser4/plugin/space/bitmap.h linux-2.6.24/fs/reiser4/plugin/space/bitmap.h
60650--- linux-2.6.24.orig/fs/reiser4/plugin/space/bitmap.h 1970-01-01 03:00:00.000000000 +0300
60651+++ linux-2.6.24/fs/reiser4/plugin/space/bitmap.h 2008-01-25 11:39:07.068241692 +0300
60652@@ -0,0 +1,47 @@
60653+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60654+
60655+#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
60656+#define __REISER4_PLUGIN_SPACE_BITMAP_H__
60657+
60658+#include "../../dformat.h"
60659+#include "../../block_alloc.h"
60660+
60661+#include <linux/types.h> /* for __u?? */
60662+#include <linux/fs.h> /* for struct super_block */
60663+/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
60664+/* declarations of functions implementing methods of space allocator plugin for
60665+ bitmap based allocator. The functions themselves are in bitmap.c */
60666+extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
60667+ struct super_block *, void *);
60668+extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
60669+ struct super_block *);
60670+extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
60671+ reiser4_blocknr_hint *, int needed,
60672+ reiser4_block_nr * start,
60673+ reiser4_block_nr * len);
60674+extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
60675+ const reiser4_block_nr *, int);
60676+extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
60677+ reiser4_block_nr,
60678+ reiser4_block_nr);
60679+extern int reiser4_pre_commit_hook_bitmap(void);
60680+
60681+#define reiser4_post_commit_hook_bitmap() do{}while(0)
60682+#define reiser4_post_write_back_hook_bitmap() do{}while(0)
60683+#define reiser4_print_info_bitmap(pref, al) do{}while(0)
60684+
60685+typedef __u64 bmap_nr_t;
60686+typedef __u32 bmap_off_t;
60687+
60688+#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
60689+
60690+/* Make Linus happy.
60691+ Local variables:
60692+ c-indentation-style: "K&R"
60693+ mode-name: "LC"
60694+ c-basic-offset: 8
60695+ tab-width: 8
60696+ fill-column: 120
60697+ scroll-step: 1
60698+ End:
60699+*/
60700diff -urN linux-2.6.24.orig/fs/reiser4/plugin/space/Makefile linux-2.6.24/fs/reiser4/plugin/space/Makefile
60701--- linux-2.6.24.orig/fs/reiser4/plugin/space/Makefile 1970-01-01 03:00:00.000000000 +0300
60702+++ linux-2.6.24/fs/reiser4/plugin/space/Makefile 2008-01-25 11:39:07.068241692 +0300
60703@@ -0,0 +1,4 @@
60704+obj-$(CONFIG_REISER4_FS) += space_plugins.o
60705+
60706+space_plugins-objs := \
60707+ bitmap.o
60708diff -urN linux-2.6.24.orig/fs/reiser4/plugin/space/space_allocator.h linux-2.6.24/fs/reiser4/plugin/space/space_allocator.h
60709--- linux-2.6.24.orig/fs/reiser4/plugin/space/space_allocator.h 1970-01-01 03:00:00.000000000 +0300
60710+++ linux-2.6.24/fs/reiser4/plugin/space/space_allocator.h 2008-01-25 11:39:07.068241692 +0300
60711@@ -0,0 +1,80 @@
60712+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60713+
60714+#ifndef __SPACE_ALLOCATOR_H__
60715+#define __SPACE_ALLOCATOR_H__
60716+
60717+#include "../../forward.h"
60718+#include "bitmap.h"
60719+/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
60720+ * but... */
60721+#define DEF_SPACE_ALLOCATOR(allocator) \
60722+ \
60723+static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \
60724+{ \
60725+ return reiser4_init_allocator_##allocator (al, s, opaque); \
60726+} \
60727+ \
60728+static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \
60729+{ \
60730+ reiser4_destroy_allocator_##allocator (al, s); \
60731+} \
60732+ \
60733+static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \
60734+ int needed, reiser4_block_nr * start, reiser4_block_nr * len) \
60735+{ \
60736+ return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len); \
60737+} \
60738+static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \
60739+{ \
60740+ reiser4_dealloc_blocks_##allocator (al, start, len); \
60741+} \
60742+ \
60743+static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \
60744+{ \
60745+ reiser4_check_blocks_##allocator (start, end, desired); \
60746+} \
60747+ \
60748+static inline void sa_pre_commit_hook (void) \
60749+{ \
60750+ reiser4_pre_commit_hook_##allocator (); \
60751+} \
60752+ \
60753+static inline void sa_post_commit_hook (void) \
60754+{ \
60755+ reiser4_post_commit_hook_##allocator (); \
60756+} \
60757+ \
60758+static inline void sa_post_write_back_hook (void) \
60759+{ \
60760+ reiser4_post_write_back_hook_##allocator(); \
60761+} \
60762+ \
60763+static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \
60764+{ \
60765+ reiser4_print_info_##allocator (prefix, al); \
60766+}
60767+
60768+DEF_SPACE_ALLOCATOR(bitmap)
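/* For illustration, the first of the wrappers generated by
 * DEF_SPACE_ALLOCATOR(bitmap) above expands, after token pasting, to:
 *
 *	static inline int sa_init_allocator(reiser4_space_allocator *al,
 *					    struct super_block *s, void *opaque)
 *	{
 *		return reiser4_init_allocator_bitmap(al, s, opaque);
 *	}
 */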
60769+
60770+/* this object is part of reiser4 private in-core super block */
60771+struct reiser4_space_allocator {
60772+ union {
60773+ /* space allocators might use this pointer to reference their
60774+ * data. */
60775+ void *generic;
60776+ } u;
60777+};
60778+
60779+/* __SPACE_ALLOCATOR_H__ */
60780+#endif
60781+
60782+/* Make Linus happy.
60783+ Local variables:
60784+ c-indentation-style: "K&R"
60785+ mode-name: "LC"
60786+ c-basic-offset: 8
60787+ tab-width: 8
60788+ fill-column: 120
60789+ scroll-step: 1
60790+ End:
60791+*/
60792diff -urN linux-2.6.24.orig/fs/reiser4/plugin/tail_policy.c linux-2.6.24/fs/reiser4/plugin/tail_policy.c
60793--- linux-2.6.24.orig/fs/reiser4/plugin/tail_policy.c 1970-01-01 03:00:00.000000000 +0300
60794+++ linux-2.6.24/fs/reiser4/plugin/tail_policy.c 2008-01-25 11:39:07.068241692 +0300
60795@@ -0,0 +1,113 @@
60796+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60797+ * reiser4/README */
60798+
60799+/* Formatting policy plugins */
60800+
60801+/*
60802+ * Formatting policy plugin is used by object plugin (of regular file) to
60803+ * convert file between two representations.
60804+ *
60805+ * Currently following policies are implemented:
60806+ * never store file in formatted nodes
60807+ * always store file in formatted nodes
60808+ * store file in formatted nodes if file is smaller than 4 blocks (default)
60809+ */
60810+
60811+#include "../tree.h"
60812+#include "../inode.h"
60813+#include "../super.h"
60814+#include "object.h"
60815+#include "plugin.h"
60816+#include "node/node.h"
60817+#include "plugin_header.h"
60818+
60819+#include <linux/pagemap.h>
60820+#include <linux/fs.h> /* For struct inode */
60821+
60822+/**
60823+ * have_formatting_never - formatting policy that never creates tails
60824+ * @inode: inode to operate on
60825+ * @size: new object size
60826+ *
60827+ * Always returns 0: the file body is never stored in formatted (tail) items.
60828+ */
60829+/* Never store file's tail as direct item */
60830+/* Audited by: green(2002.06.12) */
60831+static int have_formatting_never(const struct inode *inode UNUSED_ARG
60832+ /* inode to operate on */ ,
60833+ loff_t size UNUSED_ARG /* new object size */ )
60834+{
60835+ return 0;
60836+}
60837+
60838+/* Always store file's tail as direct item */
60839+/* Audited by: green(2002.06.12) */
60840+static int
60841+have_formatting_always(const struct inode *inode UNUSED_ARG
60842+ /* inode to operate on */ ,
60843+ loff_t size UNUSED_ARG /* new object size */ )
60844+{
60845+ return 1;
60846+}
60847+
60848+/* This function tests whether the file denoted by @inode should be stored as
60849+   tails only or as extents only. */
60850+static int
60851+have_formatting_default(const struct inode *inode UNUSED_ARG
60852+ /* inode to operate on */ ,
60853+ loff_t size /* new object size */ )
60854+{
60855+ assert("umka-1253", inode != NULL);
60856+
60857+ if (size > inode->i_sb->s_blocksize * 4)
60858+ return 0;
60859+
60860+ return 1;
60861+}
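/* Worked example of the default policy with 4096-byte blocks: the
 * threshold is 4 * 4096 = 16384 bytes, so a 16384-byte file is still
 * stored in formatted (tail) items, while a 16385-byte file is stored
 * as extents. */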
60862+
60863+/* tail plugins */
60864+formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
60865+ [NEVER_TAILS_FORMATTING_ID] = {
60866+ .h = {
60867+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60868+ .id = NEVER_TAILS_FORMATTING_ID,
60869+ .pops = NULL,
60870+ .label = "never",
60871+ .desc = "Never store file's tail",
60872+ .linkage = {NULL, NULL}
60873+ },
60874+ .have_tail = have_formatting_never
60875+ },
60876+ [ALWAYS_TAILS_FORMATTING_ID] = {
60877+ .h = {
60878+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60879+ .id = ALWAYS_TAILS_FORMATTING_ID,
60880+ .pops = NULL,
60881+ .label = "always",
60882+ .desc = "Always store file's tail",
60883+ .linkage = {NULL, NULL}
60884+ },
60885+ .have_tail = have_formatting_always
60886+ },
60887+ [SMALL_FILE_FORMATTING_ID] = {
60888+ .h = {
60889+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60890+ .id = SMALL_FILE_FORMATTING_ID,
60891+ .pops = NULL,
60892+ .label = "4blocks",
60893+			.desc = "Store files shorter than 4 blocks in tail items",
60894+ .linkage = {NULL, NULL}
60895+ },
60896+ .have_tail = have_formatting_default
60897+ }
60898+};
60899+
60900+/*
60901+ * Local variables:
60902+ * c-indentation-style: "K&R"
60903+ * mode-name: "LC"
60904+ * c-basic-offset: 8
60905+ * tab-width: 8
60906+ * fill-column: 79
60907+ * End:
60908+ */
60909diff -urN linux-2.6.24.orig/fs/reiser4/pool.c linux-2.6.24/fs/reiser4/pool.c
60910--- linux-2.6.24.orig/fs/reiser4/pool.c 1970-01-01 03:00:00.000000000 +0300
60911+++ linux-2.6.24/fs/reiser4/pool.c 2008-01-25 11:39:07.072242722 +0300
60912@@ -0,0 +1,231 @@
60913+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60914+ * reiser4/README */
60915+
60916+/* Fast pool allocation.
60917+
60918+   There are situations when some sub-system normally asks the memory allocator
60919+   for only a few objects, but under some circumstances could require many
60920+   more. The typical and actually motivating example is tree balancing. It needs
60921+   to keep track of nodes that were involved in it, and it is well-known
60922+   that in a reasonably packed balanced tree most (92.938121%) of all
60923+   balancings end up after working with only a few nodes (3.141592 on
60924+   average). But in rare cases balancing can involve many more nodes
60925+   (3*tree_height+1 in the extreme situation).
60926+
60927+   On the one hand, we don't want to resort to dynamic allocation (slab,
60928+   malloc(), etc.) to allocate the data structures required to keep track of
60929+   nodes during balancing. On the other hand, we cannot statically allocate
60930+   the required amount of space on the stack, because first: it is a useless
60931+   waste of a precious resource, and second: this amount is unknown in advance
60932+   (tree height can change).
60933+
60934+   Pools, implemented in this file, are a solution to this problem (see
60935+   the usage sketch after reiser4_add_obj() below):
60936+
60937+   - a configurable number of objects is statically preallocated on the stack
60938+
60939+   - if this preallocated pool is exhausted and more objects are requested,
60940+   they are allocated dynamically.
60941+
60942+ Pools encapsulate distinction between statically and dynamically allocated
60943+ objects. Both allocation and recycling look exactly the same.
60944+
60945+ To keep track of dynamically allocated objects, pool adds its own linkage
60946+ to each object.
60947+
60948+ NOTE-NIKITA This linkage also contains some balancing-specific data. This
60949+ is not perfect. On the other hand, balancing is currently the only client
60950+ of pool code.
60951+
60952+ NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
60953+ functions in the style of tslist/tshash, i.e., make them unreadable, but
60954+ type-safe.
60955+
60956+*/
60957+
60958+#include "debug.h"
60959+#include "pool.h"
60960+#include "super.h"
60961+
60962+#include <linux/types.h>
60963+#include <linux/err.h>
60964+
60965+/* initialize new pool object @h */
60966+static void reiser4_init_pool_obj(struct reiser4_pool_header * h)
60967+{
60968+ INIT_LIST_HEAD(&h->usage_linkage);
60969+ INIT_LIST_HEAD(&h->level_linkage);
60970+ INIT_LIST_HEAD(&h->extra_linkage);
60971+}
60972+
60973+/* initialize new pool */
60974+void reiser4_init_pool(struct reiser4_pool * pool /* pool to initialize */ ,
60975+ size_t obj_size /* size of objects in @pool */ ,
60976+ int num_of_objs /* number of preallocated objects */ ,
60977+ char *data /* area for preallocated objects */ )
60978+{
60979+ struct reiser4_pool_header *h;
60980+ int i;
60981+
60982+ assert("nikita-955", pool != NULL);
60983+ assert("nikita-1044", obj_size > 0);
60984+ assert("nikita-956", num_of_objs >= 0);
60985+ assert("nikita-957", data != NULL);
60986+
60987+ memset(pool, 0, sizeof *pool);
60988+ pool->obj_size = obj_size;
60989+ pool->data = data;
60990+ INIT_LIST_HEAD(&pool->free);
60991+ INIT_LIST_HEAD(&pool->used);
60992+ INIT_LIST_HEAD(&pool->extra);
60993+ memset(data, 0, obj_size * num_of_objs);
60994+ for (i = 0; i < num_of_objs; ++i) {
60995+ h = (struct reiser4_pool_header *) (data + i * obj_size);
60996+ reiser4_init_pool_obj(h);
60997+ /* add pool header to the end of pool's free list */
60998+ list_add_tail(&h->usage_linkage, &pool->free);
60999+ }
61000+}
61001+
61002+/* release pool resources
61003+
61004+ Release all resources acquired by this pool, specifically, dynamically
61005+ allocated objects.
61006+
61007+*/
61008+void reiser4_done_pool(struct reiser4_pool * pool UNUSED_ARG)
61009+{
61010+}
61011+
61012+/* allocate carry object from @pool
61013+
61014+ First, try to get preallocated object. If this fails, resort to dynamic
61015+ allocation.
61016+
61017+*/
61018+static void *reiser4_pool_alloc(struct reiser4_pool * pool)
61019+{
61020+ struct reiser4_pool_header *result;
61021+
61022+ assert("nikita-959", pool != NULL);
61023+
61024+ if (!list_empty(&pool->free)) {
61025+ struct list_head *linkage;
61026+
61027+ linkage = pool->free.next;
61028+ list_del(linkage);
61029+ INIT_LIST_HEAD(linkage);
61030+ result = list_entry(linkage, struct reiser4_pool_header,
61031+ usage_linkage);
61032+ BUG_ON(!list_empty(&result->level_linkage) ||
61033+ !list_empty(&result->extra_linkage));
61034+ } else {
61035+		/* pool is empty. Extra allocations don't deserve a dedicated
61036+		   slab to be served from, as they are expected to be rare. */
61037+		result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get());
61038+		if (result != NULL) {
61039+ reiser4_init_pool_obj(result);
61040+ list_add(&result->extra_linkage, &pool->extra);
61041+ } else
61042+ return ERR_PTR(RETERR(-ENOMEM));
61043+ BUG_ON(!list_empty(&result->usage_linkage) ||
61044+ !list_empty(&result->level_linkage));
61045+ }
61046+ ++pool->objs;
61047+ list_add(&result->usage_linkage, &pool->used);
61048+ memset(result + 1, 0, pool->obj_size - sizeof *result);
61049+ return result;
61050+}
61051+
61052+/* return object back to the pool */
61053+void reiser4_pool_free(struct reiser4_pool * pool,
61054+ struct reiser4_pool_header * h)
61055+{
61056+ assert("nikita-961", h != NULL);
61057+ assert("nikita-962", pool != NULL);
61058+
61059+ --pool->objs;
61060+ assert("nikita-963", pool->objs >= 0);
61061+
61062+ list_del_init(&h->usage_linkage);
61063+ list_del_init(&h->level_linkage);
61064+
61065+ if (list_empty(&h->extra_linkage))
61066+ /*
61067+ * pool header is not an extra one. Push it onto free list
61068+ * using usage_linkage
61069+ */
61070+ list_add(&h->usage_linkage, &pool->free);
61071+ else {
61072+ /* remove pool header from pool's extra list and kfree it */
61073+ list_del(&h->extra_linkage);
61074+ kfree(h);
61075+ }
61076+}
61077+
61078+/* add new object to the carry level list
61079+
61080+   The carry level is FIFO most of the time, but not always. Complications arise
61081+   when the make_space() function tries to go to the left neighbor and thus adds
61082+   a carry node before existing nodes, and also, when updating delimiting keys
61083+   after moving data between two nodes, we want the left node to be locked before
61084+   the right node.
61085+
61086+   The latter case is confusing at first glance. The problem is that the COP_UPDATE
61087+   operation that updates delimiting keys is sometimes called with two nodes
61088+   (when data are moved between two nodes) and sometimes with only one node
61089+   (when the leftmost item is deleted in a node). In any case the operation is
61090+   supplied with at least the node whose left delimiting key is to be updated
61091+   (that is, the "right" node).
61092+
61093+ @pool - from which to allocate new object;
61094+ @list - where to add object;
61095+ @reference - after (or before) which existing object to add
61096+*/
61097+struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool * pool,
61098+ struct list_head *list,
61099+ pool_ordering order,
61100+ struct reiser4_pool_header * reference)
61101+{
61102+ struct reiser4_pool_header *result;
61103+
61104+ assert("nikita-972", pool != NULL);
61105+
61106+ result = reiser4_pool_alloc(pool);
61107+ if (IS_ERR(result))
61108+ return result;
61109+
61110+ assert("nikita-973", result != NULL);
61111+
61112+ switch (order) {
61113+ case POOLO_BEFORE:
61114+ __list_add(&result->level_linkage,
61115+ reference->level_linkage.prev,
61116+ &reference->level_linkage);
61117+ break;
61118+ case POOLO_AFTER:
61119+ __list_add(&result->level_linkage,
61120+ &reference->level_linkage,
61121+ reference->level_linkage.next);
61122+ break;
61123+ case POOLO_LAST:
61124+ list_add_tail(&result->level_linkage, list);
61125+ break;
61126+ case POOLO_FIRST:
61127+ list_add(&result->level_linkage, list);
61128+ break;
61129+ default:
61130+ wrong_return_value("nikita-927", "order");
61131+ }
61132+ return result;
61133+}
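/* A minimal usage sketch of the pool interface (a hypothetical caller,
 * not code from this file): an object type embeds reiser4_pool_header
 * as its first member, four objects are preallocated on the stack, and
 * any further allocation silently falls back to kmalloc(). */
struct example_item {
	struct reiser4_pool_header header;	/* must be the first member */
	int payload;
};

static void example_pool_user(void)
{
	struct reiser4_pool pool;
	char area[4 * sizeof(struct example_item)];
	LIST_HEAD(level);	/* the "carry level" list for this example */
	struct reiser4_pool_header *obj;

	reiser4_init_pool(&pool, sizeof(struct example_item), 4, area);
	obj = reiser4_add_obj(&pool, &level, POOLO_LAST, NULL);
	if (!IS_ERR(obj))
		reiser4_pool_free(&pool, obj);
	reiser4_done_pool(&pool);
}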
61134+
61135+/* Make Linus happy.
61136+ Local variables:
61137+ c-indentation-style: "K&R"
61138+ mode-name: "LC"
61139+ c-basic-offset: 8
61140+ tab-width: 8
61141+ fill-column: 120
61142+ End:
61143+*/
61144diff -urN linux-2.6.24.orig/fs/reiser4/pool.h linux-2.6.24/fs/reiser4/pool.h
61145--- linux-2.6.24.orig/fs/reiser4/pool.h 1970-01-01 03:00:00.000000000 +0300
61146+++ linux-2.6.24/fs/reiser4/pool.h 2008-01-25 11:39:07.072242722 +0300
61147@@ -0,0 +1,56 @@
61148+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61149+
61150+/* Fast pool allocation */
61151+
61152+#ifndef __REISER4_POOL_H__
61153+#define __REISER4_POOL_H__
61154+
61155+#include <linux/types.h>
61156+
61157+struct reiser4_pool {
61158+ size_t obj_size;
61159+ int objs;
61160+ char *data;
61161+ struct list_head free;
61162+ struct list_head used;
61163+ struct list_head extra;
61164+};
61165+
61166+struct reiser4_pool_header {
61167+ /* object is either on free or "used" lists */
61168+ struct list_head usage_linkage;
61169+ struct list_head level_linkage;
61170+ struct list_head extra_linkage;
61171+};
61172+
61173+typedef enum {
61174+ POOLO_BEFORE,
61175+ POOLO_AFTER,
61176+ POOLO_LAST,
61177+ POOLO_FIRST
61178+} pool_ordering;
61179+
61180+/* pool manipulation functions */
61181+
61182+extern void reiser4_init_pool(struct reiser4_pool * pool, size_t obj_size,
61183+ int num_of_objs, char *data);
61184+extern void reiser4_done_pool(struct reiser4_pool * pool);
61185+extern void reiser4_pool_free(struct reiser4_pool * pool,
61186+ struct reiser4_pool_header * h);
61187+struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool * pool,
61188+ struct list_head * list,
61189+ pool_ordering order,
61190+ struct reiser4_pool_header *reference);
61191+
61192+/* __REISER4_POOL_H__ */
61193+#endif
61194+
61195+/* Make Linus happy.
61196+ Local variables:
61197+ c-indentation-style: "K&R"
61198+ mode-name: "LC"
61199+ c-basic-offset: 8
61200+ tab-width: 8
61201+ fill-column: 120
61202+ End:
61203+*/
61204diff -urN linux-2.6.24.orig/fs/reiser4/readahead.c linux-2.6.24/fs/reiser4/readahead.c
61205--- linux-2.6.24.orig/fs/reiser4/readahead.c 1970-01-01 03:00:00.000000000 +0300
61206+++ linux-2.6.24/fs/reiser4/readahead.c 2008-01-25 11:39:07.072242722 +0300
61207@@ -0,0 +1,138 @@
61208+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61209+ * reiser4/README */
61210+
61211+#include "forward.h"
61212+#include "tree.h"
61213+#include "tree_walk.h"
61214+#include "super.h"
61215+#include "inode.h"
61216+#include "key.h"
61217+#include "znode.h"
61218+
61219+#include <linux/swap.h> /* for totalram_pages */
61220+
61221+void reiser4_init_ra_info(ra_info_t * rai)
61222+{
61223+ rai->key_to_stop = *reiser4_min_key();
61224+}
61225+
61226+/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */
61227+static inline int ra_adjacent_only(int flags)
61228+{
61229+ return flags & RA_ADJACENT_ONLY;
61230+}
61231+
61232+/* this is used by formatted_readahead to decide whether a read for the right neighbor of a node is to be issued.
61233+   It returns 1 if the right neighbor's first key is less than or equal to the readahead's stop key */
61234+static int should_readahead_neighbor(znode * node, ra_info_t * info)
61235+{
61236+ int result;
61237+
61238+ read_lock_dk(znode_get_tree(node));
61239+ result = keyle(znode_get_rd_key(node), &info->key_to_stop);
61240+ read_unlock_dk(znode_get_tree(node));
61241+ return result;
61242+}
61243+
61244+#define LOW_MEM_PERCENTAGE (5)
61245+
61246+static int low_on_memory(void)
61247+{
61248+ unsigned int freepages;
61249+
61250+ freepages = nr_free_pages();
61251+ return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
61252+}
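/* e.g. with totalram_pages == 262144 (1 GiB of 4 KiB pages), readahead
 * is suppressed once fewer than 262144 * 5 / 100 = 13107 pages remain
 * free. */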
61253+
61254+/* start read for @node and for a few of its right neighbors */
61255+void formatted_readahead(znode * node, ra_info_t * info)
61256+{
61257+ struct formatted_ra_params *ra_params;
61258+ znode *cur;
61259+ int i;
61260+ int grn_flags;
61261+ lock_handle next_lh;
61262+
61263+	/* do nothing if a block number has not been assigned to the node (which means it exists only in cache). */
61264+ if (reiser4_blocknr_is_fake(znode_get_block(node)))
61265+ return;
61266+
61267+ ra_params = get_current_super_ra_params();
61268+
61269+ if (znode_page(node) == NULL)
61270+ jstartio(ZJNODE(node));
61271+
61272+ if (znode_get_level(node) != LEAF_LEVEL)
61273+ return;
61274+
61275+ /* don't waste memory for read-ahead when low on memory */
61276+ if (low_on_memory())
61277+ return;
61278+
61279+	/* We can have locked nodes on upper tree levels; in this situation lock
61280+	   priorities do not help to resolve deadlocks, so we have to use TRY_LOCK
61281+	   here. */
61282+ grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
61283+
61284+ i = 0;
61285+ cur = zref(node);
61286+ init_lh(&next_lh);
61287+ while (i < ra_params->max) {
61288+ const reiser4_block_nr *nextblk;
61289+
61290+ if (!should_readahead_neighbor(cur, info))
61291+ break;
61292+
61293+ if (reiser4_get_right_neighbor
61294+ (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
61295+ break;
61296+
61297+ nextblk = znode_get_block(next_lh.node);
61298+ if (reiser4_blocknr_is_fake(nextblk) ||
61299+ (ra_adjacent_only(ra_params->flags)
61300+ && *nextblk != *znode_get_block(cur) + 1)) {
61301+ break;
61302+ }
61303+
61304+ zput(cur);
61305+ cur = zref(next_lh.node);
61306+ done_lh(&next_lh);
61307+ if (znode_page(cur) == NULL)
61308+ jstartio(ZJNODE(cur));
61309+ else
61310+			/* Do not scan the read-ahead window if pages are already
61311+			 * allocated (and i/o has already started). */
61312+ break;
61313+
61314+ i++;
61315+ }
61316+ zput(cur);
61317+ done_lh(&next_lh);
61318+}
61319+
61320+void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap)
61321+{
61322+ reiser4_key *stop_key;
61323+
61324+ assert("nikita-3542", dir != NULL);
61325+ assert("nikita-3543", tap != NULL);
61326+
61327+ stop_key = &tap->ra_info.key_to_stop;
61328+	/* initialize readdir readahead information: include in the readahead
61329+	 * the stat data of all files of the directory */
61330+ set_key_locality(stop_key, get_inode_oid(dir));
61331+ set_key_type(stop_key, KEY_SD_MINOR);
61332+ set_key_ordering(stop_key, get_key_ordering(reiser4_max_key()));
61333+ set_key_objectid(stop_key, get_key_objectid(reiser4_max_key()));
61334+ set_key_offset(stop_key, get_key_offset(reiser4_max_key()));
61335+}
61336+
61337+/*
61338+ Local variables:
61339+ c-indentation-style: "K&R"
61340+ mode-name: "LC"
61341+ c-basic-offset: 8
61342+ tab-width: 8
61343+ fill-column: 80
61344+ End:
61345+*/
61346diff -urN linux-2.6.24.orig/fs/reiser4/readahead.h linux-2.6.24/fs/reiser4/readahead.h
61347--- linux-2.6.24.orig/fs/reiser4/readahead.h 1970-01-01 03:00:00.000000000 +0300
61348+++ linux-2.6.24/fs/reiser4/readahead.h 2008-01-25 11:39:07.072242722 +0300
61349@@ -0,0 +1,51 @@
61350+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61351+
61352+#ifndef __READAHEAD_H__
61353+#define __READAHEAD_H__
61354+
61355+#include "key.h"
61356+
61357+typedef enum {
61358+ RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent.
61359+ Default is NO (not only adjacent) */
61360+} ra_global_flags;
61361+
61362+/* reiser4 super block has a field of this type.
61363+ It controls readahead during tree traversals */
61364+struct formatted_ra_params {
61365+ unsigned long max; /* request not more than this amount of nodes.
61366+ Default is totalram_pages / 4 */
61367+ int flags;
61368+};
61369+
61370+typedef struct {
61371+ reiser4_key key_to_stop;
61372+} ra_info_t;
61373+
61374+void formatted_readahead(znode *, ra_info_t *);
61375+void reiser4_init_ra_info(ra_info_t * rai);
61376+
61377+struct reiser4_file_ra_state {
61378+ loff_t start; /* Current window */
61379+ loff_t size;
61380+ loff_t next_size; /* Next window size */
61381+ loff_t ahead_start; /* Ahead window */
61382+ loff_t ahead_size;
61383+ loff_t max_window_size; /* Maximum readahead window */
61384+ loff_t slow_start; /* enlarging r/a size algorithm. */
61385+};
61386+
61387+extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap);
61388+
61389+/* __READAHEAD_H__ */
61390+#endif
61391+
61392+/*
61393+ Local variables:
61394+ c-indentation-style: "K&R"
61395+ mode-name: "LC"
61396+ c-basic-offset: 8
61397+ tab-width: 8
61398+ fill-column: 120
61399+ End:
61400+*/
61401diff -urN linux-2.6.24.orig/fs/reiser4/README linux-2.6.24/fs/reiser4/README
61402--- linux-2.6.24.orig/fs/reiser4/README 1970-01-01 03:00:00.000000000 +0300
61403+++ linux-2.6.24/fs/reiser4/README 2008-01-25 11:39:07.076243753 +0300
61404@@ -0,0 +1,128 @@
61405+[LICENSING]
61406+
61407+Reiser4 is hereby licensed under the GNU General
61408+Public License version 2.
61409+
61410+Source code files that contain the phrase "licensing governed by
61411+reiser4/README" are "governed files" throughout this file. Governed
61412+files are licensed under the GPL. The portions of them owned by Hans
61413+Reiser, or authorized to be licensed by him, have been in the past,
61414+and likely will be in the future, licensed to other parties under
61415+other licenses. If you add your code to governed files, and don't
61416+want it to be owned by Hans Reiser, put your copyright label on that
61417+code so the poor blight and his customers can keep things straight.
61418+All portions of governed files not labeled otherwise are owned by Hans
61419+Reiser, and by adding your code to it, widely distributing it to
61420+others or sending us a patch, and leaving the sentence in stating that
61421+licensing is governed by the statement in this file, you accept this.
61422+It will be a kindness if you identify whether Hans Reiser is allowed
61423+to license code labeled as owned by you on your behalf other than
61424+under the GPL, because he wants to know if it is okay to do so and put
61425+a check in the mail to you (for non-trivial improvements) when he
61426+makes his next sale. He makes no guarantees as to the amount if any,
61427+though he feels motivated to motivate contributors, and you can surely
61428+discuss this with him before or after contributing. You have the
61429+right to decline to allow him to license your code contribution other
61430+than under the GPL.
61431+
61432+Further licensing options are available for commercial and/or other
61433+interests directly from Hans Reiser: reiser@namesys.com. If you interpret
61434+the GPL as not allowing those additional licensing options, you read
61435+it wrongly, and Richard Stallman agrees with me, when carefully read
61436+you can see that those restrictions on additional terms do not apply
61437+to the owner of the copyright, and my interpretation of this shall
61438+govern for this license.
61439+
61440+[END LICENSING]
61441+
61442+Reiser4 is a file system based on dancing tree algorithms, and is
61443+described at http://www.namesys.com
61444+
61445+mkfs.reiser4 and other utilities are on our webpage or wherever your
61446+Linux provider put them. You really want to be running the latest
61447+version off the website if you use fsck.
61448+
61449+Yes, if you update your reiser4 kernel module you do have to
61450+recompile your kernel, most of the time. The errors you get will be
61451+quite cryptic if you forget to do so.
61452+
61453+Hideous Commercial Pitch: Spread your development costs across other OS
61454+vendors. Select from the best in the world, not the best in your
61455+building, by buying from third party OS component suppliers. Leverage
61456+the software component development power of the internet. Be the most
61457+aggressive in taking advantage of the commercial possibilities of
61458+decentralized internet development, and add value through your branded
61459+integration that you sell as an operating system. Let your competitors
61460+be the ones to compete against the entire internet by themselves. Be
61461+hip, get with the new economic trend, before your competitors do. Send
61462+email to reiser@namesys.com
61463+
61464+Hans Reiser was the primary architect of Reiser4, but a whole team
61465+chipped their ideas in. He invested everything he had into Namesys
61466+for 5.5 dark years of no money before Reiser3 finally started to work well
61467+enough to bring in money. He owns the copyright.
61468+
61469+DARPA was the primary sponsor of Reiser4. DARPA does not endorse
61470+Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal
61471+opinion, unique in its willingness to invest into things more
61472+theoretical than the VC community can readily understand, and more
61473+longterm than allows them to be sure that they will be the ones to
61474+extract the economic benefits from. DARPA also integrated us into a
61475+security community that transformed our security worldview.
61476+
61477+Vladimir Saveliev is our lead programmer, with us from the beginning,
61478+and he worked long hours writing the cleanest code. This is why he is
61479+now the lead programmer after years of commitment to our work. He
61480+always made the effort to be the best he could be, and to make his
61481+code the best that it could be. What resulted was quite remarkable. I
61482+don't think that money can ever motivate someone to work the way he
61483+did, he is one of the most selfless men I know.
61484+
61485+Alexander Lyamin was our sysadmin, and helped to educate us in
61486+security issues. Moscow State University and IMT were very generous
61487+in the internet access they provided us, and in lots of other little
61488+ways that a generous institution can be.
61489+
61490+Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
61491+locking code, the block allocator, and finished the flushing code.
61492+His code is always crystal clean and well structured.
61493+
61494+Nikita Danilov wrote the core of the balancing code, the core of the
61495+plugins code, and the directory code. He worked a steady pace of long
61496+hours that produced a whole lot of well abstracted code. He is our
61497+senior computer scientist.
61498+
61499+Vladimir Demidov wrote the parser. Writing an in kernel parser is
61500+something very few persons have the skills for, and it is thanks to
61501+him that we can say that the parser is really not so big compared to
61502+various bits of our other code, and making a parser work in the kernel
61503+was not so complicated as everyone would imagine mainly because it was
61504+him doing it...
61505+
61506+Joshua McDonald wrote the transaction manager, and the flush code.
61507+The flush code unexpectedly turned out to be extremely hairy for reasons
61508+you can read about on our web page, and he did a great job on an
61509+extremely difficult task.
61510+
61511+Nina Reiser handled our accounting, government relations, and much
61512+more.
61513+
61514+Ramon Reiser developed our website.
61515+
61516+Beverly Palmer drew our graphics.
61517+
61518+Vitaly Fertman developed librepair, userspace plugins repair code, fsck
61519+and worked with Umka on developing libreiser4 and userspace plugins.
61520+
61521+Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
61522+userspace tools (reiser4progs).
61523+
61524+Oleg Drokin (aka Green) is the release manager who fixes everything.
61525+It is so nice to have someone like that on the team. He (plus Chris
61526+and Jeff) make it possible for the entire rest of the Namesys team to
61527+focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
61528+is just amazing to watch his talent for spotting bugs in action.
61529+
61530+Edward Shishkin wrote the cryptcompress file plugin (which manages files
61531+built of encrypted and/or compressed bodies) and other plugins related
61532+to transparent encryption and compression support.
61533diff -urN linux-2.6.24.orig/fs/reiser4/reiser4.h linux-2.6.24/fs/reiser4/reiser4.h
61534--- linux-2.6.24.orig/fs/reiser4/reiser4.h 1970-01-01 03:00:00.000000000 +0300
61535+++ linux-2.6.24/fs/reiser4/reiser4.h 2008-01-25 12:25:01.861363382 +0300
61536@@ -0,0 +1,270 @@
61537+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61538+ * reiser4/README */
61539+
61540+/* definitions of common constants used by reiser4 */
61541+
61542+#if !defined( __REISER4_H__ )
61543+#define __REISER4_H__
61544+
61545+#include <asm/param.h> /* for HZ */
61546+#include <linux/errno.h>
61547+#include <linux/types.h>
61548+#include <linux/fs.h>
61549+#include <linux/hardirq.h>
61550+#include <linux/sched.h>
61551+
61552+/*
61553+ * reiser4 compilation options.
61554+ */
61555+
61556+#if defined(CONFIG_REISER4_DEBUG)
61557+/* turn on assertion checks */
61558+#define REISER4_DEBUG (1)
61559+#else
61560+#define REISER4_DEBUG (0)
61561+#endif
61562+
61563+#if defined(CONFIG_ZLIB_INFLATE)
61564+/* turn on zlib */
61565+#define REISER4_ZLIB (1)
61566+#else
61567+#define REISER4_ZLIB (0)
61568+#endif
61569+
61570+#if defined(CONFIG_CRYPTO_SHA256)
61571+#define REISER4_SHA256 (1)
61572+#else
61573+#define REISER4_SHA256 (0)
61574+#endif
61575+
61576+/*
61577+ * Turn on large keys mode. In this mode (which is the default), a reiser4
61578+ * key has 4 8-byte components. In the old "small key" mode, it has 3 8-byte
61579+ * components. The additional component, referred to as "ordering", is used
61580+ * to order the items of which a given object is composed. As such, ordering
61581+ * is placed between locality and objectid. For a directory item, ordering
61582+ * contains an initial prefix of the file name this item is for. This sorts
61583+ * all directory items within a given directory lexicographically (but see
61584+ * fibration.[ch]). For file body and stat-data, ordering contains an initial
61585+ * prefix of the name the file was initially created with. In the common case
61586+ * (files with a single name) this orders file bodies and stat-datas in
61587+ * the same order as their respective directory entries, thus speeding up
61588+ * readdir.
61589+ *
61590+ * Note that the kernel can only mount a file system with the same key size
61591+ * as the one it is compiled for, so flipping this option may render your
61592+ * data inaccessible.
61593+ */
61594+#define REISER4_LARGE_KEY (1)
61595+/*#define REISER4_LARGE_KEY (0)*/
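To make the two layouts concrete, here is a minimal sketch (illustrative only; the actual key is stored as an opaque array of 8-byte elements in key.[ch], not as named fields):

/* Illustrative sketch of the two key layouts described above; the
 * field names are for exposition only. */
struct large_key_sketch {
	__u64 locality;		/* groups keys of related objects */
	__u64 ordering;		/* name-derived prefix; only in large-key mode */
	__u64 objectid;		/* object identifier */
	__u64 offset;		/* offset within the object */
};

struct small_key_sketch {
	__u64 locality;
	__u64 objectid;
	__u64 offset;
};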
61596+
61597+/*#define GUESS_EXISTS 1*/
61598+
61599+/*
61600+ * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
61601+ * option
61602+ */
61603+
61604+extern const char *REISER4_SUPER_MAGIC_STRING;
61605+extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
61606+ * beginning of device */
61607+
61608+/* here go tunable parameters that are not worth a special entry in the
61609+ kernel configuration */
61610+
61611+/* default number of slots in coord-by-key caches */
61612+#define CBK_CACHE_SLOTS (16)
61613+/* how many elementary tree operations to carry on to the next level */
61614+#define CARRIES_POOL_SIZE (5)
61615+/* size of pool of preallocated nodes for carry process. */
61616+#define NODES_LOCKED_POOL_SIZE (5)
61617+
61618+#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
61619+#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
61620+#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
61621+#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
61622+
61623+/* we are supporting reservation of disk space on uid basis */
61624+#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
61625+/* we are supporting reservation of disk space for groups */
61626+#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
61627+/* we are supporting reservation of disk space for root */
61628+#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
61629+/* we use rapid flush mode, see flush.c for comments. */
61630+#define REISER4_USE_RAPID_FLUSH (1)
61631+
61632+/*
61633+ * set this to 0 if you don't want to use wait-for-flush in ->writepage().
61634+ */
61635+#define REISER4_USE_ENTD (1)
61636+
61637+/* key allocation is Plan-A */
61638+#define REISER4_PLANA_KEY_ALLOCATION (1)
61639+/* key allocation follows good old 3.x scheme */
61640+#define REISER4_3_5_KEY_ALLOCATION (0)
61641+
61642+/* size of hash-table for znodes */
61643+#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
61644+
61645+/* number of buckets in lnode hash-table */
61646+#define LNODE_HTABLE_BUCKETS (1024)
61647+
61648+/* some ridiculously high maximal limit on the height of the znode tree.
61649+ This is used in the declaration of various per-level arrays and
61650+ to allocate a statistics-gathering array for per-level stats. */
61651+#define REISER4_MAX_ZTREE_HEIGHT (8)
61652+
61653+#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
61654+
61655+/* If an array contains fewer than REISER4_SEQ_SEARCH_BREAK elements, then
61656+ sequential search is on average faster than binary. This is because
61657+ of better optimization and because sequential search is more CPU
61658+ cache friendly. The original number (25) was found by experiments on a
61659+ dual AMD Athlon(tm), 1400MHz.
61660+
61661+ NOTE: testing in the kernel has shown that binary search is more effective
61662+ than implied by the results of the user-level benchmarking, probably because
61663+ in a node keys are separated by other data. So the value was adjusted after
61664+ a few tests. More thorough tuning is needed.
61665+*/
61666+#define REISER4_SEQ_SEARCH_BREAK (3)
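As an illustration (not part of the patch) of how a break-even constant like this is typically applied, a lookup can fall back to a linear scan below the threshold and use binary search above it:

/* Illustrative only: hybrid lookup keyed off REISER4_SEQ_SEARCH_BREAK.
 * Returns the index of @key in the sorted array @a of @n elements,
 * or -1 if it is absent. */
static int hybrid_search_sketch(const int *a, int n, int key)
{
	int i, left, right;

	if (n < REISER4_SEQ_SEARCH_BREAK) {
		/* tiny array: a sequential scan is cheaper on average */
		for (i = 0; i < n; ++i)
			if (a[i] == key)
				return i;
		return -1;
	}
	left = 0;
	right = n - 1;
	while (left <= right) {
		int mid = left + (right - left) / 2;

		if (a[mid] == key)
			return mid;
		if (a[mid] < key)
			left = mid + 1;
		else
			right = mid - 1;
	}
	return -1;
}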
61667+
61668+/* don't allow tree to be lower than this */
61669+#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL)
61670+
61671+/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
61672+ * available memory. */
61673+/* Default value of maximal atom size. Can be overwritten by the
61674+ tmgr.atom_max_size mount option. By default, infinity. */
61675+#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0))
61676+
61677+/* Default value of maximal atom age (in jiffies). After reaching this age
61678+ atom will be forced to commit, either synchronously or asynchronously. Can
61679+ be overwritten by tmgr.atom_max_age mount option. */
61680+#define REISER4_ATOM_MAX_AGE (600 * HZ)
61681+
61682+/* sleeping period for ktxnmgrd */
61683+#define REISER4_TXNMGR_TIMEOUT (5 * HZ)
61684+
61685+/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */
61686+#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
61687+
61688+/* start complaining after that many restarts in coord_by_key().
61689+
61690+ This either means incredibly heavy contention for this part of a tree, or
61691+ some corruption or bug.
61692+*/
61693+#define REISER4_CBK_ITERATIONS_LIMIT (100)
61694+
61695+/* return -EIO after that many iterations in coord_by_key().
61696+
61697+ I have witnessed more than 800 iterations (in 30 thread test) before cbk
61698+ finished. --nikita
61699+*/
61700+#define REISER4_MAX_CBK_ITERATIONS 500000
61701+
61702+/* put a per-inode limit on the maximal number of directory entries with
61703+ identical keys in a hashed directory.
61704+
61705+ Disable this until inheritance interfaces stabilize: we need some way to
61706+ set a per-directory limit.
61707+*/
61708+#define REISER4_USE_COLLISION_LIMIT (0)
61709+
61710+/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level
61711+ blocks, it will force them to be relocated. */
61712+#define FLUSH_RELOCATE_THRESHOLD 64
61713+/* If flush can find a block allocation closer than FLUSH_RELOCATE_DISTANCE
61714+ blocks from the preceder, it will relocate to that position. */
61715+#define FLUSH_RELOCATE_DISTANCE 64
61716+
61717+/* If we have written this many blocks or more before encountering a busy
61718+ jnode in the flush list, abort flushing in the hope that next time we are
61719+ called this jnode will already be clean, and we will save some seeks. */
61720+#define FLUSH_WRITTEN_THRESHOLD 50
61721+
61722+/* The maximum number of nodes to scan left on a level during flush. */
61723+#define FLUSH_SCAN_MAXNODES 10000
61724+
61725+/* per-atom limit of flushers */
61726+#define ATOM_MAX_FLUSHERS (1)
61727+
61728+/* default tracing buffer size */
61729+#define REISER4_TRACE_BUF_SIZE (1 << 15)
61730+
61731+/* what size units of IO we would like cp, etc., to use when writing to
61732+ reiser4. In bytes.
61733+
61734+ Can be overwritten by optimal_io_size mount option.
61735+*/
61736+#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
61737+
61738+/* see comments in inode.c:oid_to_uino() */
61739+#define REISER4_UINO_SHIFT (1 << 30)
61740+
61741+/* Mark function argument as unused to avoid compiler warnings. */
61742+#define UNUSED_ARG __attribute__((unused))
61743+
61744+#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
61745+#define NONNULL __attribute__((nonnull))
61746+#else
61747+#define NONNULL
61748+#endif
61749+
61750+/* master super block offset in bytes.*/
61751+#define REISER4_MASTER_OFFSET 65536
61752+
61753+/* size of VFS block */
61754+#define VFS_BLKSIZE 512
61755+/* number of bits in size of VFS block (512==2^9) */
61756+#define VFS_BLKSIZE_BITS 9
61757+
61758+#define REISER4_I reiser4_inode_data
61759+
61760+/* implication */
61761+#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) )
61762+/* logical equivalence */
61763+#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) )
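These two macros are used throughout the assertions in this code base; for illustration (node_is_loaded()/node_is_pinned() and the fields are invented names), they read like this:

/* Illustrative only: real callers use ergo()/equi() inside assert(). */
static void example_check(znode *node)
{
	/* "if the node is loaded, it must have data attached" */
	assert("doc-1", ergo(node_is_loaded(node), node->data != NULL));
	/* "pinned if and only if the reference count is positive" */
	assert("doc-2", equi(node_is_pinned(node), node->refcount > 0));
}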
61764+
61765+#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
61766+
61767+#define NOT_YET (0)
61768+
61769+/** Reiser4 specific error codes **/
61770+
61771+#define REISER4_ERROR_CODE_BASE 10000
61772+
61773+/* Neighbor is not available (side neighbor or parent) */
61774+#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE)
61775+
61776+/* Node was not found in cache */
61777+#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
61778+
61779+/* node does not have enough free space to complete a balancing operation */
61780+#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2)
61781+
61782+/* repeat operation */
61783+#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3)
61784+
61785+/* deadlock happens */
61786+#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4)
61787+
61788+/* operation cannot be performed, because it would block and non-blocking mode
61789+ * was requested. */
61790+#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5)
61791+
61792+/* wait some event (depends on context), then repeat */
61793+#define E_WAIT (REISER4_ERROR_CODE_BASE + 6)
61794+
61795+#endif /* __REISER4_H__ */
61796+
61797+
61798+/* Make Linus happy.
61799+ Local variables:
61800+ c-indentation-style: "K&R"
61801+ mode-name: "LC"
61802+ c-basic-offset: 8
61803+ tab-width: 8
61804+ fill-column: 120
61805+ End:
61806+*/
61807diff -urN linux-2.6.24.orig/fs/reiser4/safe_link.c linux-2.6.24/fs/reiser4/safe_link.c
61808--- linux-2.6.24.orig/fs/reiser4/safe_link.c 1970-01-01 03:00:00.000000000 +0300
61809+++ linux-2.6.24/fs/reiser4/safe_link.c 2008-01-25 11:39:07.076243753 +0300
61810@@ -0,0 +1,352 @@
61811+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
61812+ * reiser4/README */
61813+
61814+/* Safe-links. */
61815+
61816+/*
61817+ * Safe-links are used to maintain file system consistency during operations
61818+ * that spawn multiple transactions. For example:
61819+ *
61820+ * 1. Unlink. UNIX supports "open-but-unlinked" files, that is, files
61821+ * without user-visible names in the file system, but still opened by some
61822+ * active process. What happens here is that unlink proper (i.e., removal
61823+ * of the last file name) and file deletion (truncate of the file body to
61824+ * zero and deletion of stat-data, which happens when the last file
61825+ * descriptor is closed) may belong to different transactions T1 and T2. If
61826+ * a crash happens after T1 commits, but before T2 commits, the on-disk
61827+ * file system has a file without a name, that is, a disk space leak.
61828+ *
61829+ * 2. Truncate. Truncate of a large file may spawn multiple transactions. If
61830+ * the system crashes while the truncate is in progress, the file is left
61831+ * partially truncated, which violates the "atomicity guarantees" of
61832+ * reiser4, viz. that every system call is atomic.
61833+ *
61834+ * Safe-links address both cases above. Basically, a safe-link is a way to
61835+ * post some operation to be executed during the commit of some transaction
61836+ * other than the current one. (Another way to look at the safe-link is to
61837+ * interpret it as logical logging.)
61838+ *
61839+ * Specifically, at the beginning of unlink a safe-link is inserted into the
61840+ * tree. This safe-link is normally removed by the file deletion code (during
61841+ * transaction T2 in the above terms). Truncate also inserts a safe-link that
61842+ * is normally removed when the truncate operation is finished.
61843+ *
61844+ * This means that in the case of a "clean umount" there are no safe-links in
61845+ * the tree. If safe-links are observed during mount, it means that (a) the
61846+ * system was terminated abnormally, and (b) the safe-links correspond to the
61847+ * "pending" (i.e., not finished) operations that were in progress during
61848+ * system termination. Each safe-link records enough information to complete
61849+ * the corresponding operation, and mount simply "replays" them (hence the
61850+ * analogy with logical logging).
61851+ *
61852+ * Safe-links are implemented as blackbox items (see
61853+ * plugin/item/blackbox.[ch]).
61854+ *
61855+ * For reference: ext3 also has a similar mechanism; it's called an "orphan
61856+ * list" there.
61857+ */
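As a rough sketch of where replay fits (illustrative only; the real call site and its error handling are elided, and example_finish_mount is an invented name), a mount path would finish with something like:

/* Illustrative mount-time sequence: once the journal has been replayed
 * and the tree is usable, pending safe-links are re-executed. */
static int example_finish_mount(struct super_block *super)
{
	/* ... journal replay and superblock setup happen before this ... */
	return process_safelinks(super);	/* redo pending unlinks/truncates */
}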
61858+
61859+#include "safe_link.h"
61860+#include "debug.h"
61861+#include "inode.h"
61862+
61863+#include "plugin/item/blackbox.h"
61864+
61865+#include <linux/fs.h>
61866+
61867+/*
61868+ * On-disk format of safe-link.
61869+ */
61870+typedef struct safelink {
61871+ reiser4_key sdkey; /* key of stat-data for the file safe-link is
61872+ * for */
61873+ d64 size; /* size to which file should be truncated */
61874+} safelink_t;
61875+
61876+/*
61877+ * locality where safe-link items are stored. Next to the objectid of the
61878+ * root directory.
61879+ */
61880+static oid_t safe_link_locality(reiser4_tree * tree)
61881+{
61882+ return get_key_objectid(get_super_private(tree->super)->df_plug->
61883+ root_dir_key(tree->super)) + 1;
61884+}
61885+
61886+/*
61887+ Construct a key for the safe-link. Key has the following format:
61888+
61889+| 60 | 4 | 64 | 4 | 60 | 64 |
61890++---------------+---+------------------+---+---------------+------------------+
61891+| locality | 0 | 0 | 0 | objectid | link type |
61892++---------------+---+------------------+---+---------------+------------------+
61893+| | | | |
61894+| 8 bytes | 8 bytes | 8 bytes | 8 bytes |
61895+
61896+ This is the large keys format. In the small keys format, the second 8-byte
61897+ chunk is omitted. Locality is a constant returned by safe_link_locality().
61898+ objectid is the oid of the file on which the operation protected by this
61899+ safe-link is performed. link-type is used to distinguish safe-links for
61900+ different operations.
61901+
61902+ */
61903+static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
61904+ reiser4_safe_link_t link, reiser4_key * key)
61905+{
61906+ reiser4_key_init(key);
61907+ set_key_locality(key, safe_link_locality(tree));
61908+ set_key_objectid(key, oid);
61909+ set_key_offset(key, link);
61910+ return key;
61911+}
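For example (illustrative only; 42 is an arbitrary oid and example_truncate_link_key is an invented name), the key for a truncate safe-link would be built as:

/* Illustrative use of build_link_key(). */
static void example_truncate_link_key(reiser4_tree *tree)
{
	reiser4_key key;

	build_link_key(tree, (oid_t)42, SAFE_TRUNCATE, &key);
	/* now: locality == root directory objectid + 1, objectid == 42,
	 * offset == SAFE_TRUNCATE (the link type) */
}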
61912+
61913+/*
61914+ * how much disk space is necessary to insert and remove (in the
61915+ * error-handling path) safe-link.
61916+ */
61917+static __u64 safe_link_tograb(reiser4_tree * tree)
61918+{
61919+ return
61920+ /* insert safe link */
61921+ estimate_one_insert_item(tree) +
61922+ /* remove safe link */
61923+ estimate_one_item_removal(tree) +
61924+ /* drill to the leaf level during insertion */
61925+ 1 + estimate_one_insert_item(tree) +
61926+ /*
61927+ * possible update of an existing safe-link. Actually, if
61928+ * the safe-link already existed (we failed to remove it), then
61929+ * no insertion is necessary, so this term is already "covered",
61930+ * but for simplicity let's leave it.
61931+ */
61932+ 1;
61933+}
61934+
61935+/*
61936+ * grab enough disk space to insert and remove (in the error-handling path)
61937+ * safe-link.
61938+ */
61939+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
61940+{
61941+ int result;
61942+
61943+ grab_space_enable();
61944+ /* The sbinfo->delete_mutex can be taken here.
61945+ * safe_link_release() should be called before leaving reiser4
61946+ * context. */
61947+ result =
61948+ reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
61949+ grab_space_enable();
61950+ return result;
61951+}
61952+
61953+/*
61954+ * release unused disk space reserved by safe_link_grab().
61955+ */
61956+void safe_link_release(reiser4_tree * tree)
61957+{
61958+ reiser4_release_reserved(tree->super);
61959+}
61960+
61961+/*
61962+ * insert into tree safe-link for operation @link on inode @inode.
61963+ */
61964+int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
61965+{
61966+ reiser4_key key;
61967+ safelink_t sl;
61968+ int length;
61969+ int result;
61970+ reiser4_tree *tree;
61971+
61972+ build_sd_key(inode, &sl.sdkey);
61973+ length = sizeof sl.sdkey;
61974+
61975+ if (link == SAFE_TRUNCATE) {
61976+ /*
61977+ * for truncate we have to store final file length also,
61978+ * expand item.
61979+ */
61980+ length += sizeof(sl.size);
61981+ put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
61982+ }
61983+ tree = reiser4_tree_by_inode(inode);
61984+ build_link_key(tree, get_inode_oid(inode), link, &key);
61985+
61986+ result = store_black_box(tree, &key, &sl, length);
61987+ if (result == -EEXIST)
61988+ result = update_black_box(tree, &key, &sl, length);
61989+ return result;
61990+}
61991+
61992+/*
61993+ * remove safe-link corresponding to the operation @link on inode @inode from
61994+ * the tree.
61995+ */
61996+int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
61997+{
61998+ reiser4_key key;
61999+
62000+ return kill_black_box(tree, build_link_key(tree, oid, link, &key));
62001+}
62002+
62003+/*
62004+ * in-memory structure to keep information extracted from safe-link. This is
62005+ * used to iterate over all safe-links.
62006+ */
62007+struct safe_link_context {
62008+ reiser4_tree *tree; /* internal tree */
62009+ reiser4_key key; /* safe-link key */
62010+ reiser4_key sdkey; /* key of object stat-data */
62011+ reiser4_safe_link_t link; /* safe-link type */
62012+ oid_t oid; /* object oid */
62013+ __u64 size; /* final size for truncate */
62014+};
62015+
62016+/*
62017+ * start iterating over all safe-links.
62018+ */
62019+static void safe_link_iter_begin(reiser4_tree * tree,
62020+ struct safe_link_context * ctx)
62021+{
62022+ ctx->tree = tree;
62023+ reiser4_key_init(&ctx->key);
62024+ set_key_locality(&ctx->key, safe_link_locality(tree));
62025+ set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));
62026+ set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));
62027+}
62028+
62029+/*
62030+ * return next safe-link.
62031+ */
62032+static int safe_link_iter_next(struct safe_link_context * ctx)
62033+{
62034+ int result;
62035+ safelink_t sl;
62036+
62037+ result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
62038+ if (result == 0) {
62039+ ctx->oid = get_key_objectid(&ctx->key);
62040+ ctx->link = get_key_offset(&ctx->key);
62041+ ctx->sdkey = sl.sdkey;
62042+ if (ctx->link == SAFE_TRUNCATE)
62043+ ctx->size = le64_to_cpu(get_unaligned(&sl.size));
62044+ }
62045+ return result;
62046+}
62047+
62048+/*
62049+ * check whether there are any more safe-links left in the tree.
62050+ */
62051+static int safe_link_iter_finished(struct safe_link_context * ctx)
62052+{
62053+ return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
62054+}
62055+
62056+/*
62057+ * finish safe-link iteration.
62058+ */
62059+static void safe_link_iter_end(struct safe_link_context * ctx)
62060+{
62061+ /* nothing special */
62062+}
62063+
62064+/*
62065+ * process single safe-link.
62066+ */
62067+static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
62068+ reiser4_key * sdkey, oid_t oid, __u64 size)
62069+{
62070+ struct inode *inode;
62071+ int result;
62072+
62073+ /*
62074+ * obtain object inode by reiser4_iget(), then call object plugin
62075+ * ->safelink() method to do actual work, then delete safe-link on
62076+ * success.
62077+ */
62078+ inode = reiser4_iget(super, sdkey, 1);
62079+ if (!IS_ERR(inode)) {
62080+ file_plugin *fplug;
62081+
62082+ fplug = inode_file_plugin(inode);
62083+ assert("nikita-3428", fplug != NULL);
62084+ assert("", oid == get_inode_oid(inode));
62085+ if (fplug->safelink != NULL) {
62086+ /* reiser4_txn_restart_current is not necessary because
62087+ * mounting is single-threaded. However, without it
62088+ * deadlock detection code will complain (see
62089+ * nikita-3361). */
62090+ reiser4_txn_restart_current();
62091+ result = fplug->safelink(inode, link, size);
62092+ } else {
62093+ warning("nikita-3430",
62094+ "Cannot handle safelink for %lli",
62095+ (unsigned long long)oid);
62096+ reiser4_print_key("key", sdkey);
62097+ result = 0;
62098+ }
62099+ if (result != 0) {
62100+ warning("nikita-3431",
62101+ "Error processing safelink for %lli: %i",
62102+ (unsigned long long)oid, result);
62103+ }
62104+ reiser4_iget_complete(inode);
62105+ iput(inode);
62106+ if (result == 0) {
62107+ result = safe_link_grab(reiser4_get_tree(super), BA_CAN_COMMIT);
62108+ if (result == 0)
62109+ result =
62110+ safe_link_del(reiser4_get_tree(super), oid, link);
62111+ safe_link_release(reiser4_get_tree(super));
62112+ /*
62113+ * restart the transaction: if there was a large number of
62114+ * safe-links, their processing may fail to fit into a
62115+ * single transaction.
62116+ */
62117+ if (result == 0)
62118+ reiser4_txn_restart_current();
62119+ }
62120+ } else
62121+ result = PTR_ERR(inode);
62122+ return result;
62123+}
62124+
62125+/*
62126+ * iterate over all safe-links in the file-system processing them one by one.
62127+ */
62128+int process_safelinks(struct super_block *super)
62129+{
62130+ struct safe_link_context ctx;
62131+ int result;
62132+
62133+ if (rofs_super(super))
62134+ /* do nothing on the read-only file system */
62135+ return 0;
62136+ safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
62137+ result = 0;
62138+ do {
62139+ result = safe_link_iter_next(&ctx);
62140+ if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
62141+ result = 0;
62142+ break;
62143+ }
62144+ if (result == 0)
62145+ result = process_safelink(super, ctx.link,
62146+ &ctx.sdkey, ctx.oid,
62147+ ctx.size);
62148+ } while (result == 0);
62149+ safe_link_iter_end(&ctx);
62150+ return result;
62151+}
62152+
62153+/* Make Linus happy.
62154+ Local variables:
62155+ c-indentation-style: "K&R"
62156+ mode-name: "LC"
62157+ c-basic-offset: 8
62158+ tab-width: 8
62159+ fill-column: 120
62160+ scroll-step: 1
62161+ End:
62162+*/
62163diff -urN linux-2.6.24.orig/fs/reiser4/safe_link.h linux-2.6.24/fs/reiser4/safe_link.h
62164--- linux-2.6.24.orig/fs/reiser4/safe_link.h 1970-01-01 03:00:00.000000000 +0300
62165+++ linux-2.6.24/fs/reiser4/safe_link.h 2008-01-25 11:39:07.080244783 +0300
62166@@ -0,0 +1,29 @@
62167+/* Copyright 2003 by Hans Reiser, licensing governed by
62168+ * reiser4/README */
62169+
62170+/* Safe-links. See safe_link.c for details. */
62171+
62172+#if !defined( __FS_SAFE_LINK_H__ )
62173+#define __FS_SAFE_LINK_H__
62174+
62175+#include "tree.h"
62176+
62177+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
62178+void safe_link_release(reiser4_tree * tree);
62179+int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
62180+int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
62181+
62182+int process_safelinks(struct super_block *super);
62183+
62184+/* __FS_SAFE_LINK_H__ */
62185+#endif
62186+
62187+/* Make Linus happy.
62188+ Local variables:
62189+ c-indentation-style: "K&R"
62190+ mode-name: "LC"
62191+ c-basic-offset: 8
62192+ tab-width: 8
62193+ fill-column: 120
62194+ End:
62195+*/
62196diff -urN linux-2.6.24.orig/fs/reiser4/seal.c linux-2.6.24/fs/reiser4/seal.c
62197--- linux-2.6.24.orig/fs/reiser4/seal.c 1970-01-01 03:00:00.000000000 +0300
62198+++ linux-2.6.24/fs/reiser4/seal.c 2008-01-25 11:39:07.080244783 +0300
62199@@ -0,0 +1,218 @@
62200+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62201+/* Seals implementation. */
62202+/* Seals are "weak" tree pointers. They are analogous to tree coords in that
62203+ they allow tree traversal to be bypassed. But normal usage of coords implies
62204+ that the node pointed to by a coord is locked, whereas seals don't keep a
62205+ lock (or even a reference) to the znode. Instead, each znode contains a
62206+ version number, increased on each znode modification. This version number is
62207+ copied into a seal when the seal is created. Later, one can "validate" a
62208+ seal by calling reiser4_seal_validate(). If the znode is in cache and its
62209+ version number is still the same, the seal is "pristine" and the coord
62210+ associated with it can be re-used immediately.
62211+
62212+ If, on the other hand, the znode is out of cache, or it is obviously a
62213+ different one from the znode the seal was initially attached to (for
62214+ example, it is on a different level, or is being removed from the tree), the
62215+ seal is irreparably invalid ("burned") and tree traversal has to be repeated.
62216+
62217+ Otherwise, there is some hope that, although the znode was modified (and the
62218+ seal was "broken" as a result), the key attached to the seal is still in the
62219+ node. This is checked by first comparing this key with the delimiting keys
62220+ of the node and, if the key is ok, doing an intra-node lookup.
62221+
62222+ The znode version is maintained in the following way:
62223+
62224+ there is a reiser4_tree.znode_epoch counter. Whenever a new znode is
62225+ created, znode_epoch is incremented and its new value is stored in the
62226+ ->version field of the new znode. Whenever a znode is dirtied (which means
62227+ it was probably modified), znode_epoch is also incremented and its new value
62228+ is stored in znode->version. This is done because just incrementing
62229+ znode->version on each update is not enough: it may so happen that a znode
62230+ gets deleted, a new znode is allocated for the same disk block and gets the
62231+ same version counter, tricking the seal code into a false positive.
62232+*/
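A minimal usage sketch (illustrative only; locking details are elided and example_lookup_with_seal is an invented name): arm a seal after a successful lookup, and try to short-circuit the next lookup with it.

/* Illustrative seal lifecycle, assuming a read lookup of a leaf item. */
static int example_lookup_with_seal(seal_t *seal, coord_t *coord,
				    const reiser4_key *key, lock_handle *lh)
{
	int result;

	if (reiser4_seal_is_set(seal)) {
		result = reiser4_seal_validate(seal, coord, key, lh,
					       ZNODE_READ_LOCK,
					       ZNODE_LOCK_LOPRI);
		if (result == 0)
			return 0;	/* seal held: reuse @coord, skip traversal */
		/* typically -E_REPEAT: fall back to a full traversal */
	}
	result = coord_by_key(current_tree, key, coord, lh, ZNODE_READ_LOCK,
			      FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, 0, NULL);
	if (result == CBK_COORD_FOUND)
		reiser4_seal_init(seal, coord, key);	/* re-arm for next time */
	return result;
}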
62233+
62234+#include "forward.h"
62235+#include "debug.h"
62236+#include "key.h"
62237+#include "coord.h"
62238+#include "seal.h"
62239+#include "plugin/item/item.h"
62240+#include "plugin/node/node.h"
62241+#include "jnode.h"
62242+#include "znode.h"
62243+#include "super.h"
62244+
62245+static znode *seal_node(const seal_t * seal);
62246+static int seal_matches(const seal_t * seal, znode * node);
62247+
62248+/* initialise seal. This can be called several times on the same seal. @coord
62249+ and @key can be NULL. */
62250+void reiser4_seal_init(seal_t * seal /* seal to initialise */ ,
62251+ const coord_t * coord /* coord @seal will be
62252+ * attached to */ ,
62253+ const reiser4_key * key UNUSED_ARG /* key @seal will be
62254+ * attached to */ )
62255+{
62256+ assert("nikita-1886", seal != NULL);
62257+ memset(seal, 0, sizeof *seal);
62258+ if (coord != NULL) {
62259+ znode *node;
62260+
62261+ node = coord->node;
62262+ assert("nikita-1987", node != NULL);
62263+ spin_lock_znode(node);
62264+ seal->version = node->version;
62265+ assert("nikita-1988", seal->version != 0);
62266+ seal->block = *znode_get_block(node);
62267+#if REISER4_DEBUG
62268+ seal->coord1 = *coord;
62269+ if (key != NULL)
62270+ seal->key = *key;
62271+#endif
62272+ spin_unlock_znode(node);
62273+ }
62274+}
62275+
62276+/* finish with seal */
62277+void reiser4_seal_done(seal_t * seal /* seal to clear */ )
62278+{
62279+ assert("nikita-1887", seal != NULL);
62280+ seal->version = 0;
62281+}
62282+
62283+/* true if seal was initialised */
62284+int reiser4_seal_is_set(const seal_t * seal /* seal to query */ )
62285+{
62286+ assert("nikita-1890", seal != NULL);
62287+ return seal->version != 0;
62288+}
62289+
62290+#if REISER4_DEBUG
62291+/* helper function for reiser4_seal_validate(). It checks that item at @coord
62292+ * has expected key. This is to detect cases where node was modified but wasn't
62293+ * marked dirty. */
62294+static inline int check_seal_match(const coord_t * coord /* coord to check */ ,
62295+ const reiser4_key * k /* expected key */ )
62296+{
62297+ reiser4_key ukey;
62298+
62299+ return (coord->between != AT_UNIT) ||
62300+ /* FIXME-VS: we only can compare keys for items whose units
62301+ represent exactly one key */
62302+ ((coord_is_existing_unit(coord))
62303+ && (item_is_extent(coord)
62304+ || keyeq(k, unit_key_by_coord(coord, &ukey))))
62305+ || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
62306+ && keyge(k, unit_key_by_coord(coord, &ukey)));
62307+}
62308+#endif
62309+
62310+/* this is used by reiser4_seal_validate. It accepts the return value of
62311+ * longterm_lock_znode and returns 1 if it can be interpreted as a seal
62312+ * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
62313+ * reiser4_seal_validate returns -E_REPEAT and the caller will redo the tree
62314+ * search. We cannot do this in longterm_lock_znode(), because sometimes we
62315+ * want to distinguish between -EINVAL and -E_REPEAT. */
62316+static int should_repeat(int return_code)
62317+{
62318+ return return_code == -EINVAL;
62319+}
62320+
62321+/* (re-)validate seal.
62322+
62323+ Checks whether the seal is pristine, and tries to revalidate it if possible.
62324+
62325+ If seal was burned, or broken irreparably, return -E_REPEAT.
62326+
62327+ NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if the key
62328+ we are looking for is in the range of keys covered by the sealed node, but
62329+ the item wasn't found by the node's ->lookup() method. The alternative is to
62330+ return -ENOENT in this case, but this would complicate the callers' logic.
62331+
62332+*/
62333+int reiser4_seal_validate(seal_t * seal /* seal to validate */,
62334+ coord_t * coord /* coord to validate against */,
62335+ const reiser4_key * key /* key to validate against */,
62336+ lock_handle * lh /* resulting lock handle */,
62337+ znode_lock_mode mode /* lock node */,
62338+ znode_lock_request request /* locking priority */)
62339+{
62340+ znode *node;
62341+ int result;
62342+
62343+ assert("nikita-1889", seal != NULL);
62344+ assert("nikita-1881", reiser4_seal_is_set(seal));
62345+ assert("nikita-1882", key != NULL);
62346+ assert("nikita-1883", coord != NULL);
62347+ assert("nikita-1884", lh != NULL);
62348+ assert("nikita-1885", keyeq(&seal->key, key));
62349+ assert("nikita-1989", coords_equal(&seal->coord1, coord));
62350+
62351+ /* obtain znode by block number */
62352+ node = seal_node(seal);
62353+ if (node != NULL) {
62354+ /* znode was in cache, lock it */
62355+ result = longterm_lock_znode(lh, node, mode, request);
62356+ zput(node);
62357+ if (result == 0) {
62358+ if (seal_matches(seal, node)) {
62359+ /* if seal version and znode version
62360+ coincide */
62361+ ON_DEBUG(coord_update_v(coord));
62362+ assert("nikita-1990",
62363+ node == seal->coord1.node);
62364+ assert("nikita-1898",
62365+ WITH_DATA_RET(coord->node, 1,
62366+ check_seal_match(coord,
62367+ key)));
62368+ } else
62369+ result = RETERR(-E_REPEAT);
62370+ }
62371+ if (result != 0) {
62372+ if (should_repeat(result))
62373+ result = RETERR(-E_REPEAT);
62374+ /* unlock node on failure */
62375+ done_lh(lh);
62376+ }
62377+ } else {
62378+ /* znode wasn't in cache */
62379+ result = RETERR(-E_REPEAT);
62380+ }
62381+ return result;
62382+}
62383+
62384+/* helper functions */
62385+
62386+/* obtain reference to znode seal points to, if in cache */
62387+static znode *seal_node(const seal_t * seal /* seal to query */ )
62388+{
62389+ assert("nikita-1891", seal != NULL);
62390+ return zlook(current_tree, &seal->block);
62391+}
62392+
62393+/* true if @seal version and @node version coincide */
62394+static int seal_matches(const seal_t * seal /* seal to check */ ,
62395+ znode * node /* node to check */ )
62396+{
62397+ int result;
62398+
62399+ assert("nikita-1991", seal != NULL);
62400+ assert("nikita-1993", node != NULL);
62401+
62402+ spin_lock_znode(node);
62403+ result = (seal->version == node->version);
62404+ spin_unlock_znode(node);
62405+ return result;
62406+}
62407+
62408+/* Make Linus happy.
62409+ Local variables:
62410+ c-indentation-style: "K&R"
62411+ mode-name: "LC"
62412+ c-basic-offset: 8
62413+ tab-width: 8
62414+ fill-column: 120
62415+ scroll-step: 1
62416+ End:
62417+*/
62418diff -urN linux-2.6.24.orig/fs/reiser4/seal.h linux-2.6.24/fs/reiser4/seal.h
62419--- linux-2.6.24.orig/fs/reiser4/seal.h 1970-01-01 03:00:00.000000000 +0300
62420+++ linux-2.6.24/fs/reiser4/seal.h 2008-01-25 11:39:07.080244783 +0300
62421@@ -0,0 +1,49 @@
62422+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62423+
62424+/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
62425+
62426+#ifndef __SEAL_H__
62427+#define __SEAL_H__
62428+
62429+#include "forward.h"
62430+#include "debug.h"
62431+#include "dformat.h"
62432+#include "key.h"
62433+#include "coord.h"
62434+
62435+/* for __u?? types */
62436+/*#include <linux/types.h>*/
62437+
62438+/* seal. See comment at the top of seal.c */
62439+typedef struct seal_s {
62440+ /* version of znode recorder at the time of seal creation */
62441+ __u64 version;
62442+ /* block number of znode attached to this seal */
62443+ reiser4_block_nr block;
62444+#if REISER4_DEBUG
62445+ /* coord this seal is attached to. For debugging. */
62446+ coord_t coord1;
62447+ /* key this seal is attached to. For debugging. */
62448+ reiser4_key key;
62449+#endif
62450+} seal_t;
62451+
62452+extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *);
62453+extern void reiser4_seal_done(seal_t *);
62454+extern int reiser4_seal_is_set(const seal_t *);
62455+extern int reiser4_seal_validate(seal_t *, coord_t *,
62456+ const reiser4_key *, lock_handle *,
62457+ znode_lock_mode mode, znode_lock_request request);
62458+
62459+/* __SEAL_H__ */
62460+#endif
62461+
62462+/* Make Linus happy.
62463+ Local variables:
62464+ c-indentation-style: "K&R"
62465+ mode-name: "LC"
62466+ c-basic-offset: 8
62467+ tab-width: 8
62468+ fill-column: 120
62469+ End:
62470+*/
62471diff -urN linux-2.6.24.orig/fs/reiser4/search.c linux-2.6.24/fs/reiser4/search.c
62472--- linux-2.6.24.orig/fs/reiser4/search.c 1970-01-01 03:00:00.000000000 +0300
62473+++ linux-2.6.24/fs/reiser4/search.c 2008-01-25 11:39:07.084245813 +0300
62474@@ -0,0 +1,1611 @@
62475+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
62476+ * reiser4/README */
62477+
62478+#include "forward.h"
62479+#include "debug.h"
62480+#include "dformat.h"
62481+#include "key.h"
62482+#include "coord.h"
62483+#include "seal.h"
62484+#include "plugin/item/item.h"
62485+#include "plugin/node/node.h"
62486+#include "plugin/plugin.h"
62487+#include "jnode.h"
62488+#include "znode.h"
62489+#include "block_alloc.h"
62490+#include "tree_walk.h"
62491+#include "tree.h"
62492+#include "reiser4.h"
62493+#include "super.h"
62494+#include "inode.h"
62495+
62496+#include <linux/slab.h>
62497+
62498+static const char *bias_name(lookup_bias bias);
62499+
62500+/* tree searching algorithm, intranode searching algorithms are in
62501+ plugin/node/ */
62502+
62503+/* tree lookup cache
62504+ *
62505+ * The coord-by-key cache consists of a small list of recently accessed nodes
62506+ * maintained according to the LRU discipline. Before doing a real top-down
62507+ * tree traversal this cache is scanned for nodes that can contain the
62508+ * requested key.
62509+ *
62510+ * The efficiency of the coord cache depends heavily on locality of reference
62511+ * for tree accesses. Our user-level simulations show reasonably good hit
62512+ * ratios for the coord cache under most loads so far.
62513+ */
62514+
62515+/* Initialise coord cache slot */
62516+static void cbk_cache_init_slot(cbk_cache_slot *slot)
62517+{
62518+ assert("nikita-345", slot != NULL);
62519+
62520+ INIT_LIST_HEAD(&slot->lru);
62521+ slot->node = NULL;
62522+}
62523+
62524+/* Initialize coord cache */
62525+int cbk_cache_init(cbk_cache *cache /* cache to init */ )
62526+{
62527+ int i;
62528+
62529+ assert("nikita-346", cache != NULL);
62530+
62531+ cache->slot =
62532+ kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots,
62533+ reiser4_ctx_gfp_mask_get());
62534+ if (cache->slot == NULL)
62535+ return RETERR(-ENOMEM);
62536+
62537+ INIT_LIST_HEAD(&cache->lru);
62538+ for (i = 0; i < cache->nr_slots; ++i) {
62539+ cbk_cache_init_slot(cache->slot + i);
62540+ list_add_tail(&((cache->slot + i)->lru), &cache->lru);
62541+ }
62542+ rwlock_init(&cache->guard);
62543+ return 0;
62544+}
62545+
62546+/* free cbk cache data */
62547+void cbk_cache_done(cbk_cache * cache /* cache to release */ )
62548+{
62549+ assert("nikita-2493", cache != NULL);
62550+ if (cache->slot != NULL) {
62551+ kfree(cache->slot);
62552+ cache->slot = NULL;
62553+ }
62554+}
62555+
62556+/* macro to iterate over all cbk cache slots */
62557+#define for_all_slots(cache, slot) \
62558+ for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
62559+ &(cache)->lru != &(slot)->lru; \
62560+ (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
62561+
62562+#if REISER4_DEBUG
62563+/* this function assures that [cbk-cache-invariant] invariant holds */
62564+static int cbk_cache_invariant(const cbk_cache *cache)
62565+{
62566+ cbk_cache_slot *slot;
62567+ int result;
62568+ int unused;
62569+
62570+ if (cache->nr_slots == 0)
62571+ return 1;
62572+
62573+ assert("nikita-2469", cache != NULL);
62574+ unused = 0;
62575+ result = 1;
62576+ read_lock(&((cbk_cache *)cache)->guard);
62577+ for_all_slots(cache, slot) {
62578+ /* in LRU first go all `used' slots followed by `unused' */
62579+ if (unused && (slot->node != NULL))
62580+ result = 0;
62581+ if (slot->node == NULL)
62582+ unused = 1;
62583+ else {
62584+ cbk_cache_slot *scan;
62585+
62586+ /* all cached nodes are different */
62587+ scan = slot;
62588+ while (result) {
62589+ scan = list_entry(scan->lru.next, cbk_cache_slot, lru);
62590+ if (&cache->lru == &scan->lru)
62591+ break;
62592+ if (slot->node == scan->node)
62593+ result = 0;
62594+ }
62595+ }
62596+ if (!result)
62597+ break;
62598+ }
62599+ read_unlock(&((cbk_cache *)cache)->guard);
62600+ return result;
62601+}
62602+
62603+#endif
62604+
62605+/* Remove references, if any, to @node from coord cache */
62606+void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
62607+ reiser4_tree * tree /* tree to remove node from */ )
62608+{
62609+ cbk_cache_slot *slot;
62610+ cbk_cache *cache;
62611+ int i;
62612+
62613+ assert("nikita-350", node != NULL);
62614+ assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
62615+
62616+ cache = &tree->cbk_cache;
62617+ assert("nikita-2470", cbk_cache_invariant(cache));
62618+
62619+ write_lock(&(cache->guard));
62620+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
62621+ if (slot->node == node) {
62622+ list_move_tail(&slot->lru, &cache->lru);
62623+ slot->node = NULL;
62624+ break;
62625+ }
62626+ }
62627+ write_unlock(&(cache->guard));
62628+ assert("nikita-2471", cbk_cache_invariant(cache));
62629+}
62630+
62631+/* add information about "node" to the cbk-cache in the "tree". This
62632+ can actually be an update of an existing slot in the cache. */
62633+static void cbk_cache_add(const znode *node /* node to add to the cache */ )
62634+{
62635+ cbk_cache *cache;
62636+ cbk_cache_slot *slot;
62637+ int i;
62638+
62639+ assert("nikita-352", node != NULL);
62640+
62641+ cache = &znode_get_tree(node)->cbk_cache;
62642+ assert("nikita-2472", cbk_cache_invariant(cache));
62643+
62644+ if (cache->nr_slots == 0)
62645+ return;
62646+
62647+ write_lock(&(cache->guard));
62648+ /* find slot to update/add */
62649+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
62650+ /* oops, this node is already in a cache */
62651+ if (slot->node == node)
62652+ break;
62653+ }
62654+ /* if all slots are used, reuse least recently used one */
62655+ if (i == cache->nr_slots) {
62656+ slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
62657+ slot->node = (znode *) node;
62658+ }
62659+ list_move(&slot->lru, &cache->lru);
62660+ write_unlock(&(cache->guard));
62661+ assert("nikita-2473", cbk_cache_invariant(cache));
62662+}
62663+
62664+static int setup_delimiting_keys(cbk_handle * h);
62665+static lookup_result coord_by_handle(cbk_handle * handle);
62666+static lookup_result traverse_tree(cbk_handle * h);
62667+static int cbk_cache_search(cbk_handle * h);
62668+
62669+static level_lookup_result cbk_level_lookup(cbk_handle * h);
62670+static level_lookup_result cbk_node_lookup(cbk_handle * h);
62671+
62672+/* helper functions */
62673+
62674+static void update_stale_dk(reiser4_tree * tree, znode * node);
62675+
62676+/* release parent node during traversal */
62677+static void put_parent(cbk_handle * h);
62678+/* check consistency of fields */
62679+static int sanity_check(cbk_handle * h);
62680+/* release resources in handle */
62681+static void hput(cbk_handle * h);
62682+
62683+static level_lookup_result search_to_left(cbk_handle * h);
62684+
62685+/* pack numerous (numberous I should say) arguments of coord_by_key() into
62686+ * cbk_handle */
62687+static cbk_handle *cbk_pack(cbk_handle * handle,
62688+ reiser4_tree * tree,
62689+ const reiser4_key * key,
62690+ coord_t * coord,
62691+ lock_handle * active_lh,
62692+ lock_handle * parent_lh,
62693+ znode_lock_mode lock_mode,
62694+ lookup_bias bias,
62695+ tree_level lock_level,
62696+ tree_level stop_level,
62697+ __u32 flags, ra_info_t * info)
62698+{
62699+ memset(handle, 0, sizeof *handle);
62700+
62701+ handle->tree = tree;
62702+ handle->key = key;
62703+ handle->lock_mode = lock_mode;
62704+ handle->bias = bias;
62705+ handle->lock_level = lock_level;
62706+ handle->stop_level = stop_level;
62707+ handle->coord = coord;
62708+ /* set flags. See comment in tree.h:cbk_flags */
62709+ handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
62710+
62711+ handle->active_lh = active_lh;
62712+ handle->parent_lh = parent_lh;
62713+ handle->ra_info = info;
62714+ return handle;
62715+}
62716+
62717+/* main tree lookup procedure
62718+
62719+ Check the coord cache. If the key we are looking for is not found there,
62720+ call cbk() to do a real tree traversal.
62721+
62722+ As we have extents on the twig level, @lock_level and @stop_level can
62723+ be different from LEAF_LEVEL and each other.
62724+
62725+ Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
62726+ long term locks) while calling this.
62727+*/
62728+lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
62729+ * in. Usually this tree is
62730+ * part of file-system
62731+ * super-block */ ,
62732+ const reiser4_key * key /* key to look for */ ,
62733+ coord_t * coord /* where to store found
62734+ * position in a tree. Fields
62735+ * in "coord" are only valid if
62736+ * coord_by_key() returned
62737+ * "CBK_COORD_FOUND" */ ,
62738+ lock_handle * lh, /* resulting lock handle */
62739+ znode_lock_mode lock_mode /* type of lookup we
62740+ * want on node. Pass
62741+ * ZNODE_READ_LOCK here
62742+ * if you only want to
62743+ * read item found and
62744+ * ZNODE_WRITE_LOCK if
62745+ * you want to modify
62746+ * it */ ,
62747+ lookup_bias bias /* what to return if coord
62748+ * with exactly the @key is
62749+ * not in the tree */ ,
62750+ tree_level lock_level /* tree level where to start
62751+ * taking @lock type of
62752+ * locks */ ,
62753+ tree_level stop_level /* tree level to stop. Pass
62754+ * LEAF_LEVEL or TWIG_LEVEL
62755+ * here. Item being looked
62756+ * for has to be between
62757+ * @lock_level and
62758+ * @stop_level, inclusive */ ,
62759+ __u32 flags /* search flags */ ,
62760+ ra_info_t *
62761+ info
62762+ /* information about desired tree traversal readahead */
62763+ )
62764+{
62765+ cbk_handle handle;
62766+ lock_handle parent_lh;
62767+ lookup_result result;
62768+
62769+ init_lh(lh);
62770+ init_lh(&parent_lh);
62771+
62772+ assert("nikita-3023", reiser4_schedulable());
62773+
62774+ assert("nikita-353", tree != NULL);
62775+ assert("nikita-354", key != NULL);
62776+ assert("nikita-355", coord != NULL);
62777+ assert("nikita-356", (bias == FIND_EXACT)
62778+ || (bias == FIND_MAX_NOT_MORE_THAN));
62779+ assert("nikita-357", stop_level >= LEAF_LEVEL);
62780+ /* no locks can be held during tree traversal */
62781+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
62782+
62783+ cbk_pack(&handle,
62784+ tree,
62785+ key,
62786+ coord,
62787+ lh,
62788+ &parent_lh,
62789+ lock_mode, bias, lock_level, stop_level, flags, info);
62790+
62791+ result = coord_by_handle(&handle);
62792+ assert("nikita-3247",
62793+ ergo(!IS_CBKERR(result), coord->node == lh->node));
62794+ return result;
62795+}
62796+
62797+/* like coord_by_key(), but starts traversal from vroot of @object rather than
62798+ * from tree root. */
62799+lookup_result reiser4_object_lookup(struct inode * object,
62800+ const reiser4_key * key,
62801+ coord_t * coord,
62802+ lock_handle * lh,
62803+ znode_lock_mode lock_mode,
62804+ lookup_bias bias,
62805+ tree_level lock_level,
62806+ tree_level stop_level, __u32 flags,
62807+ ra_info_t * info)
62808+{
62809+ cbk_handle handle;
62810+ lock_handle parent_lh;
62811+ lookup_result result;
62812+
62813+ init_lh(lh);
62814+ init_lh(&parent_lh);
62815+
62816+ assert("nikita-3023", reiser4_schedulable());
62817+
62818+ assert("nikita-354", key != NULL);
62819+ assert("nikita-355", coord != NULL);
62820+ assert("nikita-356", (bias == FIND_EXACT)
62821+ || (bias == FIND_MAX_NOT_MORE_THAN));
62822+ assert("nikita-357", stop_level >= LEAF_LEVEL);
62823+ /* no locks can be held during tree search by key */
62824+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
62825+
62826+ cbk_pack(&handle,
62827+ object != NULL ? reiser4_tree_by_inode(object) : current_tree,
62828+ key,
62829+ coord,
62830+ lh,
62831+ &parent_lh,
62832+ lock_mode, bias, lock_level, stop_level, flags, info);
62833+ handle.object = object;
62834+
62835+ result = coord_by_handle(&handle);
62836+ assert("nikita-3247",
62837+ ergo(!IS_CBKERR(result), coord->node == lh->node));
62838+ return result;
62839+}
62840+
62841+/* lookup by cbk_handle. Common part of coord_by_key() and
62842+ reiser4_object_lookup(). */
62843+static lookup_result coord_by_handle(cbk_handle * handle)
62844+{
62845+ /*
62846+ * first check the cbk_cache (which is a look-aside cache for our tree)
62847+ * and if this fails, start traversal.
62848+ */
62849+ /* first check whether "key" is in cache of recent lookups. */
62850+ if (cbk_cache_search(handle) == 0)
62851+ return handle->result;
62852+ else
62853+ return traverse_tree(handle);
62854+}
62855+
62856+/* Execute actor for each item (or unit, depending on @through_units_p),
62857+ starting from @coord, right-ward, until either:
62858+
62859+ - end of the tree is reached
62860+ - unformatted node is met
62861+ - error occurred
62862+ - @actor returns 0 or less
62863+
62864+ The error code, or the last actor return value, is returned.
62865+
62866+ This is used by plugin/dir/hashed_dir.c:reiser4_find_entry() to move through
62867+ a sequence of entries with identical keys and the like.
62868+*/
62869+int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
62870+ coord_t * coord /* coord to start from */ ,
62871+ lock_handle * lh /* lock handle to start with and to
62872+ * update along the way */ ,
62873+ tree_iterate_actor_t actor /* function to call on each
62874+ * item/unit */ ,
62875+ void *arg /* argument to pass to @actor */ ,
62876+ znode_lock_mode mode /* lock mode on scanned nodes */ ,
62877+ int through_units_p /* call @actor on each item or on
62878+ * each unit */ )
62879+{
62880+ int result;
62881+
62882+ assert("nikita-1143", tree != NULL);
62883+ assert("nikita-1145", coord != NULL);
62884+ assert("nikita-1146", lh != NULL);
62885+ assert("nikita-1147", actor != NULL);
62886+
62887+ result = zload(coord->node);
62888+ coord_clear_iplug(coord);
62889+ if (result != 0)
62890+ return result;
62891+ if (!coord_is_existing_unit(coord)) {
62892+ zrelse(coord->node);
62893+ return -ENOENT;
62894+ }
62895+ while ((result = actor(tree, coord, lh, arg)) > 0) {
62896+ /* move further */
62897+ if ((through_units_p && coord_next_unit(coord)) ||
62898+ (!through_units_p && coord_next_item(coord))) {
62899+ do {
62900+ lock_handle couple;
62901+
62902+ /* move to the next node */
62903+ init_lh(&couple);
62904+ result =
62905+ reiser4_get_right_neighbor(&couple,
62906+ coord->node,
62907+ (int)mode,
62908+ GN_CAN_USE_UPPER_LEVELS);
62909+ zrelse(coord->node);
62910+ if (result == 0) {
62911+
62912+ result = zload(couple.node);
62913+ if (result != 0) {
62914+ done_lh(&couple);
62915+ return result;
62916+ }
62917+
62918+ coord_init_first_unit(coord,
62919+ couple.node);
62920+ done_lh(lh);
62921+ move_lh(lh, &couple);
62922+ } else
62923+ return result;
62924+ } while (node_is_empty(coord->node));
62925+ }
62926+
62927+ assert("nikita-1149", coord_is_existing_unit(coord));
62928+ }
62929+ zrelse(coord->node);
62930+ return result;
62931+}
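For illustration (not part of the patch; example_count_actor is an invented name), an actor that merely counts the items it visits would look like this; returning a positive value keeps the iteration going, as described above:

/* Illustrative tree_iterate_actor_t: counts items; @arg points to an int. */
static int example_count_actor(reiser4_tree *tree, coord_t *coord,
			       lock_handle *lh, void *arg)
{
	int *counter = arg;

	++*counter;
	return 1;	/* > 0: continue to the next item */
}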
62932+
62933+/* return locked uber znode for @tree */
62934+int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
62935+ znode_lock_request pri, lock_handle * lh)
62936+{
62937+ int result;
62938+
62939+ result = longterm_lock_znode(lh, tree->uber, mode, pri);
62940+ return result;
62941+}
62942+
62943+/* true if @key is strictly within @node
62944+
62945+ we are looking for a possibly non-unique key and the item is at the edge of
62946+ @node. Maybe it is in the neighbor.
62947+*/
62948+static int znode_contains_key_strict(znode * node /* node to check key
62949+ * against */ ,
62950+ const reiser4_key *
62951+ key /* key to check */ ,
62952+ int isunique)
62953+{
62954+ int answer;
62955+
62956+ assert("nikita-1760", node != NULL);
62957+ assert("nikita-1722", key != NULL);
62958+
62959+ if (keyge(key, &node->rd_key))
62960+ return 0;
62961+
62962+ answer = keycmp(&node->ld_key, key);
62963+
62964+ if (isunique)
62965+ return answer != GREATER_THAN;
62966+ else
62967+ return answer == LESS_THAN;
62968+}
62969+
62970+/*
62971+ * Virtual Root (vroot) code.
62972+ *
62973+ * For a given file system object (e.g., a regular file or directory) let's
62974+ * define its "virtual root" as the lowest node in the tree (that is, the
62975+ * one furthest from the tree root) such that all body items of said object
62976+ * are located in the sub-tree rooted at this node.
62977+ *
62978+ * Once the vroot of an object is found, all tree lookups for items within
62979+ * the body of this object ("object lookups") can be started from its vroot
62980+ * rather than from the real root. This has the following advantages:
62981+ *
62982+ * 1. the number of nodes traversed during lookup (and, hence, the number
62983+ * of key comparisons made) decreases, and
62984+ *
62985+ * 2. contention on the tree root is decreased. The latter was actually
62986+ * the motivating reason behind vroot, because the spin lock of the root
62987+ * node, which is taken when acquiring a long-term lock on the root node,
62988+ * is the hottest lock in reiser4.
62989+ *
62990+ * How to find the vroot.
62991+ *
62992+ * When the vroot of object F is not yet determined, all object lookups
62993+ * start from the root of the tree. At each tree level during traversal we
62994+ * have a node N such that the key we are looking for (which is a key inside
62995+ * the object's body) is located within N. In the function handle_vroot(),
62996+ * called from cbk_level_lookup(), we check whether N is a possible vroot
62997+ * for F. The check is trivial: if neither the leftmost nor the rightmost
62998+ * item of N belongs to F (and we already have the helpful ->owns_item()
62999+ * method of the object plugin for this), then N is a possible vroot of F.
63000+ * This, of course, relies on the assumption that each object occupies a
63001+ * contiguous range of keys in the tree.
63002+ *
63003+ * Thus, traversing the tree downward and checking each node as we go, we
63004+ * can find the lowest such node, which, by definition, is the vroot.
63005+ *
63006+ * How to track the vroot.
63007+ *
63008+ * Nohow. If the actual vroot changes, the next object lookup will just
63009+ * restart from the actual tree root, refreshing the object's vroot on the way.
63010+ *
63011+ */
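The practical upshot (illustrative call below; the argument values shown are typical, not prescribed) is that body lookups go through reiser4_object_lookup(), defined earlier in this file, passing the inode so the machinery below can transparently start traversal at the object's vroot when one is known:

/* Illustrative object lookup that benefits from a cached vroot. */
result = reiser4_object_lookup(inode, &key, &coord, &lh,
			       ZNODE_READ_LOCK, FIND_EXACT,
			       LEAF_LEVEL, LEAF_LEVEL,
			       0 /* flags */, NULL /* no readahead info */);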
63012+
63013+/*
63014+ * Check whether @node is a possible vroot of @object.
63015+ */
63016+static void handle_vroot(struct inode *object, znode * node)
63017+{
63018+ file_plugin *fplug;
63019+ coord_t coord;
63020+
63021+ fplug = inode_file_plugin(object);
63022+ assert("nikita-3353", fplug != NULL);
63023+ assert("nikita-3354", fplug->owns_item != NULL);
63024+
63025+ if (unlikely(node_is_empty(node)))
63026+ return;
63027+
63028+ coord_init_first_unit(&coord, node);
63029+ /*
63030+ * if leftmost item of @node belongs to @object, we cannot be sure
63031+ * that @node is vroot of @object, because, some items of @object are
63032+ * probably in the sub-tree rooted at the left neighbor of @node.
63033+ */
63034+ if (fplug->owns_item(object, &coord))
63035+ return;
63036+ coord_init_last_unit(&coord, node);
63037+ /* mutatis mutandis for the rightmost item */
63038+ if (fplug->owns_item(object, &coord))
63039+ return;
63040+ /* otherwise, @node is possible vroot of @object */
63041+ inode_set_vroot(object, node);
63042+}
63043+
63044+/*
63045+ * helper function used by traverse_tree() to start tree traversal not from
63046+ * the tree root, but from @h->object's vroot, if possible.
63047+ */
63048+static int prepare_object_lookup(cbk_handle * h)
63049+{
63050+ znode *vroot;
63051+ int result;
63052+
63053+ vroot = inode_get_vroot(h->object);
63054+ if (vroot == NULL) {
63055+ /*
63056+ * object doesn't have known vroot, start from real tree root.
63057+ */
63058+ return LOOKUP_CONT;
63059+ }
63060+
63061+ h->level = znode_get_level(vroot);
63062+ /* take a long-term lock on vroot */
63063+ h->result = longterm_lock_znode(h->active_lh, vroot,
63064+ cbk_lock_mode(h->level, h),
63065+ ZNODE_LOCK_LOPRI);
63066+ result = LOOKUP_REST;
63067+ if (h->result == 0) {
63068+ int isunique;
63069+ int inside;
63070+
63071+ isunique = h->flags & CBK_UNIQUE;
63072+ /* check that key is inside vroot */
63073+ read_lock_dk(h->tree);
63074+ inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
63075+ !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
63076+ read_unlock_dk(h->tree);
63077+ if (inside) {
63078+ h->result = zload(vroot);
63079+ if (h->result == 0) {
63080+ /* search for key in vroot. */
63081+ result = cbk_node_lookup(h);
63082+ zrelse(vroot); /*h->active_lh->node); */
63083+ if (h->active_lh->node != vroot) {
63084+ result = LOOKUP_REST;
63085+ } else if (result == LOOKUP_CONT) {
63086+ move_lh(h->parent_lh, h->active_lh);
63087+ h->flags &= ~CBK_DKSET;
63088+ }
63089+ }
63090+ }
63091+ }
63092+
63093+ zput(vroot);
63094+
63095+ if (IS_CBKERR(h->result) || result == LOOKUP_REST)
63096+ hput(h);
63097+ return result;
63098+}
63099+
63100+/* main function that handles common parts of tree traversal: starting
63101+ (fake znode handling), restarts, error handling, completion */
63102+static lookup_result traverse_tree(cbk_handle * h /* search handle */ )
63103+{
63104+ int done;
63105+ int iterations;
63106+ int vroot_used;
63107+
63108+ assert("nikita-365", h != NULL);
63109+ assert("nikita-366", h->tree != NULL);
63110+ assert("nikita-367", h->key != NULL);
63111+ assert("nikita-368", h->coord != NULL);
63112+ assert("nikita-369", (h->bias == FIND_EXACT)
63113+ || (h->bias == FIND_MAX_NOT_MORE_THAN));
63114+ assert("nikita-370", h->stop_level >= LEAF_LEVEL);
63115+ assert("nikita-2949", !(h->flags & CBK_DKSET));
63116+ assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
63117+
63118+ done = 0;
63119+ iterations = 0;
63120+ vroot_used = 0;
63121+
63122+ /* loop for restarts */
63123+ restart:
63124+
63125+ assert("nikita-3024", reiser4_schedulable());
63126+
63127+ h->result = CBK_COORD_FOUND;
63128+ /* connect_znode() needs it */
63129+ h->ld_key = *reiser4_min_key();
63130+ h->rd_key = *reiser4_max_key();
63131+ h->flags |= CBK_DKSET;
63132+ h->error = NULL;
63133+
63134+ if (!vroot_used && h->object != NULL) {
63135+ vroot_used = 1;
63136+ done = prepare_object_lookup(h);
63137+ if (done == LOOKUP_REST) {
63138+ goto restart;
63139+ } else if (done == LOOKUP_DONE)
63140+ return h->result;
63141+ }
63142+ if (h->parent_lh->node == NULL) {
63143+ done =
63144+ get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
63145+ h->parent_lh);
63146+
63147+ assert("nikita-1637", done != -E_DEADLOCK);
63148+
63149+ h->block = h->tree->root_block;
63150+ h->level = h->tree->height;
63151+ h->coord->node = h->parent_lh->node;
63152+
63153+ if (done != 0)
63154+ return done;
63155+ }
63156+
63157+ /* loop descending a tree */
63158+ while (!done) {
63159+
63160+ if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
63161+ IS_POW(iterations))) {
63162+ warning("nikita-1481", "Too many iterations: %i",
63163+ iterations);
63164+ reiser4_print_key("key", h->key);
63165+ ++iterations;
63166+ } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
63167+ h->error =
63168+ "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
63169+ h->result = RETERR(-EIO);
63170+ break;
63171+ }
63172+ switch (cbk_level_lookup(h)) {
63173+ case LOOKUP_CONT:
63174+ move_lh(h->parent_lh, h->active_lh);
63175+ continue;
63176+ default:
63177+ wrong_return_value("nikita-372", "cbk_level");
63178+ case LOOKUP_DONE:
63179+ done = 1;
63180+ break;
63181+ case LOOKUP_REST:
63182+ hput(h);
63183+			/* deadlock avoidance is the normal case. */
63184+ if (h->result != -E_DEADLOCK)
63185+ ++iterations;
63186+ reiser4_preempt_point();
63187+ goto restart;
63188+ }
63189+ }
63190+ /* that's all. The rest is error handling */
63191+ if (unlikely(h->error != NULL)) {
63192+ warning("nikita-373", "%s: level: %i, "
63193+ "lock_level: %i, stop_level: %i "
63194+ "lock_mode: %s, bias: %s",
63195+ h->error, h->level, h->lock_level, h->stop_level,
63196+ lock_mode_name(h->lock_mode), bias_name(h->bias));
63197+ reiser4_print_address("block", &h->block);
63198+ reiser4_print_key("key", h->key);
63199+ print_coord_content("coord", h->coord);
63200+ }
63201+ /* `unlikely' error case */
63202+ if (unlikely(IS_CBKERR(h->result))) {
63203+ /* failure. do cleanup */
63204+ hput(h);
63205+ } else {
63206+ assert("nikita-1605", WITH_DATA_RET
63207+ (h->coord->node, 1,
63208+ ergo((h->result == CBK_COORD_FOUND) &&
63209+ (h->bias == FIND_EXACT) &&
63210+ (!node_is_empty(h->coord->node)),
63211+ coord_is_existing_item(h->coord))));
63212+ }
63213+ return h->result;
63214+}
63215+
63216+/* find delimiting keys of child
63217+
63218+ Determine left and right delimiting keys for child pointed to by
63219+ @parent_coord.
63220+
63221+*/
63222+static void find_child_delimiting_keys(znode * parent /* parent znode, passed
63223+ * locked */ ,
63224+ const coord_t * parent_coord /* coord where
63225+ * pointer to
63226+ * child is
63227+ * stored */ ,
63228+ reiser4_key * ld /* where to store left
63229+ * delimiting key */ ,
63230+ reiser4_key * rd /* where to store right
63231+ * delimiting key */ )
63232+{
63233+ coord_t neighbor;
63234+
63235+ assert("nikita-1484", parent != NULL);
63236+ assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
63237+
63238+ coord_dup(&neighbor, parent_coord);
63239+
63240+ if (neighbor.between == AT_UNIT)
63241+ /* imitate item ->lookup() behavior. */
63242+ neighbor.between = AFTER_UNIT;
63243+
63244+ if (coord_set_to_left(&neighbor) == 0)
63245+ unit_key_by_coord(&neighbor, ld);
63246+ else {
63247+ assert("nikita-14851", 0);
63248+ *ld = *znode_get_ld_key(parent);
63249+ }
63250+
63251+ coord_dup(&neighbor, parent_coord);
63252+ if (neighbor.between == AT_UNIT)
63253+ neighbor.between = AFTER_UNIT;
63254+ if (coord_set_to_right(&neighbor) == 0)
63255+ unit_key_by_coord(&neighbor, rd);
63256+ else
63257+ *rd = *znode_get_rd_key(parent);
63258+}
63259+
63260+/*
63261+ * setup delimiting keys for a child
63262+ *
63263+ * @parent parent node
63264+ *
63265+ * @coord location in @parent where pointer to @child is
63266+ *
63267+ * @child child node
63268+ */
63269+int
63270+set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child)
63271+{
63272+ reiser4_tree *tree;
63273+
63274+ assert("nikita-2952",
63275+ znode_get_level(parent) == znode_get_level(coord->node));
63276+
63277+ /* fast check without taking dk lock. This is safe, because
63278+ * JNODE_DKSET is never cleared once set. */
63279+ if (!ZF_ISSET(child, JNODE_DKSET)) {
63280+ tree = znode_get_tree(parent);
63281+ write_lock_dk(tree);
63282+ if (likely(!ZF_ISSET(child, JNODE_DKSET))) {
63283+ find_child_delimiting_keys(parent, coord,
63284+ &child->ld_key,
63285+ &child->rd_key);
63286+ ON_DEBUG(child->ld_key_version =
63287+ atomic_inc_return(&delim_key_version);
63288+ child->rd_key_version =
63289+ atomic_inc_return(&delim_key_version););
63290+ ZF_SET(child, JNODE_DKSET);
63291+ }
63292+ write_unlock_dk(tree);
63293+ return 1;
63294+ }
63295+ return 0;
63296+}
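+/*
+ * The function above is an instance of double-checked locking; in outline
+ * (a minimal sketch, with illustrative names that are not part of reiser4):
+ *
+ *	if (!flag_is_set(obj)) {           check cheaply, without the lock
+ *		lock(guard);
+ *		if (!flag_is_set(obj)) {   recheck under the lock
+ *			fill_in_data(obj);
+ *			set_flag(obj);     the flag is never cleared again
+ *		}
+ *		unlock(guard);
+ *	}
+ *
+ * The lock-free fast path is safe only because JNODE_DKSET, once set, is
+ * never cleared.
+ */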
63297+
63298+/* Perform tree lookup at one level. This is called from traverse_tree(),
63299+   which drives the lookup through the tree and calls cbk_node_lookup() to
63300+   perform the lookup within one node.
63301+
63302+   See comments in the code.
63303+*/
63304+static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ )
63305+{
63306+ int ret;
63307+ int setdk;
63308+ int ldkeyset = 0;
63309+ reiser4_key ldkey;
63310+ reiser4_key key;
63311+ znode *active;
63312+
63313+ assert("nikita-3025", reiser4_schedulable());
63314+
63315+ /* acquire reference to @active node */
63316+ active =
63317+ zget(h->tree, &h->block, h->parent_lh->node, h->level,
63318+ reiser4_ctx_gfp_mask_get());
63319+
63320+ if (IS_ERR(active)) {
63321+ h->result = PTR_ERR(active);
63322+ return LOOKUP_DONE;
63323+ }
63324+
63325+ /* lock @active */
63326+ h->result = longterm_lock_znode(h->active_lh,
63327+ active,
63328+ cbk_lock_mode(h->level, h),
63329+ ZNODE_LOCK_LOPRI);
63330+ /* longterm_lock_znode() acquires additional reference to znode (which
63331+ will be later released by longterm_unlock_znode()). Release
63332+ reference acquired by zget().
63333+ */
63334+ zput(active);
63335+ if (unlikely(h->result != 0))
63336+ goto fail_or_restart;
63337+
63338+ setdk = 0;
63339+ /* if @active is accessed for the first time, setup delimiting keys on
63340+ it. Delimiting keys are taken from the parent node. See
63341+ setup_delimiting_keys() for details.
63342+ */
63343+ if (h->flags & CBK_DKSET) {
63344+ setdk = setup_delimiting_keys(h);
63345+ h->flags &= ~CBK_DKSET;
63346+ } else {
63347+ znode *parent;
63348+
63349+ parent = h->parent_lh->node;
63350+ h->result = zload(parent);
63351+ if (unlikely(h->result != 0))
63352+ goto fail_or_restart;
63353+
63354+ if (!ZF_ISSET(active, JNODE_DKSET))
63355+ setdk = set_child_delimiting_keys(parent,
63356+ h->coord, active);
63357+ else {
63358+ read_lock_dk(h->tree);
63359+ find_child_delimiting_keys(parent, h->coord, &ldkey,
63360+ &key);
63361+ read_unlock_dk(h->tree);
63362+ ldkeyset = 1;
63363+ }
63364+ zrelse(parent);
63365+ }
63366+
63367+	/* this is an ugly kludge. Reminder: it is necessary, because
63368+ ->lookup() method returns coord with ->between field probably set
63369+ to something different from AT_UNIT.
63370+ */
63371+ h->coord->between = AT_UNIT;
63372+
63373+ if (znode_just_created(active) && (h->coord->node != NULL)) {
63374+ write_lock_tree(h->tree);
63375+ /* if we are going to load znode right now, setup
63376+ ->in_parent: coord where pointer to this node is stored in
63377+ parent.
63378+ */
63379+ coord_to_parent_coord(h->coord, &active->in_parent);
63380+ write_unlock_tree(h->tree);
63381+ }
63382+
63383+ /* check connectedness without holding tree lock---false negatives
63384+ * will be re-checked by connect_znode(), and false positives are
63385+ * impossible---@active cannot suddenly turn into unconnected
63386+ * state. */
63387+ if (!znode_is_connected(active)) {
63388+ h->result = connect_znode(h->coord, active);
63389+ if (unlikely(h->result != 0)) {
63390+ put_parent(h);
63391+ goto fail_or_restart;
63392+ }
63393+ }
63394+
63395+ jload_prefetch(ZJNODE(active));
63396+
63397+ if (setdk)
63398+ update_stale_dk(h->tree, active);
63399+
63400+	/* put_parent() cannot be called earlier, because connect_znode()
63401+	   assumes the parent node is referenced. */
63402+ put_parent(h);
63403+
63404+ if ((!znode_contains_key_lock(active, h->key) &&
63405+ (h->flags & CBK_TRUST_DK))
63406+ || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
63407+ /* 1. key was moved out of this node while this thread was
63408+ waiting for the lock. Restart. More elaborate solution is
63409+ to determine where key moved (to the left, or to the right)
63410+ and try to follow it through sibling pointers.
63411+
63412+ 2. or, node itself is going to be removed from the
63413+ tree. Release lock and restart.
63414+ */
63415+ h->result = -E_REPEAT;
63416+ }
63417+ if (h->result == -E_REPEAT)
63418+ return LOOKUP_REST;
63419+
63420+ h->result = zload_ra(active, h->ra_info);
63421+ if (h->result) {
63422+ return LOOKUP_DONE;
63423+ }
63424+
63425+ /* sanity checks */
63426+ if (sanity_check(h)) {
63427+ zrelse(active);
63428+ return LOOKUP_DONE;
63429+ }
63430+
63431+	/* check that the key of the leftmost item in @active is the same as in
63432+ * its parent */
63433+ if (ldkeyset && !node_is_empty(active) &&
63434+ !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
63435+ warning("vs-3533", "Keys are inconsistent. Fsck?");
63436+ reiser4_print_key("inparent", &ldkey);
63437+ reiser4_print_key("inchild", &key);
63438+ h->result = RETERR(-EIO);
63439+ zrelse(active);
63440+ return LOOKUP_DONE;
63441+ }
63442+
63443+ if (h->object != NULL)
63444+ handle_vroot(h->object, active);
63445+
63446+ ret = cbk_node_lookup(h);
63447+
63448+ /* h->active_lh->node might change, but active is yet to be zrelsed */
63449+ zrelse(active);
63450+
63451+ return ret;
63452+
63453+ fail_or_restart:
63454+ if (h->result == -E_DEADLOCK)
63455+ return LOOKUP_REST;
63456+ return LOOKUP_DONE;
63457+}
63458+
63459+#if REISER4_DEBUG
63460+/* check left and right delimiting keys of a znode */
63461+void check_dkeys(znode * node)
63462+{
63463+ znode *left;
63464+ znode *right;
63465+
63466+ read_lock_tree(current_tree);
63467+ read_lock_dk(current_tree);
63468+
63469+ assert("vs-1710", znode_is_any_locked(node));
63470+ assert("vs-1197",
63471+ !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
63472+
63473+ left = node->left;
63474+ right = node->right;
63475+
63476+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
63477+ && left != NULL && ZF_ISSET(left, JNODE_DKSET))
63478+		/* check the left neighbor. Note that the left neighbor is not
63479+		   locked, so its delimiting keys may therefore be stale */
63480+ assert("vs-1198",
63481+ (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
63482+ || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
63483+
63484+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
63485+ && right != NULL && ZF_ISSET(right, JNODE_DKSET))
63486+		/* check the right neighbor. Note that the right neighbor is not
63487+		   locked, so its delimiting keys may therefore be stale */
63488+ assert("vs-1199",
63489+ (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
63490+ || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
63491+
63492+ read_unlock_dk(current_tree);
63493+ read_unlock_tree(current_tree);
63494+}
63495+#endif
63496+
63497+/* true if @key is left delimiting key of @node */
63498+static int key_is_ld(znode * node, const reiser4_key * key)
63499+{
63500+ int ld;
63501+
63502+ assert("nikita-1716", node != NULL);
63503+ assert("nikita-1758", key != NULL);
63504+
63505+ read_lock_dk(znode_get_tree(node));
63506+ assert("nikita-1759", znode_contains_key(node, key));
63507+ ld = keyeq(znode_get_ld_key(node), key);
63508+ read_unlock_dk(znode_get_tree(node));
63509+ return ld;
63510+}
63511+
63512+/* Process one node during tree traversal.
63513+
63514+ This is called by cbk_level_lookup(). */
63515+static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ )
63516+{
63517+ /* node plugin of @active */
63518+ node_plugin *nplug;
63519+ /* item plugin of item that was found */
63520+ item_plugin *iplug;
63521+ /* search bias */
63522+ lookup_bias node_bias;
63523+ /* node we are operating upon */
63524+ znode *active;
63525+ /* tree we are searching in */
63526+ reiser4_tree *tree;
63527+ /* result */
63528+ int result;
63529+
63530+ assert("nikita-379", h != NULL);
63531+
63532+ active = h->active_lh->node;
63533+ tree = h->tree;
63534+
63535+ nplug = active->nplug;
63536+ assert("nikita-380", nplug != NULL);
63537+
63538+ ON_DEBUG(check_dkeys(active));
63539+
63540+ /* return item from "active" node with maximal key not greater than
63541+ "key" */
63542+ node_bias = h->bias;
63543+ result = nplug->lookup(active, h->key, node_bias, h->coord);
63544+ if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
63545+ /* error occurred */
63546+ h->result = result;
63547+ return LOOKUP_DONE;
63548+ }
63549+ if (h->level == h->stop_level) {
63550+ /* welcome to the stop level */
63551+ assert("nikita-381", h->coord->node == active);
63552+ if (result == NS_FOUND) {
63553+ /* success of tree lookup */
63554+ if (!(h->flags & CBK_UNIQUE)
63555+ && key_is_ld(active, h->key)) {
63556+ return search_to_left(h);
63557+ } else
63558+ h->result = CBK_COORD_FOUND;
63559+ } else {
63560+ h->result = CBK_COORD_NOTFOUND;
63561+ }
63562+ if (!(h->flags & CBK_IN_CACHE))
63563+ cbk_cache_add(active);
63564+ return LOOKUP_DONE;
63565+ }
63566+
63567+ if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
63568+ h->error = "not found on internal node";
63569+ h->result = result;
63570+ return LOOKUP_DONE;
63571+ }
63572+
63573+ assert("vs-361", h->level > h->stop_level);
63574+
63575+ if (handle_eottl(h, &result)) {
63576+ assert("vs-1674", (result == LOOKUP_DONE ||
63577+ result == LOOKUP_REST));
63578+ return result;
63579+ }
63580+
63581+ /* go down to next level */
63582+ check_me("vs-12", zload(h->coord->node) == 0);
63583+ assert("nikita-2116", item_is_internal(h->coord));
63584+ iplug = item_plugin_by_coord(h->coord);
63585+ iplug->s.internal.down_link(h->coord, h->key, &h->block);
63586+ zrelse(h->coord->node);
63587+ --h->level;
63588+ return LOOKUP_CONT; /* continue */
63589+}
63590+
63591+/* scan cbk_cache slots looking for a match for @h */
63592+static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ )
63593+{
63594+ level_lookup_result llr;
63595+ znode *node;
63596+ reiser4_tree *tree;
63597+ cbk_cache_slot *slot;
63598+ cbk_cache *cache;
63599+ tree_level level;
63600+ int isunique;
63601+ const reiser4_key *key;
63602+ int result;
63603+
63604+ assert("nikita-1317", h != NULL);
63605+ assert("nikita-1315", h->tree != NULL);
63606+ assert("nikita-1316", h->key != NULL);
63607+
63608+ tree = h->tree;
63609+ cache = &tree->cbk_cache;
63610+ if (cache->nr_slots == 0)
63611+ /* size of cbk cache was set to 0 by mount time option. */
63612+ return RETERR(-ENOENT);
63613+
63614+ assert("nikita-2474", cbk_cache_invariant(cache));
63615+ node = NULL; /* to keep gcc happy */
63616+ level = h->level;
63617+ key = h->key;
63618+ isunique = h->flags & CBK_UNIQUE;
63619+ result = RETERR(-ENOENT);
63620+
63621+ /*
63622+	 * this is a time-critical function and dragons have, hence, settled
63623+	 * here.
63624+ *
63625+	 * The loop below scans cbk cache slots trying to find a matching node
63626+	 * with a suitable range of delimiting keys, located at h->level.
63627+ *
63628+	 * The scan is done under the cbk cache spin lock that protects
63629+	 * slot->node pointers. If a suitable node is found, we want to pin it
63630+	 * in memory. But slot->node can point to a node with x_count 0
63631+	 * (unreferenced). Such a node can be recycled at any moment, or can
63632+	 * already be in the process of being recycled (within jput()).
63633+	 *
63634+	 * Since we found the node in the cbk cache, jput() hasn't yet
63635+	 * called cbk_cache_invalidate().
63636+	 *
63637+	 * We acquire a reference to the node without holding the tree lock,
63638+	 * and later check the node's RIP bit. This avoids races with jput().
63639+ */
63640+
63641+ rcu_read_lock();
63642+ read_lock(&((cbk_cache *)cache)->guard);
63643+
63644+ slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
63645+ slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
63646+	BUG_ON(&slot->lru != &cache->lru); /* next then prev of the list head must lead back to the head */
63647+ while (1) {
63648+
63649+ slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
63650+
63651+ if (&cache->lru != &slot->lru)
63652+ node = slot->node;
63653+ else
63654+ node = NULL;
63655+
63656+ if (unlikely(node == NULL))
63657+ break;
63658+
63659+ /*
63660+ * this is (hopefully) the only place in the code where we are
63661+		 * is fine here, because this is only a "guess" anyway---keys
63662+ * is fine here, because this is only "guess" anyway---keys
63663+ * are rechecked under dk lock below.
63664+ */
63665+ if (znode_get_level(node) == level &&
63666+ /* reiser4_min_key < key < reiser4_max_key */
63667+ znode_contains_key_strict(node, key, isunique)) {
63668+ zref(node);
63669+ result = 0;
63670+ spin_lock_prefetch(&tree->tree_lock);
63671+ break;
63672+ }
63673+ }
63674+ read_unlock(&((cbk_cache *)cache)->guard);
63675+
63676+ assert("nikita-2475", cbk_cache_invariant(cache));
63677+
63678+ if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
63679+ result = -ENOENT;
63680+
63681+ rcu_read_unlock();
63682+
63683+ if (result != 0) {
63684+ h->result = CBK_COORD_NOTFOUND;
63685+ return RETERR(-ENOENT);
63686+ }
63687+
63688+ result =
63689+ longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
63690+ ZNODE_LOCK_LOPRI);
63691+ zput(node);
63692+ if (result != 0)
63693+ return result;
63694+ result = zload(node);
63695+ if (result != 0)
63696+ return result;
63697+
63698+ /* recheck keys */
63699+ read_lock_dk(tree);
63700+ result = (znode_contains_key_strict(node, key, isunique) &&
63701+ !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
63702+ read_unlock_dk(tree);
63703+ if (result) {
63704+ /* do lookup inside node */
63705+ llr = cbk_node_lookup(h);
63706+ /* if cbk_node_lookup() wandered to another node (due to eottl
63707+ or non-unique keys), adjust @node */
63708+ /*node = h->active_lh->node; */
63709+
63710+ if (llr != LOOKUP_DONE) {
63711+ /* restart or continue on the next level */
63712+ result = RETERR(-ENOENT);
63713+ } else if (IS_CBKERR(h->result))
63714+ /* io or oom */
63715+ result = RETERR(-ENOENT);
63716+ else {
63717+ /* good. Either item found or definitely not found. */
63718+ result = 0;
63719+
63720+ write_lock(&(cache->guard));
63721+ if (slot->node == h->active_lh->node /*node */ ) {
63722+ /* if this node is still in cbk cache---move
63723+ its slot to the head of the LRU list. */
63724+ list_move(&slot->lru, &cache->lru);
63725+ }
63726+ write_unlock(&(cache->guard));
63727+ }
63728+ } else {
63729+		/* race. While this thread was waiting for the lock, the node
63730+		   was rebalanced and the item we are looking for was shifted
63731+		   out of it (if it ever was here).
63732+
63733+		   Continuing the scan is almost hopeless: the node the key
63734+		   range was moved to is almost certainly at the beginning of
63735+		   the LRU list at this time, because it's hot, but restarting
63736+		   the scan from the very beginning is complex. Just return, so
63737+		   that cbk() will be performed. This is not that important,
63738+		   because such races should be rare. Are they?
63739+ */
63740+ result = RETERR(-ENOENT); /* -ERAUGHT */
63741+ }
63742+ zrelse(node);
63743+ assert("nikita-2476", cbk_cache_invariant(cache));
63744+ return result;
63745+}
63746+
63747+/* look for item with given key in the coord cache
63748+
63749+ This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
63750+   which is a small LRU list of recently accessed znodes. For each znode
63751+   in this list, it checks whether the key we are looking for fits into the
63752+   key range covered by that node. If so, and in addition the node lies at an
63753+   allowed level (this is to handle extents on the twig level), the node is
63754+   locked, and a lookup inside it is performed.
63755+
63756+   We need a measurement of the cost of this cache search compared to the cost
63757+   of coord_by_key.
63758+
63759+*/
63760+static int cbk_cache_search(cbk_handle * h /* cbk handle */ )
63761+{
63762+ int result = 0;
63763+ tree_level level;
63764+
63765+ /* add CBK_IN_CACHE to the handle flags. This means that
63766+ * cbk_node_lookup() assumes that cbk_cache is scanned and would add
63767+ * found node to the cache. */
63768+ h->flags |= CBK_IN_CACHE;
63769+ for (level = h->stop_level; level <= h->lock_level; ++level) {
63770+ h->level = level;
63771+ result = cbk_cache_scan_slots(h);
63772+ if (result != 0) {
63773+ done_lh(h->active_lh);
63774+ done_lh(h->parent_lh);
63775+ } else {
63776+ assert("nikita-1319", !IS_CBKERR(h->result));
63777+ break;
63778+ }
63779+ }
63780+ h->flags &= ~CBK_IN_CACHE;
63781+ return result;
63782+}
63783+
63784+/* type of lock we want to obtain during tree traversal. On the stop level
63785+   we take the lock type the user asked for; on upper levels, a read lock. */
63786+znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
63787+{
63788+ assert("nikita-382", h != NULL);
63789+
63790+ return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
63791+}
63792+
63793+/* update outdated delimiting keys */
63794+static void stale_dk(reiser4_tree * tree, znode * node)
63795+{
63796+ znode *right;
63797+
63798+ read_lock_tree(tree);
63799+ write_lock_dk(tree);
63800+ right = node->right;
63801+
63802+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
63803+ right && ZF_ISSET(right, JNODE_DKSET) &&
63804+ !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
63805+ znode_set_rd_key(node, znode_get_ld_key(right));
63806+
63807+ write_unlock_dk(tree);
63808+ read_unlock_tree(tree);
63809+}
63810+
63811+/* check for possibly outdated delimiting keys, and update them if
63812+ * necessary. */
63813+static void update_stale_dk(reiser4_tree * tree, znode * node)
63814+{
63815+ znode *right;
63816+ reiser4_key rd;
63817+
63818+ read_lock_tree(tree);
63819+ read_lock_dk(tree);
63820+ rd = *znode_get_rd_key(node);
63821+ right = node->right;
63822+ if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
63823+ right && ZF_ISSET(right, JNODE_DKSET) &&
63824+ !keyeq(&rd, znode_get_ld_key(right)))) {
63825+ assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
63826+ read_unlock_dk(tree);
63827+ read_unlock_tree(tree);
63828+ stale_dk(tree, node);
63829+ return;
63830+ }
63831+ read_unlock_dk(tree);
63832+ read_unlock_tree(tree);
63833+}
63834+
63835+/*
63836+ * handle searches for a non-unique key.
63837+ *
63838+ * Suppose that we are looking for an item with possibly non-unique key 100.
63839+ *
63840+ * The root node contains two pointers: one to a node with left delimiting key
63841+ * 0, and another to a node with left delimiting key 100. The item we are
63842+ * interested in may well be in the sub-tree rooted at the first pointer.
63843+ *
63844+ * To handle this, search_to_left() is called when the search reaches the stop
63845+ * level. This function checks that it is _possible_ that the item we are
63846+ * looking for is in the left neighbor (this can be done by comparing
63847+ * delimiting keys) and, if so, tries to lock the left neighbor (this is a
63848+ * low-priority lock, so it can deadlock; tree traversal is simply restarted
63849+ * if it did) and then checks whether the left neighbor actually contains
63850+ * items with our key.
63851+ *
63852+ * Note that this is done on the stop level only. It is possible to try such
63853+ * a left-check on each level, but as duplicate keys are supposed to be rare
63854+ * (it is very unlikely that more than one node is completely filled with
63855+ * items with duplicate keys), it is cheaper to scan to the left on the stop
63856+ * level once.
63855+ *
63856+ */
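+/*
+ * A picture of the case described above (keys are illustrative):
+ *
+ *	            [ root: 0 | 100 ]
+ *	             /            \
+ *	  [ ... 100 100 ]       [ 100 ... ]
+ *
+ * A lookup for key 100 descends along the second pointer, but earlier
+ * duplicates of key 100 live in the left leaf, so the leftmost match can
+ * only be found by also probing the left neighbor on the stop level.
+ */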
63857+static level_lookup_result search_to_left(cbk_handle * h /* search handle */ )
63858+{
63859+ level_lookup_result result;
63860+ coord_t *coord;
63861+ znode *node;
63862+ znode *neighbor;
63863+
63864+ lock_handle lh;
63865+
63866+ assert("nikita-1761", h != NULL);
63867+ assert("nikita-1762", h->level == h->stop_level);
63868+
63869+ init_lh(&lh);
63870+ coord = h->coord;
63871+ node = h->active_lh->node;
63872+ assert("nikita-1763", coord_is_leftmost_unit(coord));
63873+
63874+ h->result =
63875+ reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
63876+ GN_CAN_USE_UPPER_LEVELS);
63877+ neighbor = NULL;
63878+ switch (h->result) {
63879+ case -E_DEADLOCK:
63880+ result = LOOKUP_REST;
63881+ break;
63882+ case 0:{
63883+ node_plugin *nplug;
63884+ coord_t crd;
63885+ lookup_bias bias;
63886+
63887+ neighbor = lh.node;
63888+ h->result = zload(neighbor);
63889+ if (h->result != 0) {
63890+ result = LOOKUP_DONE;
63891+ break;
63892+ }
63893+
63894+ nplug = neighbor->nplug;
63895+
63896+ coord_init_zero(&crd);
63897+ bias = h->bias;
63898+ h->bias = FIND_EXACT;
63899+ h->result =
63900+ nplug->lookup(neighbor, h->key, h->bias, &crd);
63901+ h->bias = bias;
63902+
63903+ if (h->result == NS_NOT_FOUND) {
63904+ case -E_NO_NEIGHBOR:
63905+ h->result = CBK_COORD_FOUND;
63906+ if (!(h->flags & CBK_IN_CACHE))
63907+ cbk_cache_add(node);
63908+ default: /* some other error */
63909+ result = LOOKUP_DONE;
63910+ } else if (h->result == NS_FOUND) {
63911+ read_lock_dk(znode_get_tree(neighbor));
63912+ h->rd_key = *znode_get_ld_key(node);
63913+ leftmost_key_in_node(neighbor, &h->ld_key);
63914+ read_unlock_dk(znode_get_tree(neighbor));
63915+ h->flags |= CBK_DKSET;
63916+
63917+ h->block = *znode_get_block(neighbor);
63918+			/* clear coord->node so that cbk_level_lookup()
63919+			   does not overwrite the parent hint in neighbor.
63920+
63921+ Parent hint was set up by
63922+ reiser4_get_left_neighbor()
63923+ */
63924+ /* FIXME: why do we have to spinlock here? */
63925+ write_lock_tree(znode_get_tree(neighbor));
63926+ h->coord->node = NULL;
63927+ write_unlock_tree(znode_get_tree(neighbor));
63928+ result = LOOKUP_CONT;
63929+ } else {
63930+ result = LOOKUP_DONE;
63931+ }
63932+ if (neighbor != NULL)
63933+ zrelse(neighbor);
63934+ }
63935+ }
63936+ done_lh(&lh);
63937+ return result;
63938+}
63939+
63940+/* debugging aid: return symbolic name of search bias */
63941+static const char *bias_name(lookup_bias bias /* bias to get name of */ )
63942+{
63943+ if (bias == FIND_EXACT)
63944+ return "exact";
63945+ else if (bias == FIND_MAX_NOT_MORE_THAN)
63946+ return "left-slant";
63947+/* else if( bias == RIGHT_SLANT_BIAS ) */
63948+/* return "right-bias"; */
63949+ else {
63950+ static char buf[30];
63951+
63952+ sprintf(buf, "unknown: %i", bias);
63953+ return buf;
63954+ }
63955+}
63956+
63957+#if REISER4_DEBUG
63958+/* debugging aid: print human readable information about @p */
63959+void print_coord_content(const char *prefix /* prefix to print */ ,
63960+ coord_t * p /* coord to print */ )
63961+{
63962+ reiser4_key key;
63963+
63964+ if (p == NULL) {
63965+ printk("%s: null\n", prefix);
63966+ return;
63967+ }
63968+ if ((p->node != NULL) && znode_is_loaded(p->node)
63969+ && coord_is_existing_item(p))
63970+ printk("%s: data: %p, length: %i\n", prefix,
63971+ item_body_by_coord(p), item_length_by_coord(p));
63972+ if (znode_is_loaded(p->node)) {
63973+ item_key_by_coord(p, &key);
63974+ reiser4_print_key(prefix, &key);
63975+ }
63976+}
63977+
63978+/* debugging aid: print human readable information about @block */
63979+void reiser4_print_address(const char *prefix /* prefix to print */ ,
63980+ const reiser4_block_nr * block /* block number to print */ )
63981+{
63982+ printk("%s: %s\n", prefix, sprint_address(block));
63983+}
63984+#endif
63985+
63986+/* return string containing human readable representation of @block */
63987+char *sprint_address(const reiser4_block_nr *
63988+ block /* block number to print */ )
63989+{
63990+ static char address[30];
63991+
63992+ if (block == NULL)
63993+ sprintf(address, "null");
63994+ else if (reiser4_blocknr_is_fake(block))
63995+ sprintf(address, "%llx", (unsigned long long)(*block));
63996+ else
63997+ sprintf(address, "%llu", (unsigned long long)(*block));
63998+ return address;
63999+}
64000+
64001+/* release parent node during traversal */
64002+static void put_parent(cbk_handle * h /* search handle */ )
64003+{
64004+ assert("nikita-383", h != NULL);
64005+ if (h->parent_lh->node != NULL) {
64006+ longterm_unlock_znode(h->parent_lh);
64007+ }
64008+}
64009+
64010+/* helper function used by coord_by_key(): release the long-term locks on the
64011+   parent and active znodes stored in the handle. */
64012+static void hput(cbk_handle * h /* search handle */ )
64013+{
64014+ assert("nikita-385", h != NULL);
64015+ done_lh(h->parent_lh);
64016+ done_lh(h->active_lh);
64017+}
64018+
64019+/* Helper function used by cbk(): update the delimiting keys of the child node
64020+   (stored in h->active_lh->node) using keys taken from the parent level. */
64021+static int setup_delimiting_keys(cbk_handle * h /* search handle */ )
64022+{
64023+ znode *active;
64024+ reiser4_tree *tree;
64025+
64026+ assert("nikita-1088", h != NULL);
64027+
64028+ active = h->active_lh->node;
64029+
64030+ /* fast check without taking dk lock. This is safe, because
64031+ * JNODE_DKSET is never cleared once set. */
64032+ if (!ZF_ISSET(active, JNODE_DKSET)) {
64033+ tree = znode_get_tree(active);
64034+ write_lock_dk(tree);
64035+ if (!ZF_ISSET(active, JNODE_DKSET)) {
64036+ znode_set_ld_key(active, &h->ld_key);
64037+ znode_set_rd_key(active, &h->rd_key);
64038+ ZF_SET(active, JNODE_DKSET);
64039+ }
64040+ write_unlock_dk(tree);
64041+ return 1;
64042+ }
64043+ return 0;
64044+}
64045+
64046+/* true if @block makes sense for the @tree. Used to detect corrupted node
64047+ * pointers */
64048+static int
64049+block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
64050+ reiser4_tree * tree /* tree to check against */ )
64051+{
64052+ assert("nikita-757", block != NULL);
64053+ assert("nikita-758", tree != NULL);
64054+
64055+ /* check to see if it exceeds the size of the device. */
64056+ return reiser4_blocknr_is_sane_for(tree->super, block);
64057+}
64058+
64059+/* check consistency of fields */
64060+static int sanity_check(cbk_handle * h /* search handle */ )
64061+{
64062+ assert("nikita-384", h != NULL);
64063+
64064+ if (h->level < h->stop_level) {
64065+ h->error = "Buried under leaves";
64066+ h->result = RETERR(-EIO);
64067+ return LOOKUP_DONE;
64068+ } else if (!block_nr_is_correct(&h->block, h->tree)) {
64069+ h->error = "bad block number";
64070+ h->result = RETERR(-EIO);
64071+ return LOOKUP_DONE;
64072+ } else
64073+ return 0;
64074+}
64075+
64076+/* Make Linus happy.
64077+ Local variables:
64078+ c-indentation-style: "K&R"
64079+ mode-name: "LC"
64080+ c-basic-offset: 8
64081+ tab-width: 8
64082+ fill-column: 120
64083+ scroll-step: 1
64084+ End:
64085+*/
64086diff -urN linux-2.6.24.orig/fs/reiser4/status_flags.c linux-2.6.24/fs/reiser4/status_flags.c
64087--- linux-2.6.24.orig/fs/reiser4/status_flags.c 1970-01-01 03:00:00.000000000 +0300
64088+++ linux-2.6.24/fs/reiser4/status_flags.c 2008-01-25 11:54:46.665843146 +0300
64089@@ -0,0 +1,170 @@
64090+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64091+ * reiser4/README */
64092+
64093+/* Functions that deal with reiser4 status block, query status and update it, if needed */
64094+
64095+#include <linux/bio.h>
64096+#include <linux/highmem.h>
64097+#include <linux/fs.h>
64098+#include <linux/blkdev.h>
64099+#include "debug.h"
64100+#include "dformat.h"
64101+#include "status_flags.h"
64102+#include "super.h"
64103+
64104+/* This is our end-I/O handler; it marks the page up-to-date if the I/O was successful. It also
64105+   unconditionally unlocks the page, so we can see that the I/O is done.
64106+   We do not free the bio, because we hope to reuse it. */
64107+static void reiser4_status_endio(struct bio *bio, int err)
64108+{
64109+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
64110+ SetPageUptodate(bio->bi_io_vec->bv_page);
64111+ } else {
64112+ ClearPageUptodate(bio->bi_io_vec->bv_page);
64113+ SetPageError(bio->bi_io_vec->bv_page);
64114+ }
64115+ unlock_page(bio->bi_io_vec->bv_page);
64116+}
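+/*
+ * The page lock doubles as the I/O completion signal: callers below follow
+ * this pattern (a sketch of the calls actually used in this file):
+ *
+ *	lock_page(page);
+ *	submit_bio(READ, bio);		bi_end_io == reiser4_status_endio
+ *	wait_on_page_locked(page);	woken by unlock_page() in the handler
+ *	if (!PageUptodate(page))
+ *		handle the I/O error;
+ */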
64117+
64118+/* Initialise the status block code. This is expected to be called from the
64119+   disk format code. The block parameter is where the status block lives. */
64120+int reiser4_status_init(reiser4_block_nr block)
64121+{
64122+ struct super_block *sb = reiser4_get_current_sb();
64123+ struct reiser4_status *statuspage;
64124+ struct bio *bio;
64125+ struct page *page;
64126+
64127+ get_super_private(sb)->status_page = NULL;
64128+ get_super_private(sb)->status_bio = NULL;
64129+
64130+ page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
64131+ if (!page)
64132+ return -ENOMEM;
64133+
64134+ bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
64135+ if (bio != NULL) {
64136+ bio->bi_sector = block * (sb->s_blocksize >> 9);
64137+ bio->bi_bdev = sb->s_bdev;
64138+ bio->bi_io_vec[0].bv_page = page;
64139+ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
64140+ bio->bi_io_vec[0].bv_offset = 0;
64141+ bio->bi_vcnt = 1;
64142+ bio->bi_size = sb->s_blocksize;
64143+ bio->bi_end_io = reiser4_status_endio;
64144+ } else {
64145+ __free_pages(page, 0);
64146+ return -ENOMEM;
64147+ }
64148+ lock_page(page);
64149+ submit_bio(READ, bio);
64150+ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
64151+ wait_on_page_locked(page);
64152+ if (!PageUptodate(page)) {
64153+ warning("green-2007",
64154+			"I/O error while trying to read status page\n");
+		__free_pages(page, 0);	/* free the page to avoid leaking it on this error path */
+		bio_put(bio);		/* likewise the bio allocated above */
64155+		return -EIO;
64156+ }
64157+
64158+ statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
64159+ if (memcmp
64160+ (statuspage->magic, REISER4_STATUS_MAGIC,
64161+ sizeof(REISER4_STATUS_MAGIC))) {
64162+ /* Magic does not match. */
64163+ kunmap_atomic((char *)statuspage, KM_USER0);
64164+ warning("green-2008", "Wrong magic in status block\n");
64165+ __free_pages(page, 0);
64166+ bio_put(bio);
64167+ return -EINVAL;
64168+ }
64169+ kunmap_atomic((char *)statuspage, KM_USER0);
64170+
64171+ get_super_private(sb)->status_page = page;
64172+ get_super_private(sb)->status_bio = bio;
64173+ return 0;
64174+}
64175+
64176+/* Query the status of the fs. Returns whether the FS can be safely mounted.
64177+   Also, if the "status" and "extended" parameters are given, it will store
64178+   the corresponding parts of the on-disk status there. */
64179+int reiser4_status_query(u64 * status, u64 * extended)
64180+{
64181+ struct super_block *sb = reiser4_get_current_sb();
64182+ struct reiser4_status *statuspage;
64183+ int retval;
64184+
64185+ if (!get_super_private(sb)->status_page) { // No status page?
64186+ return REISER4_STATUS_MOUNT_UNKNOWN;
64187+ }
64188+ statuspage = (struct reiser4_status *)
64189+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
64190+ switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { // FIXME: this cast is a hack for 32 bit arches to work.
64191+ case REISER4_STATUS_OK:
64192+ retval = REISER4_STATUS_MOUNT_OK;
64193+ break;
64194+ case REISER4_STATUS_CORRUPTED:
64195+ retval = REISER4_STATUS_MOUNT_WARN;
64196+ break;
64197+ case REISER4_STATUS_DAMAGED:
64198+ case REISER4_STATUS_DESTROYED:
64199+ case REISER4_STATUS_IOERROR:
64200+ retval = REISER4_STATUS_MOUNT_RO;
64201+ break;
64202+ default:
64203+ retval = REISER4_STATUS_MOUNT_UNKNOWN;
64204+ break;
64205+ }
64206+
64207+ if (status)
64208+ *status = le64_to_cpu(get_unaligned(&statuspage->status));
64209+ if (extended)
64210+ *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
64211+
64212+ kunmap_atomic((char *)statuspage, KM_USER0);
64213+ return retval;
64214+}
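+/*
+ * For illustration, a mount-time caller could act on the answer roughly like
+ * this (a sketch, not the actual disk-format code):
+ *
+ *	u64 status;
+ *
+ *	switch (reiser4_status_query(&status, NULL)) {
+ *	case REISER4_STATUS_MOUNT_OK:
+ *		break;				mount normally
+ *	case REISER4_STATUS_MOUNT_WARN:
+ *		warn and continue;
+ *	case REISER4_STATUS_MOUNT_RO:
+ *		force a read-only mount;
+ *	default:				REISER4_STATUS_MOUNT_UNKNOWN
+ *		proceed with caution;
+ *	}
+ */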
64215+
64216+/* This function should be called when something bad happens (e.g. from reiser4_panic).
64217+ It fills the status structure and tries to push it to disk. */
64218+int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
64219+{
64220+ struct super_block *sb = reiser4_get_current_sb();
64221+ struct reiser4_status *statuspage;
64222+ struct bio *bio = get_super_private(sb)->status_bio;
64223+
64224+ if (!get_super_private(sb)->status_page) { // No status page?
64225+ return -1;
64226+ }
64227+ statuspage = (struct reiser4_status *)
64228+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
64229+
64230+ put_unaligned(cpu_to_le64(status), &statuspage->status);
64231+ put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
64232+ strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
64233+
64234+ kunmap_atomic((char *)statuspage, KM_USER0);
64235+ bio->bi_bdev = sb->s_bdev;
64236+ bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
64237+ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
64238+ bio->bi_io_vec[0].bv_offset = 0;
64239+ bio->bi_vcnt = 1;
64240+ bio->bi_size = sb->s_blocksize;
64241+ bio->bi_end_io = reiser4_status_endio;
64242+ lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page.
64243+ /* We can block now, but we have no other choice anyway */
64244+ submit_bio(WRITE, bio);
64245+ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
64246+ return 0; // We do not wait for io to finish.
64247+}
64248+
64249+/* Frees the status page and the bio structure. Should be called by the disk format code at umount time */
64250+int reiser4_status_finish(void)
64251+{
64252+ struct super_block *sb = reiser4_get_current_sb();
64253+
64254+ __free_pages(get_super_private(sb)->status_page, 0);
64255+ get_super_private(sb)->status_page = NULL;
64256+ bio_put(get_super_private(sb)->status_bio);
64257+ get_super_private(sb)->status_bio = NULL;
64258+ return 0;
64259+}
64260diff -urN linux-2.6.24.orig/fs/reiser4/status_flags.h linux-2.6.24/fs/reiser4/status_flags.h
64261--- linux-2.6.24.orig/fs/reiser4/status_flags.h 1970-01-01 03:00:00.000000000 +0300
64262+++ linux-2.6.24/fs/reiser4/status_flags.h 2008-01-25 11:39:07.088246844 +0300
64263@@ -0,0 +1,43 @@
64264+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64265+ * reiser4/README */
64266+
64267+/* Here we declare structures and flags that store the reiser4 status on disk.
64268+   The status helps us to find out whether the filesystem is valid or whether
64269+   it contains some critical, or not so critical, errors */
64270+
64271+#if !defined( __REISER4_STATUS_FLAGS_H__ )
64272+#define __REISER4_STATUS_FLAGS_H__
64273+
64274+#include "dformat.h"
64275+/* These are major status flags */
64276+#define REISER4_STATUS_OK 0
64277+#define REISER4_STATUS_CORRUPTED 0x1
64278+#define REISER4_STATUS_DAMAGED 0x2
64279+#define REISER4_STATUS_DESTROYED 0x4
64280+#define REISER4_STATUS_IOERROR 0x8
64281+
64282+/* Return values for reiser4_status_query() */
64283+#define REISER4_STATUS_MOUNT_OK 0
64284+#define REISER4_STATUS_MOUNT_WARN 1
64285+#define REISER4_STATUS_MOUNT_RO 2
64286+#define REISER4_STATUS_MOUNT_UNKNOWN -1
64287+
64288+#define REISER4_TEXTERROR_LEN 256
64289+
64290+#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
64291+/* We probably need to keep its size under the sector size, which is 512 bytes */
64292+struct reiser4_status {
64293+ char magic[16];
64294+ d64 status; /* Current FS state */
64295+	d64 extended_status;	/* Any additional info that might make sense in addition to "status", e.g.
64296+				   the last sector where an I/O error happened, if status is "io error encountered" */
64297+	d64 stacktrace[10];	/* Last ten function calls made (addresses) */
64298+ char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */
64299+};
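+/*
+ * Size check for the fixed part of the structure above:
+ * 16 (magic) + 8 (status) + 8 (extended_status) + 80 (stacktrace) +
+ * 256 (texterror) = 368 bytes, comfortably under the 512-byte sector
+ * mentioned above.
+ */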
64300+
64301+int reiser4_status_init(reiser4_block_nr block);
64302+int reiser4_status_query(u64 * status, u64 * extended);
64303+int reiser4_status_write(u64 status, u64 extended_status, char *message);
64304+int reiser4_status_finish(void);
64305+
64306+#endif
64307diff -urN linux-2.6.24.orig/fs/reiser4/super.c linux-2.6.24/fs/reiser4/super.c
64308--- linux-2.6.24.orig/fs/reiser4/super.c 1970-01-01 03:00:00.000000000 +0300
64309+++ linux-2.6.24/fs/reiser4/super.c 2008-01-25 11:39:07.088246844 +0300
64310@@ -0,0 +1,316 @@
64311+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
64312+ * reiser4/README */
64313+
64314+/* Super-block manipulations. */
64315+
64316+#include "debug.h"
64317+#include "dformat.h"
64318+#include "key.h"
64319+#include "plugin/security/perm.h"
64320+#include "plugin/space/space_allocator.h"
64321+#include "plugin/plugin.h"
64322+#include "tree.h"
64323+#include "vfs_ops.h"
64324+#include "super.h"
64325+#include "reiser4.h"
64326+
64327+#include <linux/types.h> /* for __u?? */
64328+#include <linux/fs.h> /* for struct super_block */
64329+
64330+static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
64331+static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
64332+static __u64 reserved_for_root(const struct super_block *super);
64333+
64334+/* Return reiser4-specific part of super block */
64335+reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super /* super block
64336+ * queried */ )
64337+{
64338+ return (reiser4_super_info_data *) super->s_fs_info;
64339+}
64340+
64341+/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */
64342+long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
64343+{
64344+ assert("nikita-448", super != NULL);
64345+ assert("nikita-449", is_reiser4_super(super));
64346+ return (long)REISER4_SUPER_MAGIC;
64347+}
64348+
64349+/* functions to read/modify fields of reiser4_super_info_data */
64350+
64351+/* get number of blocks in file system */
64352+__u64 reiser4_block_count(const struct super_block *super /* super block
64353+ queried */ )
64354+{
64355+ assert("vs-494", super != NULL);
64356+ assert("vs-495", is_reiser4_super(super));
64357+ return get_super_private(super)->block_count;
64358+}
64359+
64360+#if REISER4_DEBUG
64361+/*
64362+ * number of blocks in the current file system
64363+ */
64364+__u64 reiser4_current_block_count(void)
64365+{
64366+ return get_current_super_private()->block_count;
64367+}
64368+#endif /* REISER4_DEBUG */
64369+
64370+/* set the number of blocks in the filesystem */
64371+void reiser4_set_block_count(const struct super_block *super, __u64 nr)
64372+{
64373+ assert("vs-501", super != NULL);
64374+ assert("vs-502", is_reiser4_super(super));
64375+ get_super_private(super)->block_count = nr;
64376+ /*
64377+	 * For the proper calculation of the reserved space counter (5% of the
64378+	 * device block count) we would need a 64-bit division, which is missing
64379+	 * in Linux on the i386 platform. Because we do not need a precise
64380+	 * calculation here, we can replace the div64 operation by this
64381+	 * combination of multiplication and shift: 51 / 2^10 == 0.0498.
64382+	 * FIXME: this is a bug. It comes up only for very small filesystems,
64383+	 * which are probably never used. Nevertheless, it is a bug. The number
64384+	 * of reserved blocks must not be less than the maximal number of blocks
64385+	 * which can get grabbed with BA_RESERVED.
64386+ */
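+	/*
+	 * E.g. for nr == 1048576 (a 4 GiB filesystem with 4 KiB blocks):
+	 * (1048576 * 51) >> 10 == 52224 reserved blocks, i.e. ~4.98%.
+	 */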
64387+ get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
64388+}
64389+
64390+/* amount of blocks used (allocated for data) in file system */
64391+__u64 reiser4_data_blocks(const struct super_block *super /* super block
64392+ queried */ )
64393+{
64394+ assert("nikita-452", super != NULL);
64395+ assert("nikita-453", is_reiser4_super(super));
64396+ return get_super_private(super)->blocks_used;
64397+}
64398+
64399+/* set number of block used in filesystem */
64400+void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
64401+{
64402+ assert("vs-503", super != NULL);
64403+ assert("vs-504", is_reiser4_super(super));
64404+ get_super_private(super)->blocks_used = nr;
64405+}
64406+
64407+/* amount of free blocks in file system */
64408+__u64 reiser4_free_blocks(const struct super_block *super /* super block
64409+ queried */ )
64410+{
64411+ assert("nikita-454", super != NULL);
64412+ assert("nikita-455", is_reiser4_super(super));
64413+ return get_super_private(super)->blocks_free;
64414+}
64415+
64416+/* set number of blocks free in filesystem */
64417+void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
64418+{
64419+ assert("vs-505", super != NULL);
64420+ assert("vs-506", is_reiser4_super(super));
64421+ get_super_private(super)->blocks_free = nr;
64422+}
64423+
64424+/* get mkfs unique identifier */
64425+__u32 reiser4_mkfs_id(const struct super_block *super /* super block
64426+ queried */ )
64427+{
64428+ assert("vpf-221", super != NULL);
64429+ assert("vpf-222", is_reiser4_super(super));
64430+ return get_super_private(super)->mkfs_id;
64431+}
64432+
64433+/* committed version of the free blocks counter */
64434+__u64 reiser4_free_committed_blocks(const struct super_block *super)
64435+{
64436+ assert("vs-497", super != NULL);
64437+ assert("vs-498", is_reiser4_super(super));
64438+ return get_super_private(super)->blocks_free_committed;
64439+}
64440+
64441+/* amount of blocks in the file system reserved for @uid and @gid */
64442+long reiser4_reserved_blocks(const struct super_block *super /* super block
64443+ queried */ ,
64444+ uid_t uid /* user id */ ,
64445+ gid_t gid /* group id */ )
64446+{
64447+ long reserved;
64448+
64449+ assert("nikita-456", super != NULL);
64450+ assert("nikita-457", is_reiser4_super(super));
64451+
64452+ reserved = 0;
64453+ if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
64454+ reserved += reserved_for_gid(super, gid);
64455+ if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
64456+ reserved += reserved_for_uid(super, uid);
64457+ if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
64458+ reserved += reserved_for_root(super);
64459+ return reserved;
64460+}
64461+
64462+/* get/set value of/to grabbed blocks counter */
64463+__u64 reiser4_grabbed_blocks(const struct super_block * super)
64464+{
64465+ assert("zam-512", super != NULL);
64466+ assert("zam-513", is_reiser4_super(super));
64467+
64468+ return get_super_private(super)->blocks_grabbed;
64469+}
64470+
64471+__u64 reiser4_flush_reserved(const struct super_block * super)
64472+{
64473+ assert("vpf-285", super != NULL);
64474+ assert("vpf-286", is_reiser4_super(super));
64475+
64476+ return get_super_private(super)->blocks_flush_reserved;
64477+}
64478+
64479+/* get/set value of/to counter of fake allocated formatted blocks */
64480+__u64 reiser4_fake_allocated(const struct super_block * super)
64481+{
64482+ assert("zam-516", super != NULL);
64483+ assert("zam-517", is_reiser4_super(super));
64484+
64485+ return get_super_private(super)->blocks_fake_allocated;
64486+}
64487+
64488+/* get/set value of/to counter of fake allocated unformatted blocks */
64489+__u64 reiser4_fake_allocated_unformatted(const struct super_block * super)
64490+{
64491+ assert("zam-516", super != NULL);
64492+ assert("zam-517", is_reiser4_super(super));
64493+
64494+ return get_super_private(super)->blocks_fake_allocated_unformatted;
64495+}
64496+
64497+/* get/set value of/to counter of clustered blocks */
64498+__u64 reiser4_clustered_blocks(const struct super_block * super)
64499+{
64500+ assert("edward-601", super != NULL);
64501+ assert("edward-602", is_reiser4_super(super));
64502+
64503+ return get_super_private(super)->blocks_clustered;
64504+}
64505+
64506+/* space allocator used by this file system */
64507+reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block
64508+ *super)
64509+{
64510+ assert("nikita-1965", super != NULL);
64511+ assert("nikita-1966", is_reiser4_super(super));
64512+ return &get_super_private(super)->space_allocator;
64513+}
64514+
64515+/* return fake inode used to bind formatted nodes in the page cache */
64516+struct inode *reiser4_get_super_fake(const struct super_block *super /* super block
64517+ queried */ )
64518+{
64519+ assert("nikita-1757", super != NULL);
64520+ return get_super_private(super)->fake;
64521+}
64522+
64523+/* return fake inode used to bind copied on capture nodes in the page cache */
64524+struct inode *reiser4_get_cc_fake(const struct super_block *super /* super block
64525+ queried */ )
64526+{
64527+ assert("nikita-1757", super != NULL);
64528+ return get_super_private(super)->cc;
64529+}
64530+
64531+/* return fake inode used to bind bitmaps and journal heads */
64532+struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
64533+{
64534+ assert("nikita-17571", super != NULL);
64535+ return get_super_private(super)->bitmap;
64536+}
64537+
64538+/* tree used by this file system */
64539+reiser4_tree *reiser4_get_tree(const struct super_block * super /* super block
64540+ * queried */ )
64541+{
64542+ assert("nikita-460", super != NULL);
64543+ assert("nikita-461", is_reiser4_super(super));
64544+ return &get_super_private(super)->tree;
64545+}
64546+
64547+/* Check that @super is (looks like) reiser4 super block. This is mainly for
64548+ use in assertions. */
64549+int is_reiser4_super(const struct super_block *super /* super block
64550+ * queried */ )
64551+{
64552+ return
64553+ super != NULL &&
64554+ get_super_private(super) != NULL &&
64555+ super->s_op == &(get_super_private(super)->ops.super);
64556+}
64557+
64558+int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
64559+{
64560+ return test_bit((int)f, &get_super_private(super)->fs_flags);
64561+}
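+/*
+ * Typical use of the accessor above is a one-line feature test, e.g.
+ * (sketch only):
+ *
+ *	if (reiser4_is_set(super, REISER4_32_BIT_TIMES))
+ *		clamp timestamps to 32 bits;
+ */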
64562+
64563+/* amount of blocks reserved for given group in file system */
64564+static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG /* super
64565+ * block
64566+ * queried */ ,
64567+ gid_t gid UNUSED_ARG /* group id */ )
64568+{
64569+ return 0;
64570+}
64571+
64572+/* amount of blocks reserved for given user in file system */
64573+static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG /* super
64574+ block
64575+ queried */ ,
64576+ uid_t uid UNUSED_ARG /* user id */ )
64577+{
64578+ return 0;
64579+}
64580+
64581+/* amount of blocks reserved for super user in file system */
64582+static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG /* super
64583+ block
64584+ queried */ )
64585+{
64586+ return 0;
64587+}
64588+
64589+/*
64590+ * true if block number @blk makes sense for the file system at @super.
64591+ */
64592+int
64593+reiser4_blocknr_is_sane_for(const struct super_block *super,
64594+ const reiser4_block_nr * blk)
64595+{
64596+ reiser4_super_info_data *sbinfo;
64597+
64598+ assert("nikita-2957", super != NULL);
64599+ assert("nikita-2958", blk != NULL);
64600+
64601+ if (reiser4_blocknr_is_fake(blk))
64602+ return 1;
64603+
64604+ sbinfo = get_super_private(super);
64605+ return *blk < sbinfo->block_count;
64606+}
64607+
64608+#if REISER4_DEBUG
64609+/*
64610+ * true, if block number @blk makes sense for the current file system
64611+ */
64612+int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
64613+{
64614+ return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
64615+}
64616+#endif /* REISER4_DEBUG */
64617+
64618+/* Make Linus happy.
64619+ Local variables:
64620+ c-indentation-style: "K&R"
64621+ mode-name: "LC"
64622+ c-basic-offset: 8
64623+ tab-width: 8
64624+ fill-column: 120
64625+ End:
64626+*/
64627diff -urN linux-2.6.24.orig/fs/reiser4/super.h linux-2.6.24/fs/reiser4/super.h
64628--- linux-2.6.24.orig/fs/reiser4/super.h 1970-01-01 03:00:00.000000000 +0300
64629+++ linux-2.6.24/fs/reiser4/super.h 2008-01-25 11:39:07.088246844 +0300
64630@@ -0,0 +1,466 @@
64631+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
64632+ * reiser4/README */
64633+
64634+/* Super-block functions. See super.c for details. */
64635+
64636+#if !defined( __REISER4_SUPER_H__ )
64637+#define __REISER4_SUPER_H__
64638+
64639+#include <linux/exportfs.h>
64640+
64641+#include "tree.h"
64642+#include "entd.h"
64643+#include "wander.h"
64644+#include "fsdata.h"
64645+#include "plugin/object.h"
64646+#include "plugin/space/space_allocator.h"
64647+
64648+/*
64649+ * Flush algorithms parameters.
64650+ */
64651+struct flush_params {
64652+ unsigned relocate_threshold;
64653+ unsigned relocate_distance;
64654+ unsigned written_threshold;
64655+ unsigned scan_maxnodes;
64656+};
64657+
64658+typedef enum {
64659+ /*
64660+ * True if this file system doesn't support hard-links (multiple names)
64661+	 * for directories: this is the default UNIX behavior.
64662+	 *
64663+	 * If hard-links on directories are not allowed, the file system is an
64664+	 * acyclic directed graph (modulo dot, and dotdot, of course).
64665+ *
64666+ * This is used by reiser4_link().
64667+ */
64668+ REISER4_ADG = 0,
64669+ /*
64670+	 * set if all nodes in the internal tree have the same node layout
64671+	 * plugin. If so, znode_guess_plugin() will return tree->node_plugin
64672+	 * instead of guessing the plugin by the plugin id stored in the node.
64673+ */
64674+ REISER4_ONE_NODE_PLUGIN = 1,
64675+ /* if set, bsd gid assignment is supported. */
64676+ REISER4_BSD_GID = 2,
64677+ /* [mac]_time are 32 bit in inode */
64678+ REISER4_32_BIT_TIMES = 3,
64679+	/* if set, don't load all bitmap blocks at mount time */
64680+ REISER4_DONT_LOAD_BITMAP = 5,
64681+ /* enforce atomicity during write(2) */
64682+ REISER4_ATOMIC_WRITE = 6,
64683+ /* don't use write barriers in the log writer code. */
64684+ REISER4_NO_WRITE_BARRIER = 7
64685+} reiser4_fs_flag;
64686+
64687+/*
64688+ * VFS related operation vectors.
64689+ */
64690+struct object_ops {
64691+ struct super_operations super;
64692+ struct dentry_operations dentry;
64693+ struct export_operations export;
64694+};
64695+
64696+/* reiser4-specific part of super block
64697+
64698+ Locking
64699+
64700+ Fields immutable after mount:
64701+
64702+ ->oid*
64703+ ->space*
64704+ ->default_[ug]id
64705+ ->mkfs_id
64706+ ->trace_flags
64707+ ->debug_flags
64708+ ->fs_flags
64709+ ->df_plug
64710+ ->optimal_io_size
64711+ ->plug
64712+ ->flush
64713+ ->u (bad name)
64714+ ->txnmgr
64715+ ->ra_params
64716+ ->fsuid
64717+ ->journal_header
64718+ ->journal_footer
64719+
64720+ Fields protected by ->lnode_guard
64721+
64722+ ->lnode_htable
64723+
64724+ Fields protected by per-super block spin lock
64725+
64726+ ->block_count
64727+ ->blocks_used
64728+ ->blocks_free
64729+ ->blocks_free_committed
64730+ ->blocks_grabbed
64731+ ->blocks_fake_allocated_unformatted
64732+ ->blocks_fake_allocated
64733+ ->blocks_flush_reserved
64734+ ->eflushed
64735+ ->blocknr_hint_default
64736+
64737+ After journal replaying during mount,
64738+
64739+ ->last_committed_tx
64740+
64741+ is protected by ->tmgr.commit_mutex
64742+
64743+ Invariants involving this data-type:
64744+
64745+ [sb-block-counts]
64746+ [sb-grabbed]
64747+ [sb-fake-allocated]
64748+*/
64749+struct reiser4_super_info_data {
64750+ /*
64751+ * guard spinlock which protects reiser4 super block fields (currently
64752+ * blocks_free, blocks_free_committed)
64753+ */
64754+ spinlock_t guard;
64755+
64756+ /* next oid that will be returned by oid_allocate() */
64757+ oid_t next_to_use;
64758+ /* total number of used oids */
64759+ oid_t oids_in_use;
64760+
64761+ /* space manager plugin */
64762+ reiser4_space_allocator space_allocator;
64763+
64764+ /* reiser4 internal tree */
64765+ reiser4_tree tree;
64766+
64767+ /*
64768+ * default user id used for light-weight files without their own
64769+ * stat-data.
64770+ */
64771+ uid_t default_uid;
64772+
64773+ /*
64774+ * default group id used for light-weight files without their own
64775+ * stat-data.
64776+ */
64777+ gid_t default_gid;
64778+
64779+ /* mkfs identifier generated at mkfs time. */
64780+ __u32 mkfs_id;
64781+ /* amount of blocks in a file system */
64782+ __u64 block_count;
64783+
64784+ /* inviolable reserve */
64785+ __u64 blocks_reserved;
64786+
64787+ /* amount of blocks used by file system data and meta-data. */
64788+ __u64 blocks_used;
64789+
64790+ /*
64791+ * amount of free blocks. This is "working" free blocks counter. It is
64792+ * like "working" bitmap, please see block_alloc.c for description.
64793+ */
64794+ __u64 blocks_free;
64795+
64796+ /*
64797+ * free block count for fs committed state. This is "commit" version of
64798+ * free block counter.
64799+ */
64800+ __u64 blocks_free_committed;
64801+
64802+ /*
64803+ * number of blocks reserved for further allocation, for all
64804+ * threads.
64805+ */
64806+ __u64 blocks_grabbed;
64807+
64808+ /* number of fake allocated unformatted blocks in tree. */
64809+ __u64 blocks_fake_allocated_unformatted;
64810+
64811+ /* number of fake allocated formatted blocks in tree. */
64812+ __u64 blocks_fake_allocated;
64813+
64814+ /* number of blocks reserved for flush operations. */
64815+ __u64 blocks_flush_reserved;
64816+
64817+ /* number of blocks reserved for cluster operations. */
64818+ __u64 blocks_clustered;
64819+
64820+ /* unique file-system identifier */
64821+ __u32 fsuid;
64822+
64823+	/* On-disk format version. If it does not equal the disk_format
64824+	   plugin version, some format updates (e.g. enlarging the plugin
64825+	   set, etc.) may take place on mount. */
64826+ int version;
64827+
64828+ /* file-system wide flags. See reiser4_fs_flag enum */
64829+ unsigned long fs_flags;
64830+
64831+ /* transaction manager */
64832+ txn_mgr tmgr;
64833+
64834+ /* ent thread */
64835+ entd_context entd;
64836+
64837+ /* fake inode used to bind formatted nodes */
64838+ struct inode *fake;
64839+ /* inode used to bind bitmaps (and journal heads) */
64840+ struct inode *bitmap;
64841+ /* inode used to bind copied on capture nodes */
64842+ struct inode *cc;
64843+
64844+ /* disk layout plugin */
64845+ disk_format_plugin *df_plug;
64846+
64847+ /* disk layout specific part of reiser4 super info data */
64848+ union {
64849+ format40_super_info format40;
64850+ } u;
64851+
64852+ /* value we return in st_blksize on stat(2) */
64853+ unsigned long optimal_io_size;
64854+
64855+ /* parameters for the flush algorithm */
64856+ struct flush_params flush;
64857+
64858+ /* pointers to jnodes for journal header and footer */
64859+ jnode *journal_header;
64860+ jnode *journal_footer;
64861+
64862+ journal_location jloc;
64863+
64864+ /* head block number of last committed transaction */
64865+ __u64 last_committed_tx;
64866+
64867+ /*
64868+ * we remember last written location for using as a hint for new block
64869+ * allocation
64870+ */
64871+ __u64 blocknr_hint_default;
64872+
64873+ /* committed number of files (oid allocator state variable ) */
64874+ __u64 nr_files_committed;
64875+
64876+ struct formatted_ra_params ra_params;
64877+
64878+ /*
 64879+	 * A mutex for serializing cut-tree operations when the file system is
 64880+	 * out of free space: only one cut_tree thread is allowed to grab space
 64881+	 * from the reserved area (5% of disk space)
64882+ */
64883+ struct mutex delete_mutex;
64884+ /* task owning ->delete_mutex */
64885+ struct task_struct *delete_mutex_owner;
64886+
64887+ /* Diskmap's blocknumber */
64888+ __u64 diskmap_block;
64889+
64890+ /* What to do in case of error */
64891+ int onerror;
64892+
64893+ /* operations for objects on this file system */
64894+ struct object_ops ops;
64895+
64896+ /*
64897+ * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
64898+ * more details
64899+ */
64900+ struct d_cursor_info d_info;
64901+
64902+#ifdef CONFIG_REISER4_BADBLOCKS
64903+ /* Alternative master superblock offset (in bytes) */
64904+ unsigned long altsuper;
64905+#endif
64906+ struct repacker *repacker;
64907+ struct page *status_page;
64908+ struct bio *status_bio;
64909+
64910+#if REISER4_DEBUG
64911+ /*
64912+ * minimum used blocks value (includes super blocks, bitmap blocks and
64913+ * other fs reserved areas), depends on fs format and fs size.
64914+ */
64915+ __u64 min_blocks_used;
64916+
64917+ /*
64918+ * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
64919+ * are kept on a list anchored at sbinfo->all_jnodes. This list is
64920+ * protected by sbinfo->all_guard spin lock. This lock should be taken
64921+ * with _irq modifier, because it is also modified from interrupt
64922+ * contexts (by RCU).
64923+ */
64924+ spinlock_t all_guard;
64925+ /* list of all jnodes */
64926+ struct list_head all_jnodes;
64927+#endif
64928+ struct dentry *debugfs_root;
64929+};
64930+
64931+extern reiser4_super_info_data *get_super_private_nocheck(const struct
64932+ super_block *super);
64933+
64934+/* Return reiser4-specific part of super block */
64935+static inline reiser4_super_info_data *get_super_private(const struct
64936+ super_block *super)
64937+{
64938+ assert("nikita-447", super != NULL);
64939+
64940+ return (reiser4_super_info_data *) super->s_fs_info;
64941+}
64942+
64943+/* get ent context for the @super */
64944+static inline entd_context *get_entd_context(struct super_block *super)
64945+{
64946+ return &get_super_private(super)->entd;
64947+}
64948+
64949+/* "Current" super-block: main super block used during current system
64950+ call. Reference to this super block is stored in reiser4_context. */
64951+static inline struct super_block *reiser4_get_current_sb(void)
64952+{
64953+ return get_current_context()->super;
64954+}
64955+
64956+/* Reiser4-specific part of "current" super-block: main super block used
64957+ during current system call. Reference to this super block is stored in
64958+ reiser4_context. */
64959+static inline reiser4_super_info_data *get_current_super_private(void)
64960+{
64961+ return get_super_private(reiser4_get_current_sb());
64962+}
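As a usage sketch (not part of the patch; sample_report_free() is a hypothetical helper), the accessors above compose like this:

static __u64 sample_report_free(void)
{
	/* super block of the file system the current system call works on */
	struct super_block *super = reiser4_get_current_sb();

	/* reiser4_free_blocks(), declared below in this header, reads the
	 * ->blocks_free counter of the reiser4-private super block info */
	return reiser4_free_blocks(super);
}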
64963+
64964+static inline struct formatted_ra_params *get_current_super_ra_params(void)
64965+{
64966+ return &(get_current_super_private()->ra_params);
64967+}
64968+
64969+/*
64970+ * true, if file system on @super is read-only
64971+ */
64972+static inline int rofs_super(struct super_block *super)
64973+{
64974+ return super->s_flags & MS_RDONLY;
64975+}
64976+
64977+/*
64978+ * true, if @tree represents read-only file system
64979+ */
64980+static inline int rofs_tree(reiser4_tree * tree)
64981+{
64982+ return rofs_super(tree->super);
64983+}
64984+
64985+/*
64986+ * true, if file system where @inode lives on, is read-only
64987+ */
64988+static inline int rofs_inode(struct inode *inode)
64989+{
64990+ return rofs_super(inode->i_sb);
64991+}
64992+
64993+/*
64994+ * true, if file system where @node lives on, is read-only
64995+ */
64996+static inline int rofs_jnode(jnode * node)
64997+{
64998+ return rofs_tree(jnode_get_tree(node));
64999+}
65000+
65001+extern __u64 reiser4_current_block_count(void);
65002+
65003+extern void build_object_ops(struct super_block *super, struct object_ops * ops);
65004+
65005+#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
65006+
65007+static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
65008+{
65009+ spin_lock(&(sbinfo->guard));
65010+}
65011+
65012+static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
65013+{
65014+ assert_spin_locked(&(sbinfo->guard));
65015+ spin_unlock(&(sbinfo->guard));
65016+}
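A minimal sketch of how these wrappers bracket counter updates, per the locking notes above (sample_grab_blocks() is hypothetical, not part of the patch):

static inline void sample_grab_blocks(reiser4_super_info_data *sbinfo,
				      __u64 nr)
{
	spin_lock_reiser4_super(sbinfo);
	/* both counters are protected by the per-super spin lock */
	BUG_ON(sbinfo->blocks_free < nr);
	sbinfo->blocks_free -= nr;
	sbinfo->blocks_grabbed += nr;
	spin_unlock_reiser4_super(sbinfo);
}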
65017+
65018+extern __u64 reiser4_flush_reserved(const struct super_block *);
65019+extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
65020+extern long reiser4_statfs_type(const struct super_block *super);
65021+extern __u64 reiser4_block_count(const struct super_block *super);
65022+extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
65023+extern __u64 reiser4_data_blocks(const struct super_block *super);
65024+extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
65025+extern __u64 reiser4_free_blocks(const struct super_block *super);
65026+extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
65027+extern __u32 reiser4_mkfs_id(const struct super_block *super);
65028+
65029+extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
65030+
65031+extern __u64 reiser4_grabbed_blocks(const struct super_block *);
65032+extern __u64 reiser4_fake_allocated(const struct super_block *);
65033+extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
65034+extern __u64 reiser4_clustered_blocks(const struct super_block *);
65035+
65036+extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
65037+ gid_t gid);
65038+
65039+extern reiser4_space_allocator *
65040+reiser4_get_space_allocator(const struct super_block *super);
65041+extern reiser4_oid_allocator *
65042+reiser4_get_oid_allocator(const struct super_block *super);
65043+extern struct inode *reiser4_get_super_fake(const struct super_block *super);
65044+extern struct inode *reiser4_get_cc_fake(const struct super_block *super);
65045+extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super);
65046+extern reiser4_tree *reiser4_get_tree(const struct super_block *super);
65047+extern int is_reiser4_super(const struct super_block *super);
65048+
65049+extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
65050+extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
65051+ const reiser4_block_nr * blk);
65052+extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
65053+extern int reiser4_done_super(struct super_block *s);
65054+
65055+/* step of fill super */
65056+extern int reiser4_init_fs_info(struct super_block *);
65057+extern void reiser4_done_fs_info(struct super_block *);
65058+extern int reiser4_init_super_data(struct super_block *, char *opt_string);
65059+extern int reiser4_init_read_super(struct super_block *, int silent);
65060+extern int reiser4_init_root_inode(struct super_block *);
65061+extern reiser4_plugin *get_default_plugin(pset_member memb);
65062+
65063+/* Maximal possible object id. */
65064+#define ABSOLUTE_MAX_OID ((oid_t)~0)
65065+
65066+#define OIDS_RESERVED ( 1 << 16 )
65067+int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
65068+oid_t oid_allocate(struct super_block *);
65069+int oid_release(struct super_block *, oid_t);
65070+oid_t oid_next(const struct super_block *);
65071+void oid_count_allocated(void);
65072+void oid_count_released(void);
65073+long oids_used(const struct super_block *);
65074+
65075+#if REISER4_DEBUG
65076+void print_fs_info(const char *prefix, const struct super_block *);
65077+#endif
65078+
65079+extern void destroy_reiser4_cache(struct kmem_cache **);
65080+
65081+extern struct super_operations reiser4_super_operations;
65082+extern struct export_operations reiser4_export_operations;
65083+extern struct dentry_operations reiser4_dentry_operations;
65084+
65085+/* __REISER4_SUPER_H__ */
65086+#endif
65087+
65088+/*
65089+ * Local variables:
65090+ * c-indentation-style: "K&R"
65091+ * mode-name: "LC"
65092+ * c-basic-offset: 8
65093+ * tab-width: 8
65094+ * fill-column: 120
65095+ * End:
65096+ */
65097diff -urN linux-2.6.24.orig/fs/reiser4/super_ops.c linux-2.6.24/fs/reiser4/super_ops.c
65098--- linux-2.6.24.orig/fs/reiser4/super_ops.c 1970-01-01 03:00:00.000000000 +0300
65099+++ linux-2.6.24/fs/reiser4/super_ops.c 2008-01-25 12:23:33.922660872 +0300
65100@@ -0,0 +1,724 @@
65101+/* Copyright 2005 by Hans Reiser, licensing governed by
65102+ * reiser4/README */
65103+
65104+#include "inode.h"
65105+#include "page_cache.h"
65106+#include "ktxnmgrd.h"
65107+#include "flush.h"
65108+#include "safe_link.h"
65109+
65110+#include <linux/vfs.h>
65111+#include <linux/writeback.h>
65112+#include <linux/mount.h>
65113+#include <linux/seq_file.h>
65114+#include <linux/debugfs.h>
65115+
65116+/* slab cache for inodes */
65117+static struct kmem_cache *inode_cache;
65118+
65119+static struct dentry *reiser4_debugfs_root = NULL;
65120+
65121+/**
65122+ * init_once - constructor for reiser4 inodes
65123+ * @cache: cache @obj belongs to
65124+ * @obj: inode to be initialized
65125+ *
 65126+ * Initialization function called when a new object is allocated by the
 65127+ * reiser4 inode cache. It is set at inode cache creation time.
65128+ */
65129+static void init_once(struct kmem_cache *cache, void *obj)
65130+{
65131+ struct reiser4_inode_object *info;
65132+
65133+ info = obj;
65134+
65135+ /* initialize vfs inode */
65136+ inode_init_once(&info->vfs_inode);
65137+
65138+ /*
 65139+	 * initialize reiser4 specific part of the inode.
65140+ * NOTE-NIKITA add here initializations for locks, list heads,
65141+ * etc. that will be added to our private inode part.
65142+ */
65143+ INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
65144+ init_rwsem(&info->p.conv_sem);
65145+ /* init semaphore which is used during inode loading */
65146+ loading_init_once(&info->p);
65147+ INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
65148+ GFP_ATOMIC);
65149+#if REISER4_DEBUG
65150+ info->p.nr_jnodes = 0;
65151+#endif
65152+}
65153+
65154+/**
 65155+ * init_inodes - create inode cache
65156+ *
65157+ * Initializes slab cache of inodes. It is part of reiser4 module initialization.
65158+ */
65159+static int init_inodes(void)
65160+{
65161+ inode_cache = kmem_cache_create("reiser4_inode",
65162+ sizeof(struct reiser4_inode_object),
65163+ 0,
65164+ SLAB_HWCACHE_ALIGN |
65165+ SLAB_RECLAIM_ACCOUNT, init_once);
65166+ if (inode_cache == NULL)
65167+ return RETERR(-ENOMEM);
65168+ return 0;
65169+}
65170+
65171+/**
65172+ * done_inodes - delete inode cache
65173+ *
65174+ * This is called on reiser4 module unloading or system shutdown.
65175+ */
65176+static void done_inodes(void)
65177+{
65178+ destroy_reiser4_cache(&inode_cache);
65179+}
65180+
65181+/**
65182+ * reiser4_alloc_inode - alloc_inode of super operations
65183+ * @super: super block new inode is allocated for
65184+ *
65185+ * Allocates new inode, initializes reiser4 specific part of it.
65186+ */
65187+static struct inode *reiser4_alloc_inode(struct super_block *super)
65188+{
65189+ struct reiser4_inode_object *obj;
65190+
65191+ assert("nikita-1696", super != NULL);
65192+ obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get());
65193+ if (obj != NULL) {
65194+ reiser4_inode *info;
65195+
65196+ info = &obj->p;
65197+
65198+ info->pset = plugin_set_get_empty();
65199+ info->hset = plugin_set_get_empty();
65200+ info->extmask = 0;
65201+ info->locality_id = 0ull;
65202+ info->plugin_mask = 0;
65203+ info->heir_mask = 0;
65204+#if !REISER4_INO_IS_OID
65205+ info->oid_hi = 0;
65206+#endif
65207+ reiser4_seal_init(&info->sd_seal, NULL, NULL);
65208+ coord_init_invalid(&info->sd_coord, NULL);
65209+ info->flags = 0;
65210+ spin_lock_init(&info->guard);
65211+ /* this deals with info's loading semaphore */
65212+ loading_alloc(info);
65213+ info->vroot = UBER_TREE_ADDR;
65214+ return &obj->vfs_inode;
65215+ } else
65216+ return NULL;
65217+}
65218+
65219+/**
65220+ * reiser4_destroy_inode - destroy_inode of super operations
65221+ * @inode: inode being destroyed
65222+ *
65223+ * Puts reiser4 specific portion of inode, frees memory occupied by inode.
65224+ */
65225+static void reiser4_destroy_inode(struct inode *inode)
65226+{
65227+ reiser4_inode *info;
65228+
65229+ info = reiser4_inode_data(inode);
65230+
65231+ assert("vs-1220", inode_has_no_jnodes(info));
65232+
65233+ if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
65234+ file_plugin *fplug = inode_file_plugin(inode);
65235+ if (fplug->destroy_inode != NULL)
65236+ fplug->destroy_inode(inode);
65237+ }
65238+ reiser4_dispose_cursors(inode);
65239+ if (info->pset)
65240+ plugin_set_put(info->pset);
65241+ if (info->hset)
65242+ plugin_set_put(info->hset);
65243+
65244+ /*
 65245+	 * cannot add a similar assertion about ->i_list, as prune_icache returns
 65246+	 * the inode into the slab with dangling ->list.{next,prev}. This is safe,
 65247+	 * because they are re-initialized in new_inode().
65248+ */
65249+ assert("nikita-2895", list_empty(&inode->i_dentry));
65250+ assert("nikita-2896", hlist_unhashed(&inode->i_hash));
65251+ assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
65252+
65253+ /* this deals with info's loading semaphore */
65254+ loading_destroy(info);
65255+
65256+ kmem_cache_free(inode_cache,
65257+ container_of(info, struct reiser4_inode_object, p));
65258+}
65259+
65260+/**
65261+ * reiser4_dirty_inode - dirty_inode of super operations
65262+ * @inode: inode being dirtied
65263+ *
65264+ * Updates stat data.
65265+ */
65266+static void reiser4_dirty_inode(struct inode *inode)
65267+{
65268+ int result;
65269+
65270+ if (!is_in_reiser4_context())
65271+ return;
65272+ assert("", !IS_RDONLY(inode));
65273+ assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
65274+ get_current_context()->grabbed_blocks));
65275+
65276+ result = reiser4_update_sd(inode);
65277+ if (result)
65278+ warning("", "failed to dirty inode for %llu: %d",
65279+ get_inode_oid(inode), result);
65280+}
65281+
65282+/**
65283+ * reiser4_delete_inode - delete_inode of super operations
65284+ * @inode: inode to delete
65285+ *
65286+ * Calls file plugin's delete_object method to delete object items from
65287+ * filesystem tree and calls clear_inode.
65288+ */
65289+static void reiser4_delete_inode(struct inode *inode)
65290+{
65291+ reiser4_context *ctx;
65292+ file_plugin *fplug;
65293+
65294+ ctx = reiser4_init_context(inode->i_sb);
65295+ if (IS_ERR(ctx)) {
65296+ warning("vs-15", "failed to init context");
65297+ return;
65298+ }
65299+
65300+ if (is_inode_loaded(inode)) {
65301+ fplug = inode_file_plugin(inode);
65302+ if (fplug != NULL && fplug->delete_object != NULL)
65303+ fplug->delete_object(inode);
65304+ }
65305+
65306+ truncate_inode_pages(&inode->i_data, 0);
65307+ inode->i_blocks = 0;
65308+ clear_inode(inode);
65309+ reiser4_exit_context(ctx);
65310+}
65311+
65312+/**
65313+ * reiser4_put_super - put_super of super operations
65314+ * @super: super block to free
65315+ *
 65316+ * Stops daemons and releases resources; in short, it unmounts.
65317+ */
65318+static void reiser4_put_super(struct super_block *super)
65319+{
65320+ reiser4_super_info_data *sbinfo;
65321+ reiser4_context *ctx;
65322+
65323+ sbinfo = get_super_private(super);
65324+ assert("vs-1699", sbinfo);
65325+
65326+ debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
65327+ debugfs_remove(sbinfo->tmgr.debugfs_id_count);
65328+ debugfs_remove(sbinfo->debugfs_root);
65329+
65330+ ctx = reiser4_init_context(super);
65331+ if (IS_ERR(ctx)) {
65332+ warning("vs-17", "failed to init context");
65333+ return;
65334+ }
65335+
65336+ /* have disk format plugin to free its resources */
65337+ if (get_super_private(super)->df_plug->release)
65338+ get_super_private(super)->df_plug->release(super);
65339+
65340+ reiser4_done_formatted_fake(super);
65341+
65342+ /* stop daemons: ktxnmgr and entd */
65343+ reiser4_done_entd(super);
65344+ reiser4_done_ktxnmgrd(super);
65345+ reiser4_done_txnmgr(&sbinfo->tmgr);
65346+
65347+ reiser4_done_fs_info(super);
65348+ reiser4_exit_context(ctx);
65349+}
65350+
65351+/**
65352+ * reiser4_write_super - write_super of super operations
65353+ * @super: super block to write
65354+ *
 65355+ * Captures the znode associated with the super block and commits all transactions.
65356+ */
65357+static void reiser4_write_super(struct super_block *super)
65358+{
65359+ int ret;
65360+ reiser4_context *ctx;
65361+
65362+ assert("vs-1700", !rofs_super(super));
65363+
65364+ ctx = reiser4_init_context(super);
65365+ if (IS_ERR(ctx)) {
65366+ warning("vs-16", "failed to init context");
65367+ return;
65368+ }
65369+
65370+ ret = reiser4_capture_super_block(super);
65371+ if (ret != 0)
65372+ warning("vs-1701",
65373+ "reiser4_capture_super_block failed in write_super: %d",
65374+ ret);
65375+ ret = txnmgr_force_commit_all(super, 0);
65376+ if (ret != 0)
65377+ warning("jmacd-77113",
65378+ "txn_force failed in write_super: %d", ret);
65379+
65380+ super->s_dirt = 0;
65381+
65382+ reiser4_exit_context(ctx);
65383+}
65384+
65385+/**
65386+ * reiser4_statfs - statfs of super operations
 65387+ * @dentry: dentry of the file system being queried
 65388+ * @statfs: buffer to fill with statistics
65389+ *
65390+ * Returns information about filesystem.
65391+ */
65392+static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
65393+{
65394+ sector_t total;
65395+ sector_t reserved;
65396+ sector_t free;
65397+ sector_t forroot;
65398+ sector_t deleted;
65399+ reiser4_context *ctx;
65400+ struct super_block *super = dentry->d_sb;
65401+
65402+ assert("nikita-408", super != NULL);
65403+ assert("nikita-409", statfs != NULL);
65404+
65405+ ctx = reiser4_init_context(super);
65406+ if (IS_ERR(ctx))
65407+ return PTR_ERR(ctx);
65408+
65409+ statfs->f_type = reiser4_statfs_type(super);
65410+ statfs->f_bsize = super->s_blocksize;
65411+
65412+ /*
65413+ * 5% of total block space is reserved. This is needed for flush and
65414+ * for truncates (so that we are able to perform truncate/unlink even
65415+ * on the otherwise completely full file system). If this reservation
65416+ * is hidden from statfs(2), users will mistakenly guess that they
65417+ * have enough free space to complete some operation, which is
65418+ * frustrating.
65419+ *
65420+ * Another possible solution is to subtract ->blocks_reserved from
65421+ * ->f_bfree, but changing available space seems less intrusive than
 65422+	 * letting the user see 5% of disk space consumed immediately after
65423+ * mkfs.
65424+ */
65425+ total = reiser4_block_count(super);
65426+ reserved = get_super_private(super)->blocks_reserved;
65427+ deleted = txnmgr_count_deleted_blocks();
65428+ free = reiser4_free_blocks(super) + deleted;
65429+ forroot = reiser4_reserved_blocks(super, 0, 0);
65430+
65431+ /*
65432+ * These counters may be in inconsistent state because we take the
65433+ * values without keeping any global spinlock. Here we do a sanity
65434+ * check that free block counter does not exceed the number of all
65435+ * blocks.
65436+ */
65437+ if (free > total)
65438+ free = total;
65439+ statfs->f_blocks = total - reserved;
65440+ /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
65441+ if (free > reserved)
65442+ free -= reserved;
65443+ else
65444+ free = 0;
65445+ statfs->f_bfree = free;
65446+
65447+ if (free > forroot)
65448+ free -= forroot;
65449+ else
65450+ free = 0;
65451+ statfs->f_bavail = free;
65452+
65453+ statfs->f_files = 0;
65454+ statfs->f_ffree = 0;
65455+
65456+ /* maximal acceptable name length depends on directory plugin. */
65457+ assert("nikita-3351", super->s_root->d_inode != NULL);
65458+ statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
65459+ reiser4_exit_context(ctx);
65460+ return 0;
65461+}
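To make the reservation arithmetic above concrete, a worked example with made-up numbers:

/*
 * Illustrative numbers only: total = 1000000 blocks, reserved = 50000
 * (5%), deleted = 1000, working free = 100000, forroot = 2000. Then:
 *
 *	free     = 100000 + 1000   = 101000
 *	f_blocks = 1000000 - 50000 = 950000
 *	f_bfree  = 101000 - 50000  = 51000
 *	f_bavail = 51000 - 2000    = 49000
 */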
65462+
65463+/**
65464+ * reiser4_clear_inode - clear_inode of super operation
 65465+ * @inode: inode about to be destroyed
 65466+ *
 65467+ * Does sanity checks: the inode being destroyed should have all jnodes detached.
65468+ */
65469+static void reiser4_clear_inode(struct inode *inode)
65470+{
65471+#if REISER4_DEBUG
65472+ reiser4_inode *r4_inode;
65473+
65474+ r4_inode = reiser4_inode_data(inode);
65475+ if (!inode_has_no_jnodes(r4_inode))
65476+ warning("vs-1732", "reiser4 inode has %ld jnodes\n",
65477+ r4_inode->nr_jnodes);
65478+#endif
65479+}
65480+
65481+/**
65482+ * reiser4_sync_inodes - sync_inodes of super operations
65483+ * @super:
65484+ * @wbc:
65485+ *
 65486+ * This method is called by background and non-background writeback. Reiser4's
 65487+ * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
 65488+ * each dirty inode. reiser4_writepages handles pages dirtied via shared
 65489+ * mapping: dirty pages get into atoms. Writeout is then called to flush some
 65490+ * atoms.
65491+ */
65492+static void reiser4_sync_inodes(struct super_block *super,
65493+ struct writeback_control *wbc)
65494+{
65495+ reiser4_context *ctx;
65496+ long to_write;
65497+
65498+ if (wbc->for_kupdate)
65499+ /* reiser4 has its own means of periodical write-out */
65500+ return;
65501+
65502+ to_write = wbc->nr_to_write;
65503+ assert("vs-49", wbc->older_than_this == NULL);
65504+
65505+ ctx = reiser4_init_context(super);
65506+ if (IS_ERR(ctx)) {
65507+ warning("vs-13", "failed to init context");
65508+ return;
65509+ }
65510+
65511+ /*
 65512+	 * call reiser4_writepages for each dirty inode to turn dirty pages
 65513+	 * into transactions if they are not part of one already.
65514+ */
65515+ generic_sync_sb_inodes(super, wbc);
65516+
65517+ /* flush goes here */
65518+ wbc->nr_to_write = to_write;
65519+ reiser4_writeout(super, wbc);
65520+
65521+ /* avoid recursive calls to ->sync_inodes */
65522+ context_set_commit_async(ctx);
65523+ reiser4_exit_context(ctx);
65524+}
65525+
65526+/**
65527+ * reiser4_show_options - show_options of super operations
65528+ * @m: file where to write information
65529+ * @mnt: mount structure
65530+ *
65531+ * Makes reiser4 mount options visible in /proc/mounts.
65532+ */
65533+static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
65534+{
65535+ struct super_block *super;
65536+ reiser4_super_info_data *sbinfo;
65537+
65538+ super = mnt->mnt_sb;
65539+ sbinfo = get_super_private(super);
65540+
65541+ seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
65542+ seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
65543+ seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
65544+ seq_printf(m, ",atom_max_flushers=0x%x",
65545+ sbinfo->tmgr.atom_max_flushers);
65546+ seq_printf(m, ",cbk_cache_slots=0x%x",
65547+ sbinfo->tree.cbk_cache.nr_slots);
65548+
65549+ return 0;
65550+}
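Given the seq_printf() calls above, the options part of a /proc/mounts entry would look roughly like this (device, mount point and all values are hypothetical):

/dev/sdb1 /mnt/r4 reiser4 rw,atom_max_size=0x20000,atom_max_age=0x258,atom_min_size=0x100,atom_max_flushers=0x0,cbk_cache_slots=0x10 0 0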
65551+
65552+struct super_operations reiser4_super_operations = {
65553+ .alloc_inode = reiser4_alloc_inode,
65554+ .destroy_inode = reiser4_destroy_inode,
65555+ .dirty_inode = reiser4_dirty_inode,
65556+ .delete_inode = reiser4_delete_inode,
65557+ .put_super = reiser4_put_super,
65558+ .write_super = reiser4_write_super,
65559+ .statfs = reiser4_statfs,
65560+ .clear_inode = reiser4_clear_inode,
65561+ .sync_inodes = reiser4_sync_inodes,
65562+ .show_options = reiser4_show_options
65563+};
65564+
65565+/**
65566+ * fill_super - initialize super block on mount
65567+ * @super: super block to fill
 65568+ * @data: reiser4 specific mount options
 65569+ * @silent:
 65570+ *
 65571+ * This is called by reiser4_get_sb; it mounts the filesystem.
65572+ */
65573+static int fill_super(struct super_block *super, void *data, int silent)
65574+{
65575+ reiser4_context ctx;
65576+ int result;
65577+ reiser4_super_info_data *sbinfo;
65578+
65579+ assert("zam-989", super != NULL);
65580+
65581+ super->s_op = NULL;
65582+ init_stack_context(&ctx, super);
65583+
65584+ /* allocate reiser4 specific super block */
65585+ if ((result = reiser4_init_fs_info(super)) != 0)
65586+ goto failed_init_sinfo;
65587+
65588+ sbinfo = get_super_private(super);
65589+ /* initialize various reiser4 parameters, parse mount options */
65590+ if ((result = reiser4_init_super_data(super, data)) != 0)
65591+ goto failed_init_super_data;
65592+
65593+ /* read reiser4 master super block, initialize disk format plugin */
65594+ if ((result = reiser4_init_read_super(super, silent)) != 0)
65595+ goto failed_init_read_super;
65596+
65597+ /* initialize transaction manager */
65598+ reiser4_init_txnmgr(&sbinfo->tmgr);
65599+
65600+ /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */
65601+ if ((result = reiser4_init_ktxnmgrd(super)) != 0)
65602+ goto failed_init_ktxnmgrd;
65603+
65604+ /* initialize entd context and start kernel thread entd */
65605+ if ((result = reiser4_init_entd(super)) != 0)
65606+ goto failed_init_entd;
65607+
65608+ /* initialize address spaces for formatted nodes and bitmaps */
65609+ if ((result = reiser4_init_formatted_fake(super)) != 0)
65610+ goto failed_init_formatted_fake;
65611+
65612+ /* initialize disk format plugin */
65613+ if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 )
65614+ goto failed_init_disk_format;
65615+
65616+ /*
65617+ * There are some 'committed' versions of reiser4 super block counters,
65618+ * which correspond to reiser4 on-disk state. These counters are
65619+ * initialized here
65620+ */
65621+ sbinfo->blocks_free_committed = sbinfo->blocks_free;
65622+ sbinfo->nr_files_committed = oids_used(super);
65623+
65624+ /* get inode of root directory */
65625+ if ((result = reiser4_init_root_inode(super)) != 0)
65626+ goto failed_init_root_inode;
65627+
65628+ if ((result = get_super_private(super)->df_plug->version_update(super)) != 0 )
65629+ goto failed_update_format_version;
65630+
65631+ process_safelinks(super);
65632+ reiser4_exit_context(&ctx);
65633+
65634+ sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
65635+ reiser4_debugfs_root);
65636+ if (sbinfo->debugfs_root) {
65637+ sbinfo->tmgr.debugfs_atom_count =
65638+ debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
65639+ sbinfo->debugfs_root,
65640+ &sbinfo->tmgr.atom_count);
65641+ sbinfo->tmgr.debugfs_id_count =
65642+ debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
65643+ sbinfo->debugfs_root,
65644+ &sbinfo->tmgr.id_count);
65645+ }
65646+ return 0;
65647+
65648+ failed_update_format_version:
65649+ failed_init_root_inode:
65650+ if (sbinfo->df_plug->release)
65651+ sbinfo->df_plug->release(super);
65652+ failed_init_disk_format:
65653+ reiser4_done_formatted_fake(super);
65654+ failed_init_formatted_fake:
65655+ reiser4_done_entd(super);
65656+ failed_init_entd:
65657+ reiser4_done_ktxnmgrd(super);
65658+ failed_init_ktxnmgrd:
65659+ reiser4_done_txnmgr(&sbinfo->tmgr);
65660+ failed_init_read_super:
65661+ failed_init_super_data:
65662+ reiser4_done_fs_info(super);
65663+ failed_init_sinfo:
65664+ reiser4_exit_context(&ctx);
65665+ return result;
65666+}
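fill_super() follows the usual kernel staged-initialization idiom: every successful step gains a matching unwind label, and a failure jumps past its own cleanup so earlier steps are undone in reverse order. A stripped-down sketch of the pattern (init_a/init_b/done_a are hypothetical stand-ins):

static int sample_fill(void)
{
	int result;

	if ((result = init_a()) != 0)
		goto failed_a;		/* nothing to undo yet */
	if ((result = init_b()) != 0)
		goto failed_b;		/* must undo init_a() */
	return 0;

 failed_b:
	done_a();			/* undo steps in reverse order */
 failed_a:
	return result;
}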
65667+
65668+/**
65669+ * reiser4_get_sb - get_sb of file_system_type operations
65670+ * @fs_type:
65671+ * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
65672+ * @dev_name: block device file name
65673+ * @data: specific mount options
65674+ *
65675+ * Reiser4 mount entry.
65676+ */
65677+static int reiser4_get_sb(struct file_system_type *fs_type, int flags,
65678+ const char *dev_name, void *data, struct vfsmount *mnt)
65679+{
65680+ return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
65681+}
65682+
65683+/* structure describing the reiser4 filesystem implementation */
65684+static struct file_system_type reiser4_fs_type = {
65685+ .owner = THIS_MODULE,
65686+ .name = "reiser4",
65687+ .fs_flags = FS_REQUIRES_DEV,
65688+ .get_sb = reiser4_get_sb,
65689+ .kill_sb = kill_block_super,
65690+ .next = NULL
65691+};
65692+
65693+void destroy_reiser4_cache(struct kmem_cache **cachep)
65694+{
65695+ BUG_ON(*cachep == NULL);
65696+ kmem_cache_destroy(*cachep);
65697+ *cachep = NULL;
65698+}
65699+
65700+/**
65701+ * init_reiser4 - reiser4 initialization entry point
65702+ *
65703+ * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
65704+ * on kernel initialization or during reiser4 module load.
65705+ */
65706+static int __init init_reiser4(void)
65707+{
65708+ int result;
65709+
65710+ printk(KERN_INFO
65711+ "Loading Reiser4. "
65712+ "See www.namesys.com for a description of Reiser4.\n");
65713+
65714+ /* initialize slab cache of inodes */
65715+ if ((result = init_inodes()) != 0)
65716+ goto failed_inode_cache;
65717+
65718+ /* initialize cache of znodes */
65719+ if ((result = init_znodes()) != 0)
65720+ goto failed_init_znodes;
65721+
65722+ /* initialize all plugins */
65723+ if ((result = init_plugins()) != 0)
65724+ goto failed_init_plugins;
65725+
65726+ /* initialize cache of plugin_set-s and plugin_set's hash table */
65727+ if ((result = init_plugin_set()) != 0)
65728+ goto failed_init_plugin_set;
65729+
65730+ /* initialize caches of txn_atom-s and txn_handle-s */
65731+ if ((result = init_txnmgr_static()) != 0)
65732+ goto failed_init_txnmgr_static;
65733+
65734+ /* initialize cache of jnodes */
65735+ if ((result = init_jnodes()) != 0)
65736+ goto failed_init_jnodes;
65737+
65738+ /* initialize cache of flush queues */
65739+ if ((result = reiser4_init_fqs()) != 0)
65740+ goto failed_init_fqs;
65741+
65742+ /* initialize cache of structures attached to dentry->d_fsdata */
65743+ if ((result = reiser4_init_dentry_fsdata()) != 0)
65744+ goto failed_init_dentry_fsdata;
65745+
65746+ /* initialize cache of structures attached to file->private_data */
65747+ if ((result = reiser4_init_file_fsdata()) != 0)
65748+ goto failed_init_file_fsdata;
65749+
65750+ /*
65751+ * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
65752+ * more details
65753+ */
65754+ if ((result = reiser4_init_d_cursor()) != 0)
65755+ goto failed_init_d_cursor;
65756+
65757+ if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
65758+ reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
65759+ return 0;
65760+ }
65761+
65762+ reiser4_done_d_cursor();
65763+ failed_init_d_cursor:
65764+ reiser4_done_file_fsdata();
65765+ failed_init_file_fsdata:
65766+ reiser4_done_dentry_fsdata();
65767+ failed_init_dentry_fsdata:
65768+ reiser4_done_fqs();
65769+ failed_init_fqs:
65770+ done_jnodes();
65771+ failed_init_jnodes:
65772+ done_txnmgr_static();
65773+ failed_init_txnmgr_static:
65774+ done_plugin_set();
65775+ failed_init_plugin_set:
65776+ failed_init_plugins:
65777+ done_znodes();
65778+ failed_init_znodes:
65779+ done_inodes();
65780+ failed_inode_cache:
65781+ return result;
65782+}
65783+
65784+/**
65785+ * done_reiser4 - reiser4 exit entry point
65786+ *
65787+ * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown
65788+ * or at module unload.
65789+ */
65790+static void __exit done_reiser4(void)
65791+{
65792+ int result;
65793+
65794+ debugfs_remove(reiser4_debugfs_root);
65795+ result = unregister_filesystem(&reiser4_fs_type);
65796+ BUG_ON(result != 0);
65797+ reiser4_done_d_cursor();
65798+ reiser4_done_file_fsdata();
65799+ reiser4_done_dentry_fsdata();
65800+ reiser4_done_fqs();
65801+ done_jnodes();
65802+ done_txnmgr_static();
65803+ done_plugin_set();
65804+ done_znodes();
65805+ destroy_reiser4_cache(&inode_cache);
65806+}
65807+
65808+module_init(init_reiser4);
65809+module_exit(done_reiser4);
65810+
65811+MODULE_DESCRIPTION("Reiser4 filesystem");
65812+MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
65813+
65814+MODULE_LICENSE("GPL");
65815+
65816+/*
65817+ * Local variables:
65818+ * c-indentation-style: "K&R"
65819+ * mode-name: "LC"
65820+ * c-basic-offset: 8
65821+ * tab-width: 8
65822+ * fill-column: 79
65823+ * End:
65824+ */
65825diff -urN linux-2.6.24.orig/fs/reiser4/tap.c linux-2.6.24/fs/reiser4/tap.c
65826--- linux-2.6.24.orig/fs/reiser4/tap.c 1970-01-01 03:00:00.000000000 +0300
65827+++ linux-2.6.24/fs/reiser4/tap.c 2008-01-25 11:39:07.092247874 +0300
65828@@ -0,0 +1,377 @@
65829+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
65830+ * reiser4/README */
65831+
65832+/*
65833+ Tree Access Pointer (tap).
65834+
 65835+ A tap is a data structure combining a coord and a lock handle (mostly). It
 65836+ is useful when one has to scan tree nodes (for example, in readdir or
 65837+ flush), because tap functions allow moving a tap in either direction,
 65838+ transparently crossing unit/item/node borders.
 65839+
 65840+ A tap doesn't provide automatic synchronization of its fields, as it is
 65841+ supposed to be a per-thread object.
65842+*/
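A hedged usage sketch of the tap API implemented below: walk units rightward from an already resolved coord (error handling abbreviated; sample_scan() is hypothetical):

static int sample_scan(coord_t *coord, lock_handle *lh)
{
	tap_t tap;
	int result;

	reiser4_tap_init(&tap, coord, lh, ZNODE_READ_LOCK);
	result = reiser4_tap_load(&tap);	/* pin node data */
	if (result == 0) {
		do {
			/* ... inspect the unit at tap.coord ... */
		} while ((result = go_next_unit(&tap)) == 0);
		reiser4_tap_relse(&tap);
	}
	reiser4_tap_done(&tap);			/* release lock handle */
	return result;
}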
65843+
65844+#include "forward.h"
65845+#include "debug.h"
65846+#include "coord.h"
65847+#include "tree.h"
65848+#include "context.h"
65849+#include "tap.h"
65850+#include "znode.h"
65851+#include "tree_walk.h"
65852+
65853+#if REISER4_DEBUG
65854+static int tap_invariant(const tap_t * tap);
65855+static void tap_check(const tap_t * tap);
65856+#else
65857+#define tap_check(tap) noop
65858+#endif
65859+
65860+/** load node tap is pointing to, if not loaded already */
65861+int reiser4_tap_load(tap_t * tap)
65862+{
65863+ tap_check(tap);
65864+ if (tap->loaded == 0) {
65865+ int result;
65866+
65867+ result = zload_ra(tap->coord->node, &tap->ra_info);
65868+ if (result != 0)
65869+ return result;
65870+ coord_clear_iplug(tap->coord);
65871+ }
65872+ ++tap->loaded;
65873+ tap_check(tap);
65874+ return 0;
65875+}
65876+
65877+/** release node tap is pointing to. Dual to tap_load() */
65878+void reiser4_tap_relse(tap_t * tap)
65879+{
65880+ tap_check(tap);
65881+ if (tap->loaded > 0) {
65882+ --tap->loaded;
65883+ if (tap->loaded == 0) {
65884+ zrelse(tap->coord->node);
65885+ }
65886+ }
65887+ tap_check(tap);
65888+}
65889+
65890+/**
65891+ * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
65892+ * @mode
65893+ */
65894+void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
65895+ znode_lock_mode mode)
65896+{
65897+ tap->coord = coord;
65898+ tap->lh = lh;
65899+ tap->mode = mode;
65900+ tap->loaded = 0;
65901+ INIT_LIST_HEAD(&tap->linkage);
65902+ reiser4_init_ra_info(&tap->ra_info);
65903+}
65904+
65905+/** add @tap to the per-thread list of all taps */
65906+void reiser4_tap_monitor(tap_t * tap)
65907+{
65908+ assert("nikita-2623", tap != NULL);
65909+ tap_check(tap);
65910+ list_add(&tap->linkage, reiser4_taps_list());
65911+ tap_check(tap);
65912+}
65913+
65914+/* duplicate @src into @dst. Copy lock handle. @dst is not initially
65915+ * loaded. */
65916+void reiser4_tap_copy(tap_t * dst, tap_t * src)
65917+{
65918+ assert("nikita-3193", src != NULL);
65919+ assert("nikita-3194", dst != NULL);
65920+
65921+ *dst->coord = *src->coord;
65922+ if (src->lh->node)
65923+ copy_lh(dst->lh, src->lh);
65924+ dst->mode = src->mode;
65925+ dst->loaded = 0;
65926+ INIT_LIST_HEAD(&dst->linkage);
65927+ dst->ra_info = src->ra_info;
65928+}
65929+
65930+/** finish with @tap */
65931+void reiser4_tap_done(tap_t * tap)
65932+{
65933+ assert("nikita-2565", tap != NULL);
65934+ tap_check(tap);
65935+ if (tap->loaded > 0)
65936+ zrelse(tap->coord->node);
65937+ done_lh(tap->lh);
65938+ tap->loaded = 0;
65939+ list_del_init(&tap->linkage);
65940+ tap->coord->node = NULL;
65941+}
65942+
65943+/**
65944+ * move @tap to the new node, locked with @target. Load @target, if @tap was
65945+ * already loaded.
65946+ */
65947+int reiser4_tap_move(tap_t * tap, lock_handle * target)
65948+{
65949+ int result = 0;
65950+
65951+ assert("nikita-2567", tap != NULL);
65952+ assert("nikita-2568", target != NULL);
65953+ assert("nikita-2570", target->node != NULL);
65954+ assert("nikita-2569", tap->coord->node == tap->lh->node);
65955+
65956+ tap_check(tap);
65957+ if (tap->loaded > 0)
65958+ result = zload_ra(target->node, &tap->ra_info);
65959+
65960+ if (result == 0) {
65961+ if (tap->loaded > 0)
65962+ zrelse(tap->coord->node);
65963+ done_lh(tap->lh);
65964+ copy_lh(tap->lh, target);
65965+ tap->coord->node = target->node;
65966+ coord_clear_iplug(tap->coord);
65967+ }
65968+ tap_check(tap);
65969+ return result;
65970+}
65971+
65972+/**
65973+ * move @tap to @target. Acquire lock on @target, if @tap was already
65974+ * loaded.
65975+ */
65976+static int tap_to(tap_t * tap, znode * target)
65977+{
65978+ int result;
65979+
65980+ assert("nikita-2624", tap != NULL);
65981+ assert("nikita-2625", target != NULL);
65982+
65983+ tap_check(tap);
65984+ result = 0;
65985+ if (tap->coord->node != target) {
65986+ lock_handle here;
65987+
65988+ init_lh(&here);
65989+ result = longterm_lock_znode(&here, target,
65990+ tap->mode, ZNODE_LOCK_HIPRI);
65991+ if (result == 0) {
65992+ result = reiser4_tap_move(tap, &here);
65993+ done_lh(&here);
65994+ }
65995+ }
65996+ tap_check(tap);
65997+ return result;
65998+}
65999+
66000+/**
66001+ * move @tap to given @target, loading and locking @target->node if
66002+ * necessary
66003+ */
66004+int tap_to_coord(tap_t * tap, coord_t * target)
66005+{
66006+ int result;
66007+
66008+ tap_check(tap);
66009+ result = tap_to(tap, target->node);
66010+ if (result == 0)
66011+ coord_dup(tap->coord, target);
66012+ tap_check(tap);
66013+ return result;
66014+}
66015+
66016+/** return list of all taps */
66017+struct list_head *reiser4_taps_list(void)
66018+{
66019+ return &get_current_context()->taps;
66020+}
66021+
66022+/** helper function for go_{next,prev}_{item,unit,node}() */
66023+int go_dir_el(tap_t * tap, sideof dir, int units_p)
66024+{
66025+ coord_t dup;
66026+ coord_t *coord;
66027+ int result;
66028+
66029+ int (*coord_dir) (coord_t *);
66030+ int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
66031+ void (*coord_init) (coord_t *, const znode *);
66032+ ON_DEBUG(int (*coord_check) (const coord_t *));
66033+
66034+ assert("nikita-2556", tap != NULL);
66035+ assert("nikita-2557", tap->coord != NULL);
66036+ assert("nikita-2558", tap->lh != NULL);
66037+ assert("nikita-2559", tap->coord->node != NULL);
66038+
66039+ tap_check(tap);
66040+ if (dir == LEFT_SIDE) {
66041+ coord_dir = units_p ? coord_prev_unit : coord_prev_item;
66042+ get_dir_neighbor = reiser4_get_left_neighbor;
66043+ coord_init = coord_init_last_unit;
66044+ } else {
66045+ coord_dir = units_p ? coord_next_unit : coord_next_item;
66046+ get_dir_neighbor = reiser4_get_right_neighbor;
66047+ coord_init = coord_init_first_unit;
66048+ }
66049+ ON_DEBUG(coord_check =
66050+ units_p ? coord_is_existing_unit : coord_is_existing_item);
66051+ assert("nikita-2560", coord_check(tap->coord));
66052+
66053+ coord = tap->coord;
66054+ coord_dup(&dup, coord);
66055+ if (coord_dir(&dup) != 0) {
66056+ do {
 66057+			/* move to the neighboring node in direction @dir */
66058+ lock_handle dup;
66059+
66060+ init_lh(&dup);
66061+ result =
66062+ get_dir_neighbor(&dup, coord->node, (int)tap->mode,
66063+ GN_CAN_USE_UPPER_LEVELS);
66064+ if (result == 0) {
66065+ result = reiser4_tap_move(tap, &dup);
66066+ if (result == 0)
66067+ coord_init(tap->coord, dup.node);
66068+ done_lh(&dup);
66069+ }
66070+ /* skip empty nodes */
66071+ } while ((result == 0) && node_is_empty(coord->node));
66072+ } else {
66073+ result = 0;
66074+ coord_dup(coord, &dup);
66075+ }
66076+ assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
66077+ tap_check(tap);
66078+ return result;
66079+}
66080+
66081+/**
66082+ * move @tap to the next unit, transparently crossing item and node
66083+ * boundaries
66084+ */
66085+int go_next_unit(tap_t * tap)
66086+{
66087+ return go_dir_el(tap, RIGHT_SIDE, 1);
66088+}
66089+
66090+/**
66091+ * move @tap to the previous unit, transparently crossing item and node
66092+ * boundaries
66093+ */
66094+int go_prev_unit(tap_t * tap)
66095+{
66096+ return go_dir_el(tap, LEFT_SIDE, 1);
66097+}
66098+
66099+/**
66100+ * @shift times apply @actor to the @tap. This is used to move @tap by
66101+ * @shift units (or items, or nodes) in either direction.
66102+ */
66103+static int rewind_to(tap_t * tap, go_actor_t actor, int shift)
66104+{
66105+ int result;
66106+
66107+ assert("nikita-2555", shift >= 0);
66108+ assert("nikita-2562", tap->coord->node == tap->lh->node);
66109+
66110+ tap_check(tap);
66111+ result = reiser4_tap_load(tap);
66112+ if (result != 0)
66113+ return result;
66114+
66115+ for (; shift > 0; --shift) {
66116+ result = actor(tap);
66117+ assert("nikita-2563", tap->coord->node == tap->lh->node);
66118+ if (result != 0)
66119+ break;
66120+ }
66121+ reiser4_tap_relse(tap);
66122+ tap_check(tap);
66123+ return result;
66124+}
66125+
66126+/** move @tap @shift units rightward */
66127+int rewind_right(tap_t * tap, int shift)
66128+{
66129+ return rewind_to(tap, go_next_unit, shift);
66130+}
66131+
66132+/** move @tap @shift units leftward */
66133+int rewind_left(tap_t * tap, int shift)
66134+{
66135+ return rewind_to(tap, go_prev_unit, shift);
66136+}
66137+
66138+#if REISER4_DEBUG
66139+/** debugging function: print @tap content in human readable form */
66140+static void print_tap(const char *prefix, const tap_t * tap)
66141+{
66142+ if (tap == NULL) {
66143+ printk("%s: null tap\n", prefix);
66144+ return;
66145+ }
66146+ printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
66147+ tap->loaded, (&tap->linkage == tap->linkage.next &&
66148+ &tap->linkage == tap->linkage.prev),
66149+ tap->lh->node,
66150+ lock_mode_name(tap->mode));
66151+ print_coord("\tcoord", tap->coord, 0);
66152+}
66153+
66154+/** check [tap-sane] invariant */
66155+static int tap_invariant(const tap_t * tap)
66156+{
66157+ /* [tap-sane] invariant */
66158+
66159+ if (tap == NULL)
66160+ return 1;
66161+ /* tap->mode is one of
66162+ *
66163+ * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
66164+ */
66165+ if (tap->mode != ZNODE_NO_LOCK &&
66166+ tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
66167+ return 2;
66168+ /* tap->coord != NULL, and */
66169+ if (tap->coord == NULL)
66170+ return 3;
66171+ /* tap->lh != NULL, and */
66172+ if (tap->lh == NULL)
66173+ return 4;
66174+ /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
66175+ if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
66176+ return 5;
66177+ /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
66178+ if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
66179+ return 6;
66180+ return 0;
66181+}
66182+
66183+/** debugging function: check internal @tap consistency */
66184+static void tap_check(const tap_t * tap)
66185+{
66186+ int result;
66187+
66188+ result = tap_invariant(tap);
66189+ if (result != 0) {
66190+ print_tap("broken", tap);
66191+ reiser4_panic("nikita-2831", "tap broken: %i\n", result);
66192+ }
66193+}
66194+#endif
66195+
66196+/* Make Linus happy.
66197+ Local variables:
66198+ c-indentation-style: "K&R"
66199+ mode-name: "LC"
66200+ c-basic-offset: 8
66201+ tab-width: 8
66202+ fill-column: 120
66203+ scroll-step: 1
66204+ End:
66205+*/
66206diff -urN linux-2.6.24.orig/fs/reiser4/tap.h linux-2.6.24/fs/reiser4/tap.h
66207--- linux-2.6.24.orig/fs/reiser4/tap.h 1970-01-01 03:00:00.000000000 +0300
66208+++ linux-2.6.24/fs/reiser4/tap.h 2008-01-25 11:39:07.092247874 +0300
66209@@ -0,0 +1,70 @@
66210+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
66211+
66212+/* Tree Access Pointers. See tap.c for more details. */
66213+
66214+#if !defined( __REISER4_TAP_H__ )
66215+#define __REISER4_TAP_H__
66216+
66217+#include "forward.h"
66218+#include "readahead.h"
66219+
66220+/**
66221+ tree_access_pointer aka tap. Data structure combining coord_t and lock
66222+ handle.
66223+ Invariants involving this data-type, see doc/lock-ordering for details:
66224+
66225+ [tap-sane]
66226+ */
66227+struct tree_access_pointer {
66228+ /* coord tap is at */
66229+ coord_t *coord;
66230+ /* lock handle on ->coord->node */
66231+ lock_handle *lh;
66232+ /* mode of lock acquired by this tap */
66233+ znode_lock_mode mode;
66234+ /* incremented by reiser4_tap_load().
66235+ Decremented by reiser4_tap_relse(). */
66236+ int loaded;
66237+ /* list of taps */
66238+ struct list_head linkage;
66239+ /* read-ahead hint */
66240+ ra_info_t ra_info;
66241+};
66242+
66243+typedef int (*go_actor_t) (tap_t * tap);
66244+
66245+extern int reiser4_tap_load(tap_t * tap);
66246+extern void reiser4_tap_relse(tap_t * tap);
66247+extern void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
66248+ znode_lock_mode mode);
66249+extern void reiser4_tap_monitor(tap_t * tap);
66250+extern void reiser4_tap_copy(tap_t * dst, tap_t * src);
66251+extern void reiser4_tap_done(tap_t * tap);
66252+extern int reiser4_tap_move(tap_t * tap, lock_handle * target);
66253+extern int tap_to_coord(tap_t * tap, coord_t * target);
66254+
66255+extern int go_dir_el(tap_t * tap, sideof dir, int units_p);
66256+extern int go_next_unit(tap_t * tap);
66257+extern int go_prev_unit(tap_t * tap);
66258+extern int rewind_right(tap_t * tap, int shift);
66259+extern int rewind_left(tap_t * tap, int shift);
66260+
66261+extern struct list_head *reiser4_taps_list(void);
66262+
66263+#define for_all_taps(tap) \
66264+ for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \
66265+ reiser4_taps_list() != &tap->linkage; \
66266+ tap = list_entry(tap->linkage.next, tap_t, linkage))
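The iterator above expands to a list_entry walk over the current context's tap list, to which taps are added by reiser4_tap_monitor(). A small sketch (sample_count_taps() is hypothetical):

static inline int sample_count_taps(void)
{
	tap_t *tap;
	int n = 0;

	for_all_taps(tap)
		++n;
	return n;
}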
66267+
66268+/* __REISER4_TAP_H__ */
66269+#endif
66270+/* Make Linus happy.
66271+ Local variables:
66272+ c-indentation-style: "K&R"
66273+ mode-name: "LC"
66274+ c-basic-offset: 8
66275+ tab-width: 8
66276+ fill-column: 120
66277+ scroll-step: 1
66278+ End:
66279+*/
66280diff -urN linux-2.6.24.orig/fs/reiser4/tree.c linux-2.6.24/fs/reiser4/tree.c
66281--- linux-2.6.24.orig/fs/reiser4/tree.c 1970-01-01 03:00:00.000000000 +0300
66282+++ linux-2.6.24/fs/reiser4/tree.c 2008-01-25 11:39:07.096248905 +0300
66283@@ -0,0 +1,1876 @@
66284+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66285+ * reiser4/README */
66286+
66287+/*
66288+ * KEYS IN A TREE.
66289+ *
 66290+ * The tree consists of nodes located on the disk. A node in the tree is either
 66291+ * formatted or unformatted. A formatted node is one whose structure is
 66292+ * understood by the tree balancing and traversal code. Formatted nodes are
 66293+ * further classified into leaf and internal nodes. The latter distinction is
 66294+ * (almost) of only historical importance: the general structure of leaves and
 66295+ * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
 66296+ * that are part of bodies of ordinary files and attributes.
66297+ *
 66298+ * Each node in the tree spans some interval in the key space. Key ranges for
 66299+ * all nodes in the tree are disjoint. Actually, this only holds in some weak
 66300+ * sense, because of the non-unique keys: the intersection of key ranges for
 66301+ * different nodes is either empty, or consists of exactly one key.
 66302+ *
 66303+ * A formatted node consists of a sequence of items. Each item spans some
 66304+ * interval in key space. Key ranges for all items in a tree are disjoint,
 66305+ * modulo non-unique keys again. Items within nodes are ordered by the
 66306+ * smallest key in each item.
 66307+ *
 66308+ * A particular type of item can be further split into units. A unit is a piece
 66309+ * of an item that can be cut from the item and moved into another item of the
 66310+ * same type. Units are used by the balancing code to repack data during balancing.
66311+ *
66312+ * Unit can be further split into smaller entities (for example, extent unit
66313+ * represents several pages, and it is natural for extent code to operate on
66314+ * particular pages and even bytes within one unit), but this is of no
66315+ * relevance to the generic balancing and lookup code.
66316+ *
 66317+ * Although an item is said to "span" a range or interval of keys, it is not
 66318+ * necessary that the item contain a piece of data addressable by each and
 66319+ * every key in this range. For example, a compound directory item, consisting of
66320+ * units corresponding to directory entries and keyed by hashes of file names,
66321+ * looks more as having "discrete spectrum": only some disjoint keys inside
66322+ * range occupied by this item really address data.
66323+ *
 66324+ * Nonetheless, each item always has a well-defined least (minimal) key, which
 66325+ * is recorded in the item header, stored in the node this item is in. Also, an
 66326+ * item plugin can optionally define a method ->max_key_inside() returning the
 66327+ * maximal key that can _possibly_ be located within this item. This method is
 66328+ * used (mainly) to determine when a given piece of data should be merged into
 66329+ * an existing item, instead of creating a new one. Because of this, even though
 66330+ * ->max_key_inside() can be larger than any key actually located in the item,
66331+ * intervals
66332+ *
66333+ * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
66334+ *
66335+ * are still disjoint for all items within the _same_ node.
66336+ *
 66337+ * In memory, a node is represented by a znode. It plays several roles:
66338+ *
66339+ * . something locks are taken on
66340+ *
66341+ * . something tracked by transaction manager (this is going to change)
66342+ *
66343+ * . something used to access node data
66344+ *
66345+ * . something used to maintain tree structure in memory: sibling and
66346+ * parental linkage.
66347+ *
66348+ * . something used to organize nodes into "slums"
66349+ *
66350+ * More on znodes see in znode.[ch]
66351+ *
66352+ * DELIMITING KEYS
66353+ *
 66354+ * To simplify balancing, allow some flexibility in locking and speed up an
 66355+ * important coord cache optimization, we keep delimiting keys of nodes in
 66356+ * memory. Depending on the disk format (implemented by the appropriate node
 66357+ * plugin), a node on disk can record both left and right delimiting keys, only
 66358+ * one of them, or none. Still, our balancing and tree traversal code keeps both
 66359+ * delimiting keys for an in-memory node stored in the znode. When a
 66360+ * node is first brought into memory during tree traversal, its left
 66361+ * delimiting key is taken from its parent, and its right delimiting key is
 66362+ * either the next key in its parent, or the right delimiting key of the parent
 66363+ * if the node is the rightmost child of the parent.
66364+ *
 66365+ * Physical consistency of delimiting keys is protected by a special dk
 66366+ * read-write lock. That is, delimiting keys can only be inspected or
 66367+ * modified under this lock. But the dk lock is only sufficient for a fast
 66368+ * "pessimistic" check, because, to simplify code and to decrease lock
 66369+ * contention, balancing (carry) only updates delimiting keys right before
 66370+ * unlocking all locked nodes on the given tree level. For example, the
 66371+ * coord-by-key cache scans the LRU list of recently accessed znodes. For each
 66372+ * node it first does a fast check under the dk spin lock. If the key looked
 66373+ * for is not between the delimiting keys for this node, the next node is
 66374+ * inspected and so on. If the key is inside the key range, a long term lock
 66375+ * is taken on the node and the key range is rechecked.
66376+ *
66377+ * COORDINATES
66378+ *
66379+ * To find something in the tree, you supply a key, and the key is resolved
66380+ * by coord_by_key() into a coord (coordinate) that is valid as long as the
66381+ * node the coord points to remains locked. As mentioned above trees
66382+ * consist of nodes that consist of items that consist of units. A unit is
66383+ * the smallest and indivisible piece of tree as far as balancing and tree
66384+ * search are concerned. Each node, item, and unit can be addressed by
66385+ * giving its level in the tree and the key occupied by this entity. A node
66386+ * knows what the key ranges are of the items within it, and how to find its
66387+ * items and invoke their item handlers, but it does not know how to access
66388+ * individual units within its items except through the item handlers.
66389+ * coord is a structure containing a pointer to the node, the ordinal number
66390+ * of the item within this node (a sort of item offset), and the ordinal
66391+ * number of the unit within this item.
66392+ *
66393+ * TREE LOOKUP
66394+ *
66395+ * There are two types of access to the tree: lookup and modification.
66396+ *
66397+ * Lookup is a search for the key in the tree. Search can look for either
66398+ * exactly the key given to it, or for the largest key that is not greater
66399+ * than the key given to it. This distinction is determined by "bias"
66400+ * parameter of search routine (coord_by_key()). coord_by_key() either
66401+ * returns error (key is not in the tree, or some kind of external error
66402+ * occurred), or successfully resolves key into coord.
66403+ *
 66404+ * This resolution is done by traversing the tree top-to-bottom from the root
 66405+ * level to the desired level. On levels above the twig level (the level one
 66406+ * above the leaf level) nodes consist exclusively of internal items. An
 66407+ * internal item is nothing more than a pointer to the tree node on the child
 66408+ * level. On the twig level, nodes consist of internal items intermixed with
 66409+ * extent items. Internal items form the normal search tree structure used by
 66410+ * traversal to descend through the tree.
66411+ *
66412+ * TREE LOOKUP OPTIMIZATIONS
66413+ *
 66414+ * The tree lookup described above is expensive even if all nodes traversed are
 66415+ * already in memory: a binary search has to be performed within each node,
 66416+ * and binary searches are CPU consuming and tend to destroy CPU
 66417+ * caches.
66418+ *
66419+ * Several optimizations are used to work around this:
66420+ *
66421+ * . cbk_cache (look-aside cache for tree traversals, see search.c for
66422+ * details)
66423+ *
66424+ * . seals (see seal.[ch])
66425+ *
66426+ * . vroot (see search.c)
66427+ *
66428+ * General search-by-key is layered thusly:
66429+ *
66430+ * [check seal, if any] --ok--> done
66431+ * |
66432+ * failed
66433+ * |
66434+ * V
66435+ * [vroot defined] --no--> node = tree_root
66436+ * | |
66437+ * yes |
66438+ * | |
66439+ * V |
66440+ * node = vroot |
66441+ * | |
66442+ * | |
66443+ * | |
66444+ * V V
66445+ * [check cbk_cache for key] --ok--> done
66446+ * |
66447+ * failed
66448+ * |
66449+ * V
66450+ * [start tree traversal from node]
66451+ *
66452+ */
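A hedged sketch of the lookup entry point this flow describes (sample_lookup() is hypothetical; the seal, vroot and cbk_cache shortcuts all live inside coord_by_key(), whose calling convention can be seen in insert_by_key() below):

static int sample_lookup(reiser4_tree *tree, const reiser4_key *key,
			 coord_t *coord, lock_handle *lh)
{
	/* resolve @key to a coord on the leaf level under a read lock;
	 * FIND_EXACT asks for exactly this key */
	return coord_by_key(tree, key, coord, lh, ZNODE_READ_LOCK,
			    FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
			    0 /* flags */, NULL /* ra_info */);
}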
66453+
66454+#include "forward.h"
66455+#include "debug.h"
66456+#include "dformat.h"
66457+#include "key.h"
66458+#include "coord.h"
66459+#include "plugin/item/static_stat.h"
66460+#include "plugin/item/item.h"
66461+#include "plugin/node/node.h"
66462+#include "plugin/plugin.h"
66463+#include "txnmgr.h"
66464+#include "jnode.h"
66465+#include "znode.h"
66466+#include "block_alloc.h"
66467+#include "tree_walk.h"
66468+#include "carry.h"
66469+#include "carry_ops.h"
66470+#include "tap.h"
66471+#include "tree.h"
66472+#include "vfs_ops.h"
66473+#include "page_cache.h"
66474+#include "super.h"
66475+#include "reiser4.h"
66476+#include "inode.h"
66477+
66478+#include <linux/fs.h> /* for struct super_block */
66479+#include <linux/spinlock.h>
66480+
 66481+/* Disk address (block number) never used for any real tree node. This is
 66482+   used as the block number of the "uber" znode.
66483+
66484+ Invalid block addresses are 0 by tradition.
66485+
66486+*/
66487+const reiser4_block_nr UBER_TREE_ADDR = 0ull;
66488+
66489+#define CUT_TREE_MIN_ITERATIONS 64
66490+
66491+static int find_child_by_addr(znode * parent, znode * child, coord_t * result);
66492+
66493+/* return node plugin of coord->node */
66494+node_plugin *node_plugin_by_coord(const coord_t * coord)
66495+{
66496+ assert("vs-1", coord != NULL);
66497+ assert("vs-2", coord->node != NULL);
66498+
66499+ return coord->node->nplug;
66500+}
66501+
66502+/* insert item into tree. Fields of @coord are updated so that they can be
66503+ * used by a subsequent insert operation. */
66504+insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item
66505+ * into */ ,
66506+ const reiser4_key * key /* key of new item */ ,
66507+ reiser4_item_data * data /* parameters for item
66508+ * creation */ ,
66509+ coord_t * coord /* resulting insertion coord */ ,
66510+ lock_handle * lh /* resulting lock
66511+ * handle */ ,
66512+ tree_level stop_level /** level where to insert */ ,
66513+ __u32 flags /* insertion flags */ )
66514+{
66515+ int result;
66516+
66517+ assert("nikita-358", tree != NULL);
66518+ assert("nikita-360", coord != NULL);
66519+
66520+ result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
66521+ FIND_EXACT, stop_level, stop_level,
66522+ flags | CBK_FOR_INSERT, NULL /*ra_info */ );
66523+ switch (result) {
66524+ default:
66525+ break;
66526+ case CBK_COORD_FOUND:
66527+ result = IBK_ALREADY_EXISTS;
66528+ break;
66529+ case CBK_COORD_NOTFOUND:
66530+ assert("nikita-2017", coord->node != NULL);
66531+ result = insert_by_coord(coord, data, key, lh, 0 /*flags */ );
66532+ break;
66533+ }
66534+ return result;
66535+}
66536+
66537+/* insert item by calling carry. Helper function called if shortcut
66538+ insertion fails */
66539+static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */
66540+ lock_handle * lh, /* lock handle of insertion
66541+ * node */
66542+ reiser4_item_data * data, /* parameters of new
66543+ * item */
66544+ const reiser4_key * key, /* key of new item */
66545+ carry_opcode cop, /* carry operation to perform */
66546+ cop_insert_flag flags
66547+ /* carry flags */ )
66548+{
66549+ int result;
66550+ carry_pool *pool;
66551+ carry_level *lowest_level;
66552+ carry_insert_data *cdata;
66553+ carry_op *op;
66554+
66555+ assert("umka-314", coord != NULL);
66556+
66557+ /* allocate carry_pool and 3 carry_level-s */
66558+ pool =
66559+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66560+ sizeof(*cdata));
66561+ if (IS_ERR(pool))
66562+ return PTR_ERR(pool);
66563+ lowest_level = (carry_level *) (pool + 1);
66564+ init_carry_level(lowest_level, pool);
66565+
66566+ op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
66567+ if (IS_ERR(op) || (op == NULL)) {
66568+ done_carry_pool(pool);
66569+ return RETERR(op ? PTR_ERR(op) : -EIO);
66570+ }
66571+ cdata = (carry_insert_data *) (lowest_level + 3);
66572+ cdata->coord = coord;
66573+ cdata->data = data;
66574+ cdata->key = key;
66575+ op->u.insert.d = cdata;
66576+ if (flags == 0)
66577+ flags = znode_get_tree(coord->node)->carry.insert_flags;
66578+ op->u.insert.flags = flags;
66579+ op->u.insert.type = COPT_ITEM_DATA;
66580+ op->u.insert.child = NULL;
66581+ if (lh != NULL) {
66582+ assert("nikita-3245", lh->node == coord->node);
66583+ lowest_level->track_type = CARRY_TRACK_CHANGE;
66584+ lowest_level->tracked = lh;
66585+ }
66586+
66587+ result = reiser4_carry(lowest_level, NULL);
66588+ done_carry_pool(pool);
66589+
66590+ return result;
66591+}
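+
+/* init_carry_pool() above makes one allocation that holds the pool, three
+   carry levels and the operation data; callers carve these out with pointer
+   arithmetic, as in "(carry_level *)(pool + 1)". A minimal standalone sketch
+   of that idiom, with toy_* stand-ins (illustrative assumptions, not the
+   reiser4 types): */
+#include <stdlib.h>
+
+struct toy_pool { int pool_state; };
+struct toy_level { int level_state; };
+struct toy_data { int data_state; };
+
+static struct toy_pool *toy_init_pool(struct toy_level **levels,
+				      struct toy_data **data)
+{
+	/* one allocation: the pool, then 3 levels, then the op data */
+	struct toy_pool *pool = malloc(sizeof(*pool) + 3 * sizeof(**levels) +
+				       sizeof(**data));
+	if (pool == NULL)
+		return NULL;
+	*levels = (struct toy_level *)(pool + 1); /* levels follow the pool */
+	*data = (struct toy_data *)(*levels + 3); /* data follows 3 levels */
+	return pool;
+}
+
+/* a single free(pool) then releases everything at once */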
66592+
66593+/* form carry queue to perform paste of @data with @key at @coord, and launch
66594+ its execution by calling carry().
66595+
66596+ Instruct carry to update @lh if, after balancing, the insertion coord
66597+ moves into a different block.
66598+
66599+*/
66600+static int paste_with_carry(coord_t * coord, /* coord of paste */
66601+ lock_handle * lh, /* lock handle of node
66602+ * where item is
66603+ * pasted */
66604+ reiser4_item_data * data, /* parameters of new
66605+ * item */
66606+ const reiser4_key * key, /* key of new item */
66607+ unsigned flags /* paste flags */ )
66608+{
66609+ int result;
66610+ carry_pool *pool;
66611+ carry_level *lowest_level;
66612+ carry_insert_data *cdata;
66613+ carry_op *op;
66614+
66615+ assert("umka-315", coord != NULL);
66616+ assert("umka-316", key != NULL);
66617+
66618+ pool =
66619+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66620+ sizeof(*cdata));
66621+ if (IS_ERR(pool))
66622+ return PTR_ERR(pool);
66623+ lowest_level = (carry_level *) (pool + 1);
66624+ init_carry_level(lowest_level, pool);
66625+
66626+ op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
66627+ if (IS_ERR(op) || (op == NULL)) {
66628+ done_carry_pool(pool);
66629+ return RETERR(op ? PTR_ERR(op) : -EIO);
66630+ }
66631+ cdata = (carry_insert_data *) (lowest_level + 3);
66632+ cdata->coord = coord;
66633+ cdata->data = data;
66634+ cdata->key = key;
66635+ op->u.paste.d = cdata;
66636+ if (flags == 0)
66637+ flags = znode_get_tree(coord->node)->carry.paste_flags;
66638+ op->u.paste.flags = flags;
66639+ op->u.paste.type = COPT_ITEM_DATA;
66640+ if (lh != NULL) {
66641+ lowest_level->track_type = CARRY_TRACK_CHANGE;
66642+ lowest_level->tracked = lh;
66643+ }
66644+
66645+ result = reiser4_carry(lowest_level, NULL);
66646+ done_carry_pool(pool);
66647+
66648+ return result;
66649+}
66650+
66651+/* insert item at the given coord.
66652+
66653+ First try to skip carry by directly calling the ->create_item() method of
66654+ the node plugin. If this is impossible (there is not enough free space in
66655+ the node, or the new item would become the leftmost in the node), call
66656+ insert_with_carry_by_coord(), which will do a full carry().
66657+
66658+*/
66659+insert_result insert_by_coord(coord_t * coord /* coord where to
66660+ * insert. coord->node has
66661+ * to be write locked by
66662+ * caller */ ,
66663+ reiser4_item_data * data /* data to be
66664+ * inserted */ ,
66665+ const reiser4_key * key /* key of new item */ ,
66666+ lock_handle * lh /* lock handle of write
66667+ * lock on node */ ,
66668+ __u32 flags /* insertion flags */ )
66669+{
66670+ unsigned item_size;
66671+ int result;
66672+ znode *node;
66673+
66674+ assert("vs-247", coord != NULL);
66675+ assert("vs-248", data != NULL);
66676+ assert("vs-249", data->length >= 0);
66677+ assert("nikita-1191", znode_is_write_locked(coord->node));
66678+
66679+ node = coord->node;
66680+ coord_clear_iplug(coord);
66681+ result = zload(node);
66682+ if (result != 0)
66683+ return result;
66684+
66685+ item_size = space_needed(node, NULL, data, 1);
66686+ if (item_size > znode_free_space(node) &&
66687+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
66688+ && (flags & COPI_DONT_ALLOCATE)) {
66689+ /* we are forced to use free space of coord->node and new item
66690+ does not fit into it.
66691+
66692+ Currently we get here only when we allocate and copy units
66693+ of extent item from a node to its left neighbor during
66694+ "squalloc"-ing. If @node (this is left neighbor) does not
66695+ have enough free space - we do not want to attempt any
66696+ shifting or allocation, because we are squeezing and
66697+ everything to the left of @node is tightly packed.
66698+ */
66699+ result = -E_NODE_FULL;
66700+ } else if ((item_size <= znode_free_space(node)) &&
66701+ !coord_is_before_leftmost(coord) &&
66702+ (node_plugin_by_node(node)->fast_insert != NULL)
66703+ && node_plugin_by_node(node)->fast_insert(coord)) {
66704+ /* shortcut insertion without carry() overhead.
66705+
66706+ Only possible if:
66707+
66708+ - there is enough free space
66709+
66710+ - insertion is not into the leftmost position in a node
66711+ (otherwise it would require updating of delimiting key in a
66712+ parent)
66713+
66714+ - node plugin agrees with this
66715+
66716+ */
66717+ result =
66718+ node_plugin_by_node(node)->create_item(coord, key, data,
66719+ NULL);
66720+ znode_make_dirty(node);
66721+ } else {
66722+ /* otherwise do full-fledged carry(). */
66723+ result =
66724+ insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
66725+ flags);
66726+ }
66727+ zrelse(node);
66728+ return result;
66729+}
66730+
66731+/* @coord is set to leaf level and @data is to be inserted to twig level */
66732+insert_result
66733+insert_extent_by_coord(coord_t * coord
66734+			/* coord where to insert. coord->node has to be
66735+			 * write locked by caller */
66736+			,
66737+			reiser4_item_data * data /* data to be inserted */ ,
66738+			const reiser4_key * key /* key of new item */ ,
66739+			lock_handle * lh /* lock handle of write lock
66740+					  * on node */ )
66741+{
66742+ assert("vs-405", coord != NULL);
66743+ assert("vs-406", data != NULL);
66744+ assert("vs-407", data->length > 0);
66745+ assert("vs-408", znode_is_write_locked(coord->node));
66746+ assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
66747+
66748+ return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
66749+ 0 /*flags */ );
66750+}
66751+
66752+/* Insert into the item at the given coord.
66753+
66754+ First try to skip carry by directly calling the ->paste() method of the
66755+ item plugin. If this is impossible (there is not enough free space in the
66756+ node, or we are pasting into the leftmost position in the node), call
66757+ paste_with_carry(), which will do a full carry().
66758+
66759+*/
66760+/* paste_into_item */
66761+int insert_into_item(coord_t * coord /* coord of pasting */ ,
66762+ lock_handle * lh /* lock handle on node involved */ ,
66763+ const reiser4_key * key /* key of unit being pasted */ ,
66764+ reiser4_item_data * data /* parameters for new unit */ ,
66765+ unsigned flags /* insert/paste flags */ )
66766+{
66767+ int result;
66768+ int size_change;
66769+ node_plugin *nplug;
66770+ item_plugin *iplug;
66771+
66772+ assert("umka-317", coord != NULL);
66773+ assert("umka-318", key != NULL);
66774+
66775+ iplug = item_plugin_by_coord(coord);
66776+ nplug = node_plugin_by_coord(coord);
66777+
66778+ assert("nikita-1480", iplug == data->iplug);
66779+
66780+ size_change = space_needed(coord->node, coord, data, 0);
66781+ if (size_change > (int)znode_free_space(coord->node) &&
66782+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
66783+ && (flags & COPI_DONT_ALLOCATE)) {
66784+ /* we are forced to use free space of coord->node and new data
66785+ does not fit into it. */
66786+ return -E_NODE_FULL;
66787+ }
66788+
66789+ /* shortcut paste without carry() overhead.
66790+
66791+ Only possible if:
66792+
66793+ - there is enough free space
66794+
66795+ - paste is not into the leftmost unit in a node (otherwise
66796+ it would require updating of delimiting key in a parent)
66797+
66798+ - node plugin agrees with this
66799+
66800+ - item plugin agrees with us
66801+ */
66802+ if (size_change <= (int)znode_free_space(coord->node) &&
66803+ (coord->item_pos != 0 ||
66804+ coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
66805+ coord->unit_pos != 0 && nplug->fast_paste != NULL &&
66806+ nplug->fast_paste(coord) &&
66807+ iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
66808+ if (size_change > 0)
66809+ nplug->change_item_size(coord, size_change);
66810+ /* NOTE-NIKITA: huh? where @key is used? */
66811+ result = iplug->b.paste(coord, data, NULL);
66812+ if (size_change < 0)
66813+ nplug->change_item_size(coord, size_change);
66814+ znode_make_dirty(coord->node);
66815+ } else
66816+ /* otherwise do full-fledged carry(). */
66817+ result = paste_with_carry(coord, lh, data, key, flags);
66818+ return result;
66819+}
66820+
66821+/* this either appends or truncates item @coord */
66822+int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
66823+ reiser4_item_data * data /* parameters of resize */ ,
66824+ reiser4_key * key /* key of new unit */ ,
66825+ lock_handle * lh /* lock handle of node
66826+ * being modified */ ,
66827+ cop_insert_flag flags /* carry flags */ )
66828+{
66829+ int result;
66830+ znode *node;
66831+
66832+ assert("nikita-362", coord != NULL);
66833+ assert("nikita-363", data != NULL);
66834+ assert("vs-245", data->length != 0);
66835+
66836+ node = coord->node;
66837+ coord_clear_iplug(coord);
66838+ result = zload(node);
66839+ if (result != 0)
66840+ return result;
66841+
66842+ if (data->length < 0)
66843+ result = node_plugin_by_coord(coord)->shrink_item(coord,
66844+ -data->length);
66845+ else
66846+ result = insert_into_item(coord, lh, key, data, flags);
66847+
66848+ zrelse(node);
66849+ return result;
66850+}
66851+
66852+/* insert flow @f */
66853+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
66854+{
66855+ int result;
66856+ carry_pool *pool;
66857+ carry_level *lowest_level;
66858+ reiser4_item_data *data;
66859+ carry_op *op;
66860+
66861+ pool =
66862+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66863+ sizeof(*data));
66864+ if (IS_ERR(pool))
66865+ return PTR_ERR(pool);
66866+ lowest_level = (carry_level *) (pool + 1);
66867+ init_carry_level(lowest_level, pool);
66868+
66869+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
66870+ 0 /* operate directly on coord -> node */ );
66871+ if (IS_ERR(op) || (op == NULL)) {
66872+ done_carry_pool(pool);
66873+ return RETERR(op ? PTR_ERR(op) : -EIO);
66874+ }
66875+
66876+ /* these are permanent during insert_flow */
66877+ data = (reiser4_item_data *) (lowest_level + 3);
66878+ data->user = 1;
66879+ data->iplug = item_plugin_by_id(FORMATTING_ID);
66880+ data->arg = NULL;
66881+ /* data.length and data.data will be set before calling paste or
66882+ insert */
66883+ data->length = 0;
66884+ data->data = NULL;
66885+
66886+ op->u.insert_flow.flags = 0;
66887+ op->u.insert_flow.insert_point = coord;
66888+ op->u.insert_flow.flow = f;
66889+ op->u.insert_flow.data = data;
66890+ op->u.insert_flow.new_nodes = 0;
66891+
66892+ lowest_level->track_type = CARRY_TRACK_CHANGE;
66893+ lowest_level->tracked = lh;
66894+
66895+ result = reiser4_carry(lowest_level, NULL);
66896+ done_carry_pool(pool);
66897+
66898+ return result;
66899+}
66900+
66901+/* Given a coord in parent node, obtain a znode for the corresponding child */
66902+znode *child_znode(const coord_t * parent_coord /* coord of pointer to
66903+ * child */ ,
66904+ znode * parent /* parent of child */ ,
66905+ int incore_p /* if !0 only return child if already in
66906+ * memory */ ,
66907+ int setup_dkeys_p /* if !0 update delimiting keys of
66908+ * child */ )
66909+{
66910+ znode *child;
66911+
66912+ assert("nikita-1374", parent_coord != NULL);
66913+ assert("nikita-1482", parent != NULL);
66914+#if REISER4_DEBUG
66915+ if (setup_dkeys_p)
66916+ assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
66917+#endif
66918+ assert("nikita-2947", znode_is_any_locked(parent));
66919+
66920+ if (znode_get_level(parent) <= LEAF_LEVEL) {
66921+ /* trying to get child of leaf node */
66922+ warning("nikita-1217", "Child of maize?");
66923+ return ERR_PTR(RETERR(-EIO));
66924+ }
66925+ if (item_is_internal(parent_coord)) {
66926+ reiser4_block_nr addr;
66927+ item_plugin *iplug;
66928+ reiser4_tree *tree;
66929+
66930+ iplug = item_plugin_by_coord(parent_coord);
66931+ assert("vs-512", iplug->s.internal.down_link);
66932+ iplug->s.internal.down_link(parent_coord, NULL, &addr);
66933+
66934+ tree = znode_get_tree(parent);
66935+ if (incore_p)
66936+ child = zlook(tree, &addr);
66937+ else
66938+ child =
66939+ zget(tree, &addr, parent,
66940+ znode_get_level(parent) - 1,
66941+ reiser4_ctx_gfp_mask_get());
66942+ if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
66943+ set_child_delimiting_keys(parent, parent_coord, child);
66944+ } else {
66945+ warning("nikita-1483", "Internal item expected");
66946+ child = ERR_PTR(RETERR(-EIO));
66947+ }
66948+ return child;
66949+}
66950+
66951+/* remove znode from transaction */
66952+static void uncapture_znode(znode * node)
66953+{
66954+ struct page *page;
66955+
66956+ assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
66957+
66958+ if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
66959+ int ret;
66960+
66961+ /* An already allocated block goes right to the atom's delete set. */
66962+ ret =
66963+ reiser4_dealloc_block(znode_get_block(node), 0,
66964+ BA_DEFER | BA_FORMATTED);
66965+ if (ret)
66966+ warning("zam-942",
66967+ "can\'t add a block (%llu) number to atom's delete set\n",
66968+ (unsigned long long)(*znode_get_block(node)));
66969+
66970+ spin_lock_znode(node);
66971+ /* Here we return flush reserved block which was reserved at the
66972+ * moment when this allocated node was marked dirty and still
66973+ * not used by flush in node relocation procedure. */
66974+ if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
66975+ txn_atom *atom;
66976+
66977+ atom = jnode_get_atom(ZJNODE(node));
66978+ assert("zam-939", atom != NULL);
66979+ spin_unlock_znode(node);
66980+ flush_reserved2grabbed(atom, (__u64) 1);
66981+ spin_unlock_atom(atom);
66982+ } else
66983+ spin_unlock_znode(node);
66984+ } else {
66985+ /* znode has an assigned block which is counted as "fake
66986+ allocated". Return it back to "free blocks". */
66987+ fake_allocated2free((__u64) 1, BA_FORMATTED);
66988+ }
66989+
66990+ /*
66991+ * uncapture page from transaction. There is a possibility of a race
66992+ * with ->releasepage(): reiser4_releasepage() detaches page from this
66993+ * jnode and we have nothing to uncapture. To avoid this, get
66994+ * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
66995+ * will deal with released page itself.
66996+ */
66997+ spin_lock_znode(node);
66998+ page = znode_page(node);
66999+ if (likely(page != NULL)) {
67000+ /*
67001+ * reiser4_uncapture_page() can only be called when we are sure
67002+ * that znode is pinned in memory, which we are, because
67003+ * forget_znode() is only called from longterm_unlock_znode().
67004+ */
67005+ page_cache_get(page);
67006+ spin_unlock_znode(node);
67007+ lock_page(page);
67008+ reiser4_uncapture_page(page);
67009+ unlock_page(page);
67010+ page_cache_release(page);
67011+ } else {
67012+ txn_atom *atom;
67013+
67014+ /* handle "flush queued" znodes */
67015+ while (1) {
67016+ atom = jnode_get_atom(ZJNODE(node));
67017+ assert("zam-943", atom != NULL);
67018+
67019+ if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
67020+ || !atom->nr_running_queues)
67021+ break;
67022+
67023+ spin_unlock_znode(node);
67024+ reiser4_atom_wait_event(atom);
67025+ spin_lock_znode(node);
67026+ }
67027+
67028+ reiser4_uncapture_block(ZJNODE(node));
67029+ spin_unlock_atom(atom);
67030+ zput(node);
67031+ }
67032+}
67033+
67034+/* This is called from longterm_unlock_znode() when last lock is released from
67035+ the node that has been removed from the tree. At this point node is removed
67036+ from sibling list and its lock is invalidated. */
67037+void forget_znode(lock_handle * handle)
67038+{
67039+ znode *node;
67040+ reiser4_tree *tree;
67041+
67042+ assert("umka-319", handle != NULL);
67043+
67044+ node = handle->node;
67045+ tree = znode_get_tree(node);
67046+
67047+ assert("vs-164", znode_is_write_locked(node));
67048+ assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
67049+ assert_rw_locked(&(node->lock.guard));
67050+
67051+ /* We assume that this node was detached from its parent before
67052+ * unlocking, so there is no way to reach this node from the parent
67053+ * through a down link. The node should have no children and thus can't be
67054+ * reached from them by their parent pointers. The only way to obtain a
67055+ * reference to the node is to use sibling pointers from its left and
67056+ * right neighbors. In the next several lines we remove the node from
67057+ * the sibling list. */
67058+
67059+ write_lock_tree(tree);
67060+ sibling_list_remove(node);
67061+ znode_remove(node, tree);
67062+ write_unlock_tree(tree);
67063+
67064+ /* Here we set JNODE_DYING and cancel all pending lock requests. It
67065+ * forces all lock requestor threads to repeat iterations of getting
67066+ * lock on a child, neighbor or parent node. But, those threads can't
67067+ * come to this node again, because this node is no longer a child,
67068+ * neighbor or parent of any other node. This order of znode
67069+ * invalidation does not allow other threads to waste cpu time in a busy
67070+ * loop, trying to lock the dying object. The exception is in the flush
67071+ * code, when we take the node directly from the atom's capture list. */
67072+ reiser4_invalidate_lock(handle);
67073+ uncapture_znode(node);
67074+}
67075+
67076+/* Check that internal item at @pointer really contains pointer to @child. */
67077+int check_tree_pointer(const coord_t * pointer /* would-be pointer to
67078+ * @child */ ,
67079+ const znode * child /* child znode */ )
67080+{
67081+ assert("nikita-1016", pointer != NULL);
67082+ assert("nikita-1017", child != NULL);
67083+ assert("nikita-1018", pointer->node != NULL);
67084+
67085+ assert("nikita-1325", znode_is_any_locked(pointer->node));
67086+
67087+ assert("nikita-2985",
67088+ znode_get_level(pointer->node) == znode_get_level(child) + 1);
67089+
67090+ coord_clear_iplug((coord_t *) pointer);
67091+
67092+ if (coord_is_existing_unit(pointer)) {
67093+ item_plugin *iplug;
67094+ reiser4_block_nr addr;
67095+
67096+ if (item_is_internal(pointer)) {
67097+ iplug = item_plugin_by_coord(pointer);
67098+ assert("vs-513", iplug->s.internal.down_link);
67099+ iplug->s.internal.down_link(pointer, NULL, &addr);
67100+ /* check that cached value is correct */
67101+ if (disk_addr_eq(&addr, znode_get_block(child))) {
67102+ return NS_FOUND;
67103+ }
67104+ }
67105+ }
67106+ /* warning ("jmacd-1002", "tree pointer incorrect"); */
67107+ return NS_NOT_FOUND;
67108+}
67109+
67110+/* find coord of pointer to new @child in @parent.
67111+
67112+ Find the &coord_t in the @parent where pointer to a given @child will
67113+ be in.
67114+
67115+*/
67116+int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
67117+ znode *
67118+ child UNUSED_ARG /* child znode, passed locked */ ,
67119+ znode * left /* left brother of new node */ ,
67120+ coord_t * result /* where result is stored in */ )
67121+{
67122+ int ret;
67123+
67124+ assert("nikita-1486", parent != NULL);
67125+ assert("nikita-1487", child != NULL);
67126+ assert("nikita-1488", result != NULL);
67127+
67128+ ret = find_child_ptr(parent, left, result);
67129+ if (ret != NS_FOUND) {
67130+ warning("nikita-1489", "Cannot find brother position: %i", ret);
67131+ return RETERR(-EIO);
67132+ } else {
67133+ result->between = AFTER_UNIT;
67134+ return RETERR(NS_NOT_FOUND);
67135+ }
67136+}
67137+
67138+/* find coord of pointer to @child in @parent.
67139+
67140+ Find the &coord_t in the @parent where pointer to a given @child is in.
67141+
67142+*/
67143+int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
67144+ znode * child /* child znode, passed locked */ ,
67145+ coord_t * result /* where result is stored in */ )
67146+{
67147+ int lookup_res;
67148+ node_plugin *nplug;
67149+ /* left delimiting key of a child */
67150+ reiser4_key ld;
67151+ reiser4_tree *tree;
67152+
67153+ assert("nikita-934", parent != NULL);
67154+ assert("nikita-935", child != NULL);
67155+ assert("nikita-936", result != NULL);
67156+ assert("zam-356", znode_is_loaded(parent));
67157+
67158+ coord_init_zero(result);
67159+ result->node = parent;
67160+
67161+ nplug = parent->nplug;
67162+ assert("nikita-939", nplug != NULL);
67163+
67164+ tree = znode_get_tree(parent);
67165+ /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
67166+ * not aliased to ->in_parent of some znode. Otherwise,
67167+ * parent_coord_to_coord() below would modify data protected by tree
67168+ * lock. */
67169+ read_lock_tree(tree);
67170+ /* fast path. Try to use cached value. Lock tree to keep
67171+ node->pos_in_parent and pos->*_blocknr consistent. */
67172+ if (child->in_parent.item_pos + 1 != 0) {
67173+ parent_coord_to_coord(&child->in_parent, result);
67174+ if (check_tree_pointer(result, child) == NS_FOUND) {
67175+ read_unlock_tree(tree);
67176+ return NS_FOUND;
67177+ }
67178+
67179+ child->in_parent.item_pos = (unsigned short)~0;
67180+ }
67181+ read_unlock_tree(tree);
67182+
67183+ /* if the above failed, find some key from @child. We are looking for
67184+ the least key in the child. */
67185+ read_lock_dk(tree);
67186+ ld = *znode_get_ld_key(child);
67187+ read_unlock_dk(tree);
67188+ /*
67189+ * now, look up the parent with the key just found. Note that a left
67190+ * delimiting key doesn't identify a node uniquely, because (in an
67191+ * extremely rare case) two nodes can have equal left delimiting keys,
67192+ * if one of them is completely filled with directory entries that all
67193+ * happen to be hash collisions. But we check the block number in
67194+ * check_tree_pointer() and, so, are safe.
67195+ */
67196+ lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
67197+ /* update cached pos_in_node */
67198+ if (lookup_res == NS_FOUND) {
67199+ write_lock_tree(tree);
67200+ coord_to_parent_coord(result, &child->in_parent);
67201+ write_unlock_tree(tree);
67202+ lookup_res = check_tree_pointer(result, child);
67203+ }
67204+ if (lookup_res == NS_NOT_FOUND)
67205+ lookup_res = find_child_by_addr(parent, child, result);
67206+ return lookup_res;
67207+}
67208+
67209+/* find coord of pointer to @child in @parent by scanning
67210+
67211+ Find the &coord_t in the @parent where pointer to a given @child
67212+ is in by scanning all internal items in @parent and comparing block
67213+ numbers in them with that of @child.
67214+
67215+*/
67216+static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
67217+ znode * child /* child znode, passed locked */ ,
67218+ coord_t * result /* where result is stored in */ )
67219+{
67220+ int ret;
67221+
67222+ assert("nikita-1320", parent != NULL);
67223+ assert("nikita-1321", child != NULL);
67224+ assert("nikita-1322", result != NULL);
67225+
67226+ ret = NS_NOT_FOUND;
67227+
67228+ for_all_units(result, parent) {
67229+ if (check_tree_pointer(result, child) == NS_FOUND) {
67230+ write_lock_tree(znode_get_tree(parent));
67231+ coord_to_parent_coord(result, &child->in_parent);
67232+ write_unlock_tree(znode_get_tree(parent));
67233+ ret = NS_FOUND;
67234+ break;
67235+ }
67236+ }
67237+ return ret;
67238+}
67239+
67240+/* true, if @addr is an "unallocated block number", which is just an address
67241+ with the highest bit set. */
67242+int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to
67243+ * check */ )
67244+{
67245+ assert("nikita-1766", addr != NULL);
67246+ cassert(sizeof(reiser4_block_nr) == 8);
67247+ return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
67248+ REISER4_UNALLOCATED_STATUS_VALUE;
67249+}
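+
+/* A standalone illustration of the tagging scheme: the most significant bit
+   of a 64-bit block number carries the allocation status. The TOY_* mask
+   values are assumptions for illustration, matching only the "highest bit
+   set" description; the real values live in the REISER4_BLOCKNR_* constants
+   used above. */
+#include <stdint.h>
+
+#define TOY_BLOCKNR_STATUS_BIT_MASK	0x8000000000000000ULL
+#define TOY_UNALLOCATED_STATUS_VALUE	0x8000000000000000ULL
+
+static int toy_is_unallocated(uint64_t addr)
+{
+	/* status bit set means "unallocated"; the low 63 bits are the value */
+	return (addr & TOY_BLOCKNR_STATUS_BIT_MASK) ==
+	    TOY_UNALLOCATED_STATUS_VALUE;
+}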
67250+
67251+/* returns true if removing bytes of given range of key [from_key, to_key]
67252+ causes removing of whole item @from */
67253+static int
67254+item_removed_completely(coord_t * from, const reiser4_key * from_key,
67255+ const reiser4_key * to_key)
67256+{
67257+ item_plugin *iplug;
67258+ reiser4_key key_in_item;
67259+
67260+ assert("umka-325", from != NULL);
67261+ assert("", item_is_extent(from));
67262+
67263+ /* check the first key, just in case */
67264+ item_key_by_coord(from, &key_in_item);
67265+ if (keygt(from_key, &key_in_item))
67266+ return 0;
67267+
67268+ /* check last key */
67269+ iplug = item_plugin_by_coord(from);
67270+ assert("vs-611", iplug && iplug->s.file.append_key);
67271+
67272+ iplug->s.file.append_key(from, &key_in_item);
67273+ set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
67274+
67275+ if (keylt(to_key, &key_in_item))
67276+ /* last byte is not removed */
67277+ return 0;
67278+ return 1;
67279+}
67280+
67281+/* helper function for prepare_twig_kill(): @left and @right are formatted
67282+ * neighbors of extent item being completely removed. Load and lock neighbors
67283+ * and store lock handles into @kdata for later use by kill_hook_extent() */
67284+static int
67285+prepare_children(znode * left, znode * right, carry_kill_data * kdata)
67286+{
67287+ int result;
67288+ int left_loaded;
67289+ int right_loaded;
67290+
67291+ result = 0;
67292+ left_loaded = right_loaded = 0;
67293+
67294+ if (left != NULL) {
67295+ result = zload(left);
67296+ if (result == 0) {
67297+ left_loaded = 1;
67298+ result = longterm_lock_znode(kdata->left, left,
67299+ ZNODE_READ_LOCK,
67300+ ZNODE_LOCK_LOPRI);
67301+ }
67302+ }
67303+ if (result == 0 && right != NULL) {
67304+ result = zload(right);
67305+ if (result == 0) {
67306+ right_loaded = 1;
67307+ result = longterm_lock_znode(kdata->right, right,
67308+ ZNODE_READ_LOCK,
67309+ ZNODE_LOCK_HIPRI |
67310+ ZNODE_LOCK_NONBLOCK);
67311+ }
67312+ }
67313+ if (result != 0) {
67314+ done_lh(kdata->left);
67315+ done_lh(kdata->right);
67316+ if (left_loaded != 0)
67317+ zrelse(left);
67318+ if (right_loaded != 0)
67319+ zrelse(right);
67320+ }
67321+ return result;
67322+}
67323+
67324+static void done_children(carry_kill_data * kdata)
67325+{
67326+ if (kdata->left != NULL && kdata->left->node != NULL) {
67327+ zrelse(kdata->left->node);
67328+ done_lh(kdata->left);
67329+ }
67330+ if (kdata->right != NULL && kdata->right->node != NULL) {
67331+ zrelse(kdata->right->node);
67332+ done_lh(kdata->right);
67333+ }
67334+}
67335+
67336+/* part of cut_node. It is called when cut_node is used to remove or cut part
67337+ of an extent item. When the head of that item is removed, we have to update
67338+ the right delimiting key of the left neighbor of the extent. When the item
67339+ is removed completely, we have to set a sibling link between the left and
67340+ right neighbors of the removed extent. This may return -E_DEADLOCK while
67341+ trying to get the left neighbor locked; the caller should then repeat the
67342+ attempt
67343+/* Audited by: umka (2002.06.16) */
67344+static int
67345+prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
67346+{
67347+ int result;
67348+ reiser4_key key;
67349+ lock_handle left_lh;
67350+ lock_handle right_lh;
67351+ coord_t left_coord;
67352+ coord_t *from;
67353+ znode *left_child;
67354+ znode *right_child;
67355+ reiser4_tree *tree;
67356+ int left_zloaded_here, right_zloaded_here;
67357+
67358+ from = kdata->params.from;
67359+ assert("umka-326", from != NULL);
67360+ assert("umka-327", kdata->params.to != NULL);
67361+
67362+ /* for one extent item only yet */
67363+ assert("vs-591", item_is_extent(from));
67364+ assert("vs-592", from->item_pos == kdata->params.to->item_pos);
67365+
67366+ if ((kdata->params.from_key
67367+ && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
67368+ || from->unit_pos != 0) {
67369+ /* head of item @from is not removed, there is nothing to
67370+ worry about */
67371+ return 0;
67372+ }
67373+
67374+ result = 0;
67375+ left_zloaded_here = 0;
67376+ right_zloaded_here = 0;
67377+
67378+ left_child = right_child = NULL;
67379+
67380+ coord_dup(&left_coord, from);
67381+ init_lh(&left_lh);
67382+ init_lh(&right_lh);
67383+ if (coord_prev_unit(&left_coord)) {
67384+ /* @from is leftmost item in its node */
67385+ if (!locked_left_neighbor) {
67386+ result =
67387+ reiser4_get_left_neighbor(&left_lh, from->node,
67388+ ZNODE_READ_LOCK,
67389+ GN_CAN_USE_UPPER_LEVELS);
67390+ switch (result) {
67391+ case 0:
67392+ break;
67393+ case -E_NO_NEIGHBOR:
67394+ /* there is no formatted node to the left of
67395+ from->node */
67396+ warning("vs-605",
67397+ "extent item has smallest key in "
67398+ "the tree and it is about to be removed");
67399+ return 0;
67400+ case -E_DEADLOCK:
67401+ /* need to restart */
67402+ default:
67403+ return result;
67404+ }
67405+
67406+ /* we have acquired left neighbor of from->node */
67407+ result = zload(left_lh.node);
67408+ if (result)
67409+ goto done;
67410+
67411+ locked_left_neighbor = left_lh.node;
67412+ } else {
67413+ /* squalloc_right_twig_cut should have supplied locked
67414+ * left neighbor */
67415+ assert("vs-834",
67416+ znode_is_write_locked(locked_left_neighbor));
67417+ result = zload(locked_left_neighbor);
67418+ if (result)
67419+ return result;
67420+ }
67421+
67422+ left_zloaded_here = 1;
67423+ coord_init_last_unit(&left_coord, locked_left_neighbor);
67424+ }
67425+
67426+ if (!item_is_internal(&left_coord)) {
67427+ /* what else but extent can be on twig level */
67428+ assert("vs-606", item_is_extent(&left_coord));
67429+
67430+ /* there is no left formatted child */
67431+ if (left_zloaded_here)
67432+ zrelse(locked_left_neighbor);
67433+ done_lh(&left_lh);
67434+ return 0;
67435+ }
67436+
67437+ tree = znode_get_tree(left_coord.node);
67438+ left_child = child_znode(&left_coord, left_coord.node, 1, 0);
67439+
67440+ if (IS_ERR(left_child)) {
67441+ result = PTR_ERR(left_child);
67442+ goto done;
67443+ }
67444+
67445+ /* left child is acquired, calculate new right delimiting key for it
67446+ and get right child if it is necessary */
67447+ if (item_removed_completely
67448+ (from, kdata->params.from_key, kdata->params.to_key)) {
67449+ /* try to get right child of removed item */
67450+ coord_t right_coord;
67451+
67452+ assert("vs-607",
67453+ kdata->params.to->unit_pos ==
67454+ coord_last_unit_pos(kdata->params.to));
67455+ coord_dup(&right_coord, kdata->params.to);
67456+ if (coord_next_unit(&right_coord)) {
67457+ /* @to is rightmost unit in the node */
67458+ result =
67459+ reiser4_get_right_neighbor(&right_lh, from->node,
67460+ ZNODE_READ_LOCK,
67461+ GN_CAN_USE_UPPER_LEVELS);
67462+ switch (result) {
67463+ case 0:
67464+ result = zload(right_lh.node);
67465+ if (result)
67466+ goto done;
67467+
67468+ right_zloaded_here = 1;
67469+ coord_init_first_unit(&right_coord,
67470+ right_lh.node);
67471+ item_key_by_coord(&right_coord, &key);
67472+ break;
67473+
67474+ case -E_NO_NEIGHBOR:
67475+ /* there is no formatted node to the right of
67476+ from->node */
67477+ read_lock_dk(tree);
67478+ key = *znode_get_rd_key(from->node);
67479+ read_unlock_dk(tree);
67480+ right_coord.node = NULL;
67481+ result = 0;
67482+ break;
67483+ default:
67484+ /* real error */
67485+ goto done;
67486+ }
67487+ } else {
67488+ /* there is an item to the right of @from - take its key */
67489+ item_key_by_coord(&right_coord, &key);
67490+ }
67491+
67492+ /* try to get right child of @from */
67493+ if (right_coord.node && /* there is right neighbor of @from */
67494+ item_is_internal(&right_coord)) { /* it is internal item */
67495+ right_child = child_znode(&right_coord,
67496+ right_coord.node, 1, 0);
67497+
67498+ if (IS_ERR(right_child)) {
67499+ result = PTR_ERR(right_child);
67500+ goto done;
67501+ }
67502+
67503+ }
67504+ /* the whole extent is removed between znodes left_child and right_child. Prepare them for
67505+ linking and for the update of the right delimiting key of left_child */
67506+ result = prepare_children(left_child, right_child, kdata);
67507+ } else {
67508+ /* the head of item @to is removed. left_child has to get a right delimiting key update. Prepare it for that */
67509+ result = prepare_children(left_child, NULL, kdata);
67510+ }
67511+
67512+ done:
67513+ if (right_child)
67514+ zput(right_child);
67515+ if (right_zloaded_here)
67516+ zrelse(right_lh.node);
67517+ done_lh(&right_lh);
67518+
67519+ if (left_child)
67520+ zput(left_child);
67521+ if (left_zloaded_here)
67522+ zrelse(locked_left_neighbor);
67523+ done_lh(&left_lh);
67524+ return result;
67525+}
67526+
67527+/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
67528+ are to be cut completely */
67529+/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
67530+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */
67531+ const reiser4_key * to_key, /* last key to be removed */
67532+ reiser4_key *
67533+ smallest_removed /* smallest key actually removed */ )
67534+{
67535+ int result;
67536+ carry_pool *pool;
67537+ carry_level *lowest_level;
67538+ carry_cut_data *cut_data;
67539+ carry_op *op;
67540+
67541+ assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
67542+
67543+ pool =
67544+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67545+ sizeof(*cut_data));
67546+ if (IS_ERR(pool))
67547+ return PTR_ERR(pool);
67548+ lowest_level = (carry_level *) (pool + 1);
67549+ init_carry_level(lowest_level, pool);
67550+
67551+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
67552+ assert("vs-1509", op != 0);
67553+ if (IS_ERR(op)) {
67554+ done_carry_pool(pool);
67555+ return PTR_ERR(op);
67556+ }
67557+
67558+ cut_data = (carry_cut_data *) (lowest_level + 3);
67559+ cut_data->params.from = from;
67560+ cut_data->params.to = to;
67561+ cut_data->params.from_key = from_key;
67562+ cut_data->params.to_key = to_key;
67563+ cut_data->params.smallest_removed = smallest_removed;
67564+
67565+ op->u.cut_or_kill.is_cut = 1;
67566+ op->u.cut_or_kill.u.cut = cut_data;
67567+
67568+ result = reiser4_carry(lowest_level, NULL);
67569+ done_carry_pool(pool);
67570+
67571+ return result;
67572+}
67573+
67574+/* cut part of the node
67575+
67576+ Cut part or whole content of node.
67577+
67578+ cut data between @from and @to of @from->node and call carry() to make
67579+ corresponding changes in the tree. @from->node may become empty. If so -
67580+ pointer to it will be removed. Neighboring nodes are not changed. Smallest
67581+ removed key is stored in @smallest_removed
67582+
67583+*/
67584+int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */
67585+ coord_t * to, /* coord of the last unit/item that will be eliminated */
67586+ const reiser4_key * from_key, /* first key to be removed */
67587+ const reiser4_key * to_key, /* last key to be removed */
67588+ reiser4_key * smallest_removed, /* smallest key actually removed */
67589+ znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor
67590+ * locked (in squalloc_right_twig_cut, namely) */
67591+ struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to
67592+ invalidate pages together with item pointing to them */
67593+ int truncate /* true if this call is made for file truncate */ )
67594+{
67595+ int result;
67596+ carry_pool *pool;
67597+ carry_level *lowest_level;
67598+ carry_kill_data *kdata;
67599+ lock_handle *left_child;
67600+ lock_handle *right_child;
67601+ carry_op *op;
67602+
67603+ assert("umka-328", from != NULL);
67604+ assert("vs-316", !node_is_empty(from->node));
67605+ assert("nikita-1812", coord_is_existing_unit(from)
67606+ && coord_is_existing_unit(to));
67607+
67608+ /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
67609+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67610+ sizeof(carry_kill_data) +
67611+ 2 * sizeof(lock_handle) +
67612+ 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
67613+ if (IS_ERR(pool))
67614+ return PTR_ERR(pool);
67615+
67616+ lowest_level = (carry_level *) (pool + 1);
67617+ init_carry_level(lowest_level, pool);
67618+
67619+ kdata = (carry_kill_data *) (lowest_level + 3);
67620+ left_child = (lock_handle *) (kdata + 1);
67621+ right_child = left_child + 1;
67622+
67623+ init_lh(left_child);
67624+ init_lh(right_child);
67625+
67626+ kdata->params.from = from;
67627+ kdata->params.to = to;
67628+ kdata->params.from_key = from_key;
67629+ kdata->params.to_key = to_key;
67630+ kdata->params.smallest_removed = smallest_removed;
67631+ kdata->params.truncate = truncate;
67632+ kdata->flags = 0;
67633+ kdata->inode = inode;
67634+ kdata->left = left_child;
67635+ kdata->right = right_child;
67636+ /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
67637+ kdata->buf = (char *)(right_child + 1);
67638+
67639+ if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
67640+ /* left child of extent item may have to get updated right
67641+ delimiting key and to get linked with right child of extent
67642+ @from if it will be removed completely */
67643+ result = prepare_twig_kill(kdata, locked_left_neighbor);
67644+ if (result) {
67645+ done_children(kdata);
67646+ done_carry_pool(pool);
67647+ return result;
67648+ }
67649+ }
67650+
67651+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
67652+ if (IS_ERR(op) || (op == NULL)) {
67653+ done_children(kdata);
67654+ done_carry_pool(pool);
67655+ return RETERR(op ? PTR_ERR(op) : -EIO);
67656+ }
67657+
67658+ op->u.cut_or_kill.is_cut = 0;
67659+ op->u.cut_or_kill.u.kill = kdata;
67660+
67661+ result = reiser4_carry(lowest_level, NULL);
67662+
67663+ done_children(kdata);
67664+ done_carry_pool(pool);
67665+ return result;
67666+}
67667+
67668+void
67669+fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
67670+{
67671+ if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
67672+ pgoff_t start_pg, end_pg;
67673+
67674+ start_pg = start >> PAGE_CACHE_SHIFT;
67675+ end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
67676+
67677+ if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
67678+ /*
67679+ * kill up to the page boundary.
67680+ */
67681+ assert("vs-123456", start_pg == end_pg);
67682+ reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
67683+ truncate);
67684+ } else if (start_pg != end_pg) {
67685+ /*
67686+ * page boundary is within killed portion of node.
67687+ */
67688+ assert("vs-654321", end_pg - start_pg == 1);
67689+ reiser4_invalidate_pages(inode->i_mapping, end_pg,
67690+ end_pg - start_pg, 1);
67691+ }
67692+ }
67693+ inode_sub_bytes(inode, end - start);
67694+}
67695+
67696+/**
67697+ * Delete whole @node from the reiser4 tree without loading it.
67698+ *
67700+ * @node: node to be deleted,
67701+ * @smallest_removed: leftmost key of deleted node,
67702+ * @object: inode pointer, if we truncate a file body.
67703+ * @truncate: true if called for file truncate.
67704+ *
67705+ * @return: 0 if success, error code otherwise.
67706+ *
67707+ * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
67708+ * contains the right value of the smallest removed key from the previous
67709+ * cut_worker() iteration. This is needed for proper accounting of
67710+ * "i_blocks" and "i_bytes" fields of the @object.
67711+ */
67712+int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
67713+ struct inode *object, int truncate)
67714+{
67715+ lock_handle parent_lock;
67716+ coord_t cut_from;
67717+ coord_t cut_to;
67718+ reiser4_tree *tree;
67719+ int ret;
67720+
67721+ assert("zam-937", node != NULL);
67722+ assert("zam-933", znode_is_write_locked(node));
67723+ assert("zam-999", smallest_removed != NULL);
67724+
67725+ init_lh(&parent_lock);
67726+
67727+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
67728+ if (ret)
67729+ return ret;
67730+
67731+ assert("zam-934", !znode_above_root(parent_lock.node));
67732+
67733+ ret = zload(parent_lock.node);
67734+ if (ret)
67735+ goto failed_nozrelse;
67736+
67737+ ret = find_child_ptr(parent_lock.node, node, &cut_from);
67738+ if (ret)
67739+ goto failed;
67740+
67741+ /* decrement the child counter and set the parent pointer to NULL before
67742+ deleting the item from the parent node, because of checks in
67743+ internal_kill_item_hook (we can delete the last item from the parent
67744+ node, the parent node is going to be deleted and its c_count should
67745+ be zero). */
67746+
67747+ tree = znode_get_tree(node);
67748+ write_lock_tree(tree);
67749+ init_parent_coord(&node->in_parent, NULL);
67750+ --parent_lock.node->c_count;
67751+ write_unlock_tree(tree);
67752+
67753+ assert("zam-989", item_is_internal(&cut_from));
67754+
67755+ /* @node should be deleted after unlocking. */
67756+ ZF_SET(node, JNODE_HEARD_BANSHEE);
67757+
67758+ /* remove a pointer from the parent node to the node being deleted. */
67759+ coord_dup(&cut_to, &cut_from);
67760+ /* FIXME: shouldn't this be kill_node_content */
67761+ ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
67762+ if (ret)
67763+ /* FIXME(Zam): Should we re-connect the node to its parent if
67764+ * cut_node fails? */
67765+ goto failed;
67766+
67767+ {
67768+ reiser4_tree *tree = current_tree;
67769+ __u64 start_offset = 0, end_offset = 0;
67770+
67771+ read_lock_tree(tree);
67772+ write_lock_dk(tree);
67773+ if (object) {
67774+ /* We use @smallest_removed and the left delimiting key of
67775+ * the current node for @object->i_blocks, i_bytes
67776+ * calculation. We assume that the items after the
67777+ * *@smallest_removed key have been deleted from the
67778+ * file body. */
67779+ start_offset = get_key_offset(znode_get_ld_key(node));
67780+ end_offset = get_key_offset(smallest_removed);
67781+ }
67782+
67783+ assert("zam-1021", znode_is_connected(node));
67784+ if (node->left)
67785+ znode_set_rd_key(node->left, znode_get_rd_key(node));
67786+
67787+ *smallest_removed = *znode_get_ld_key(node);
67788+
67789+ write_unlock_dk(tree);
67790+ read_unlock_tree(tree);
67791+
67792+ if (object) {
67793+ /* actions to be performed on items on their removal from the tree are
67794+ normally done in the special item method - kill_hook. Here, for
67795+ optimization reasons, we avoid reading the node containing the item we
67796+ remove and so cannot call the item's kill hook. Instead we call a function
67797+ which does exactly what the tail kill hook does, on the assumption that the node we avoid reading contains only one item and that item is a tail one. */
67798+ fake_kill_hook_tail(object, start_offset, end_offset,
67799+ truncate);
67800+ }
67801+ }
67802+ failed:
67803+ zrelse(parent_lock.node);
67804+ failed_nozrelse:
67805+ done_lh(&parent_lock);
67806+
67807+ return ret;
67808+}
67809+
67810+static int can_delete(const reiser4_key *key, znode *node)
67811+{
67812+ int result;
67813+
67814+ read_lock_dk(current_tree);
67815+ result = keyle(key, znode_get_ld_key(node));
67816+ read_unlock_dk(current_tree);
67817+ return result;
67818+}
67819+
67820+/**
67821+ * This subroutine is not optimal, but the implementation seems to
67822+ * be easier this way.
67823+ *
67824+ * @tap: the point deletion process begins from,
67825+ * @from_key: the beginning of the deleted key range,
67826+ * @to_key: the end of the deleted key range,
67827+ * @smallest_removed: the smallest removed key,
67828+ * @truncate: true if called for file truncate.
67829+ * @progress: return true if a progress in file items deletions was made,
67830+ * @smallest_removed value is actual in that case.
67831+ *
67832+ * @return: 0 if success, error code otherwise; -E_REPEAT means that a long
67833+ * reiser4_cut_tree operation was interrupted to allow an atom commit.
67834+ */
67835+int
67836+cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
67837+ const reiser4_key * to_key,
67838+ reiser4_key * smallest_removed, struct inode *object,
67839+ int truncate, int *progress)
67840+{
67841+ lock_handle next_node_lock;
67842+ coord_t left_coord;
67843+ int result;
67844+
67845+ assert("zam-931", tap->coord->node != NULL);
67846+ assert("zam-932", znode_is_write_locked(tap->coord->node));
67847+
67848+ *progress = 0;
67849+ init_lh(&next_node_lock);
67850+
67851+ while (1) {
67852+ znode *node; /* node from which items are cut */
67853+ node_plugin *nplug; /* node plugin for @node */
67854+
67855+ node = tap->coord->node;
67856+
67857+ /* Move next_node_lock to the next node on the left. */
67858+ result =
67859+ reiser4_get_left_neighbor(&next_node_lock, node,
67860+ ZNODE_WRITE_LOCK,
67861+ GN_CAN_USE_UPPER_LEVELS);
67862+ if (result != 0 && result != -E_NO_NEIGHBOR)
67863+ break;
67864+ /* Check whether we can delete the node as a whole. */
67865+ if (*progress && znode_get_level(node) == LEAF_LEVEL &&
67866+ can_delete(from_key, node)) {
67867+ result = reiser4_delete_node(node, smallest_removed,
67868+ object, truncate);
67869+ } else {
67870+ result = reiser4_tap_load(tap);
67871+ if (result)
67872+ return result;
67873+
67874+ /* Prepare the second (right) point for cut_node() */
67875+ if (*progress)
67876+ coord_init_last_unit(tap->coord, node);
67877+
67878+ else if (item_plugin_by_coord(tap->coord)->b.lookup ==
67879+ NULL)
67880+ /* set rightmost unit for the items without lookup method */
67881+ tap->coord->unit_pos =
67882+ coord_last_unit_pos(tap->coord);
67883+
67884+ nplug = node->nplug;
67885+
67886+ assert("vs-686", nplug);
67887+ assert("vs-687", nplug->lookup);
67888+
67889+ /* left_coord is leftmost unit cut from @node */
67890+ result = nplug->lookup(node, from_key,
67891+ FIND_MAX_NOT_MORE_THAN,
67892+ &left_coord);
67893+
67894+ if (IS_CBKERR(result))
67895+ break;
67896+
67897+ /* adjust coordinates so that they are set to existing units */
67898+ if (coord_set_to_right(&left_coord)
67899+ || coord_set_to_left(tap->coord)) {
67900+ result = 0;
67901+ break;
67902+ }
67903+
67904+ if (coord_compare(&left_coord, tap->coord) ==
67905+ COORD_CMP_ON_RIGHT) {
67906+ /* keys from @from_key to @to_key are not in the tree */
67907+ result = 0;
67908+ break;
67909+ }
67910+
67911+ if (left_coord.item_pos != tap->coord->item_pos) {
67912+ /* do not allow cutting more than one item. This was added to solve the problem of
67913+ truncating partially converted files. If a file is partially converted, there may exist
67914+ a twig node containing both internal items (pointing to leaf nodes with formatting
67915+ items) and an extent item. We do not want to kill internal items at the twig node here,
67916+ because cut_tree_worker assumes killing them from the leaf level */
67917+ coord_dup(&left_coord, tap->coord);
67918+ assert("vs-1652",
67919+ coord_is_existing_unit(&left_coord));
67920+ left_coord.unit_pos = 0;
67921+ }
67922+
67923+ /* cut data from one node */
67924+ // *smallest_removed = *reiser4_min_key();
67925+ result =
67926+ kill_node_content(&left_coord, tap->coord, from_key,
67927+ to_key, smallest_removed,
67928+ next_node_lock.node, object,
67929+ truncate);
67930+ reiser4_tap_relse(tap);
67931+ }
67932+ if (result)
67933+ break;
67934+
67935+ ++(*progress);
67936+
67937+ /* Check whether all items with keys >= from_key were removed
67938+ * from the tree. */
67939+ if (keyle(smallest_removed, from_key))
67940+ /* result = 0; */
67941+ break;
67942+
67943+ if (next_node_lock.node == NULL)
67944+ break;
67945+
67946+ result = reiser4_tap_move(tap, &next_node_lock);
67947+ done_lh(&next_node_lock);
67948+ if (result)
67949+ break;
67950+
67951+ /* Break long reiser4_cut_tree operation (deletion of a large
67952+ file) if atom requires commit. */
67953+ if (*progress > CUT_TREE_MIN_ITERATIONS
67954+ && current_atom_should_commit()) {
67955+ result = -E_REPEAT;
67956+ break;
67957+ }
67958+ }
67959+ done_lh(&next_node_lock);
67960+ // assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key()));
67961+ return result;
67962+}
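+
+/* The shape of the loop above, as a standalone sketch: sweep leftward one
+   node at a time, cut from each node, and bail out with a "repeat" code once
+   enough progress has been made and the atom wants to commit, so the caller
+   can restart the cut. The toy_* names are illustrative assumptions, not the
+   reiser4 API. */
+enum { TOY_OK = 0, TOY_REPEAT = -1 };
+#define TOY_MIN_ITERATIONS 64
+
+struct toy_tap { int node; };	/* stands in for the tap/znode pair */
+
+static int toy_cut_one_node(struct toy_tap *t) { (void)t; return TOY_OK; }
+static int toy_atom_should_commit(void) { return 0; }
+
+static int toy_cut_tree_worker(struct toy_tap *t)
+{
+	int progress = 0;
+	int result;
+
+	while (1) {
+		result = toy_cut_one_node(t);	/* delete node or cut items */
+		if (result != TOY_OK)
+			break;
+		++progress;
+		if (t->node == 0)		/* no left neighbor remains */
+			break;
+		--t->node;			/* move to the left neighbor */
+		/* break a long cut so the transaction can commit; the
+		   caller retries until the repeat code stops coming back */
+		if (progress > TOY_MIN_ITERATIONS &&
+		    toy_atom_should_commit()) {
+			result = TOY_REPEAT;
+			break;
+		}
+	}
+	return result;
+}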
67963+
67964+/* there is a fundamental problem with optimizing deletes: VFS does it
67965+ one file at a time. Another problem is that if an item can be
67966+ anything, then deleting items must be done one at a time. It just
67967+ seems cleaner, though, to write this so that it takes a from key and
67968+ a to key and cuts everything between them. */
67969+
67970+/* use this function with care if deleting more than what is part of a single file. */
67971+/* do not use this when cutting a single item, it is suboptimal for that */
67972+
67973+/* You are encouraged to write plugin-specific versions of this. It
67974+ cannot be optimal for all plugins because it works an item at a time,
67975+ and some plugins could sometimes work a node at a time. Regular files,
67976+ however, cannot be optimized to work a node at a time, because their
67977+ extents need to free the blocks they point to.
67978+
67979+ Optimizations compared to v3 code:
67980+
67981+ It does not balance (that task is left to memory pressure code).
67982+
67983+ Nodes are deleted only if empty.
67984+
67985+ Uses extents.
67986+
67987+ Performs read-ahead of formatted nodes whose contents are part of
67988+ the deletion.
67989+*/
67990+
67991+/**
67992+ * Delete everything from the reiser4 tree between two keys: @from_key and
67993+ * @to_key.
67994+ *
67995+ * @from_key: the beginning of the deleted key range,
67996+ * @to_key: the end of the deleted key range,
67997+ * @smallest_removed: the smallest removed key,
67998+ * @object: owner of cutting items.
67999+ * @truncate: true if called for file truncate.
68000+ * @progress: return true if a progress in file items deletions was made,
68001+ * @smallest_removed value is actual in that case.
68002+ *
68003+ * @return: 0 if success, error code otherwise; -E_REPEAT means that a long
68004+ * cut_tree operation was interrupted to allow an atom commit.
68005+ */
68006+
68007+int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
68008+ const reiser4_key * to_key,
68009+ reiser4_key * smallest_removed_p,
68010+ struct inode *object, int truncate, int *progress)
68011+{
68012+ lock_handle lock;
68013+ int result;
68014+ tap_t tap;
68015+ coord_t right_coord;
68016+ reiser4_key smallest_removed;
68017+ int (*cut_tree_worker) (tap_t *, const reiser4_key *,
68018+ const reiser4_key *, reiser4_key *,
68019+ struct inode *, int, int *);
68020+ STORE_COUNTERS;
68021+
68022+ assert("umka-329", tree != NULL);
68023+ assert("umka-330", from_key != NULL);
68024+ assert("umka-331", to_key != NULL);
68025+ assert("zam-936", keyle(from_key, to_key));
68026+
68027+ if (smallest_removed_p == NULL)
68028+ smallest_removed_p = &smallest_removed;
68029+
68030+ init_lh(&lock);
68031+
68032+ do {
68033+ /* Find rightmost item to cut away from the tree. */
68034+ result = reiser4_object_lookup(object, to_key, &right_coord,
68035+ &lock, ZNODE_WRITE_LOCK,
68036+ FIND_MAX_NOT_MORE_THAN,
68037+ TWIG_LEVEL, LEAF_LEVEL,
68038+ CBK_UNIQUE, NULL /*ra_info */);
68039+ if (result != CBK_COORD_FOUND)
68040+ break;
68041+ if (object == NULL
68042+ || inode_file_plugin(object)->cut_tree_worker == NULL)
68043+ cut_tree_worker = cut_tree_worker_common;
68044+ else
68045+ cut_tree_worker =
68046+ inode_file_plugin(object)->cut_tree_worker;
68047+ reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
68048+ result =
68049+ cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
68050+ object, truncate, progress);
68051+ reiser4_tap_done(&tap);
68052+
68053+ reiser4_preempt_point();
68054+
68055+ } while (0);
68056+
68057+ done_lh(&lock);
68058+
68059+ if (result) {
68060+ switch (result) {
68061+ case -E_NO_NEIGHBOR:
68062+ result = 0;
68063+ break;
68064+ case -E_DEADLOCK:
68065+ result = -E_REPEAT;
68066+ case -E_REPEAT:
68067+ case -ENOMEM:
68068+ case -ENOENT:
68069+ break;
68070+ default:
68071+ warning("nikita-2861", "failure: %i", result);
68072+ }
68073+ }
68074+
68075+ CHECK_COUNTERS;
68076+ return result;
68077+}
68078+
68079+/* repeat reiser4_cut_tree_object until everything is deleted.
68080+ * unlike cut_file_items, it does not end the current transaction if -E_REPEAT
68081+ * is returned by cut_tree_object. */
68082+int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
68083+ const reiser4_key * to, struct inode *inode, int truncate)
68084+{
68085+ int result;
68086+ int progress;
68087+
68088+ do {
68089+ result = reiser4_cut_tree_object(tree, from, to, NULL,
68090+ inode, truncate, &progress);
68091+ } while (result == -E_REPEAT);
68092+
68093+ return result;
68094+}
68095+
68096+/* finishing reiser4 tree initialization */
68097+int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being
68098+ * initialized */ ,
68099+ const reiser4_block_nr * root_block /* address of a root block
68100+ * on a disk */ ,
68101+ tree_level height /* height of a tree */ ,
68102+ node_plugin * nplug /* default node plugin */ )
68103+{
68104+ int result;
68105+
68106+ assert("nikita-306", tree != NULL);
68107+ assert("nikita-307", root_block != NULL);
68108+ assert("nikita-308", height > 0);
68109+ assert("nikita-309", nplug != NULL);
68110+ assert("zam-587", tree->super != NULL);
68111+
68112+ tree->root_block = *root_block;
68113+ tree->height = height;
68114+ tree->estimate_one_insert = calc_estimate_one_insert(height);
68115+ tree->nplug = nplug;
68116+
68117+ tree->znode_epoch = 1ull;
68118+
68119+ cbk_cache_init(&tree->cbk_cache);
68120+
68121+ result = znodes_tree_init(tree);
68122+ if (result == 0)
68123+ result = jnodes_tree_init(tree);
68124+ if (result == 0) {
68125+ tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0,
68126+ reiser4_ctx_gfp_mask_get());
68127+ if (IS_ERR(tree->uber)) {
68128+ result = PTR_ERR(tree->uber);
68129+ tree->uber = NULL;
68130+ }
68131+ }
68132+ return result;
68133+}
68134+
68135+/* release resources associated with @tree */
68136+void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
68137+{
68138+ if (tree == NULL)
68139+ return;
68140+
68141+ if (tree->uber != NULL) {
68142+ zput(tree->uber);
68143+ tree->uber = NULL;
68144+ }
68145+ znodes_tree_done(tree);
68146+ jnodes_tree_done(tree);
68147+ cbk_cache_done(&tree->cbk_cache);
68148+}
68149+
68150+/* Make Linus happy.
68151+ Local variables:
68152+ c-indentation-style: "K&R"
68153+ mode-name: "LC"
68154+ c-basic-offset: 8
68155+ tab-width: 8
68156+ fill-column: 120
68157+ scroll-step: 1
68158+ End:
68159+*/
68160diff -urN linux-2.6.24.orig/fs/reiser4/tree.h linux-2.6.24/fs/reiser4/tree.h
68161--- linux-2.6.24.orig/fs/reiser4/tree.h 1970-01-01 03:00:00.000000000 +0300
68162+++ linux-2.6.24/fs/reiser4/tree.h 2008-01-25 11:39:07.096248905 +0300
68163@@ -0,0 +1,577 @@
68164+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68165+ * reiser4/README */
68166+
68167+/* Tree operations. See fs/reiser4/tree.c for comments */
68168+
68169+#if !defined( __REISER4_TREE_H__ )
68170+#define __REISER4_TREE_H__
68171+
68172+#include "forward.h"
68173+#include "debug.h"
68174+#include "dformat.h"
68175+#include "plugin/node/node.h"
68176+#include "plugin/plugin.h"
68177+#include "znode.h"
68178+#include "tap.h"
68179+
68180+#include <linux/types.h> /* for __u?? */
68181+#include <linux/fs.h> /* for struct super_block */
68182+#include <linux/spinlock.h>
68183+#include <linux/sched.h> /* for struct task_struct */
68184+
68185+/* fictive block number never actually used */
68186+extern const reiser4_block_nr UBER_TREE_ADDR;
68187+
68188+/* &cbk_cache_slot - entry in a coord cache.
68189+
68190+ This is an entry in a coord_by_key (cbk) cache, represented by
68191+ &cbk_cache.
68192+
68193+*/
68194+typedef struct cbk_cache_slot {
68195+ /* cached node */
68196+ znode *node;
68197+ /* linkage to the next cbk cache slot in a LRU order */
68198+ struct list_head lru;
68199+} cbk_cache_slot;
68200+
68201+/* &cbk_cache - coord cache. This is part of reiser4_tree.
68202+
68203+ cbk_cache is supposed to speed up tree lookups by caching results of recent
68204+ successful lookups (we don't cache negative results as the dentry cache
68205+ does). The cache consists of a relatively small number of entries kept in
68206+ LRU order. Each entry (&cbk_cache_slot) contains a pointer to a znode, from
68207+ which we can obtain the range of keys covered by that znode. Before
68208+ embarking on a real tree traversal we scan the cbk_cache slot by slot,
68209+ checking for each slot whether the key we are looking for lies between the
68210+ minimal and maximal keys of the node pointed to by that slot. If no match
68211+ is found, a real tree traversal is performed and, if it succeeds, the
68212+ appropriate entry is inserted into the cache, possibly evicting the least
68213+ recently used entry.
68214+
68215+ The tree spin lock is used to protect the coord cache. If contention for
68216+ this lock proves to be too high, finer-grained locking can be added.
68217+
68218+ Invariants involving parts of this data-type:
68219+
68220+ [cbk-cache-invariant]
68221+*/
68222+typedef struct cbk_cache {
68223+ /* serializator */
68224+ rwlock_t guard;
68225+ int nr_slots;
68226+ /* head of LRU list of cache slots */
68227+ struct list_head lru;
68228+ /* actual array of slots */
68229+ cbk_cache_slot *slot;
68230+} cbk_cache;
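+/* Editorial sketch (not part of the original patch): the slot-by-slot scan
+ * described in the comment above, in its simplest form. znode_covers_key()
+ * is a hypothetical predicate standing in for the delimiting-key check; the
+ * real scan lives in search.c and also revalidates the node under the cache
+ * guard. */
+#if 0
+static znode *example_cbk_cache_scan(cbk_cache * cache,
+				     const reiser4_key * key)
+{
+	cbk_cache_slot *slot;
+
+	list_for_each_entry(slot, &cache->lru, lru)
+		if (slot->node != NULL && znode_covers_key(slot->node, key))
+			return slot->node;	/* cache hit */
+	return NULL;	/* miss: fall back to a real tree traversal */
+}
+#endif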
68231+
68232+/* level_lookup_result - possible outcome of looking up key at some level.
68233+ This is used by coord_by_key when traversing tree downward. */
68234+typedef enum {
68235+ /* continue to the next level */
68236+ LOOKUP_CONT,
68237+ /* done. Either required item was found, or we can prove it
68238+ doesn't exist, or some error occurred. */
68239+ LOOKUP_DONE,
68240+ /* restart traversal from the root. Infamous "repetition". */
68241+ LOOKUP_REST
68242+} level_lookup_result;
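+/* Editorial sketch (not part of the original patch): how a traversal loop
+ * would typically dispatch on the outcomes above. cbk_level_lookup() and
+ * restart_from_root() are assumed helper names; the real loop lives in the
+ * cbk code in search.c. */
+#if 0
+	while (1) {
+		switch (cbk_level_lookup(h)) {
+		case LOOKUP_CONT:
+			continue;	/* descend to the next level */
+		case LOOKUP_DONE:
+			return h->result;	/* found, absent, or error */
+		case LOOKUP_REST:
+			restart_from_root(h);	/* hypothetical restart step */
+			break;	/* retry from the top of the loop */
+		}
+	}
+#endif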
68243+
68244+/* This is the representation of the internal reiser4 tree, where all
68245+ file-system data and meta-data are stored. This structure is passed to all
68246+ tree manipulation functions. It is different from the super block because
68247+ we don't want to limit ourselves to a strictly one-to-one mapping
68248+ between super blocks and trees, and because they are logically
68249+ different: there are things in a super block that have no relation to
68250+ the tree (bitmaps, journalling area, mount options, etc.) and there
68251+ are things in a tree that bear no relation to the super block, like
68252+ the tree of znodes.
68253+
68254+ At this time, there is only one tree
68255+ per filesystem, and this struct is part of the super block. We only
68256+ call the super block the super block for historical reasons (most
68257+ other filesystems call the per filesystem metadata the super block).
68258+*/
68259+
68260+struct reiser4_tree {
68261+ /* block_nr == 0 is the fake znode. Write-lock it while changing
68262+ the tree height. */
68263+ /* disk address of root node of a tree */
68264+ reiser4_block_nr root_block;
68265+
68266+ /* level of the root node. If this is 1, tree consists of root
68267+ node only */
68268+ tree_level height;
68269+
68270+ /*
68271+ * this is cached here to avoid calling plugins through a function
68272+ * dereference all the time.
68273+ */
68274+ __u64 estimate_one_insert;
68275+
68276+ /* cache of recent tree lookup results */
68277+ cbk_cache cbk_cache;
68278+
68279+ /* hash table to look up znodes by block number. */
68280+ z_hash_table zhash_table;
68281+ z_hash_table zfake_table;
68282+ /* hash table to look up jnodes by inode and offset. */
68283+ j_hash_table jhash_table;
68284+
68285+ /* lock protecting:
68286+ - parent pointers,
68287+ - sibling pointers,
68288+ - znode hash table
68289+ - coord cache
68290+ */
68291+ /* NOTE: The "giant" tree lock can be replaced by more spin locks,
68292+ hoping they will be less contented. We can use one spin lock per one
68293+ znode hash bucket. With adding of some code complexity, sibling
68294+ pointers can be protected by both znode spin locks. However it looks
68295+ more SMP scalable we should test this locking change on n-ways (n >
68296+ 4) SMP machines. Current 4-ways machine test does not show that tree
68297+ lock is contented and it is a bottleneck (2003.07.25). */
68298+
68299+ rwlock_t tree_lock;
68300+
68301+ /* lock protecting delimiting keys */
68302+ rwlock_t dk_lock;
68303+
68304+ /* spin lock protecting znode_epoch */
68305+ spinlock_t epoch_lock;
68306+ /* version stamp used to mark znode updates. See seal.[ch] for more
68307+ * information. */
68308+ __u64 znode_epoch;
68309+
68310+ znode *uber;
68311+ node_plugin *nplug;
68312+ struct super_block *super;
68313+ struct {
68314+ /* carry flags used for insertion of new nodes */
68315+ __u32 new_node_flags;
68316+ /* carry flags used for insertion of new extents */
68317+ __u32 new_extent_flags;
68318+ /* carry flags used for paste operations */
68319+ __u32 paste_flags;
68320+ /* carry flags used for insert operations */
68321+ __u32 insert_flags;
68322+ } carry;
68323+};
68324+
68325+extern int reiser4_init_tree(reiser4_tree * tree,
68326+ const reiser4_block_nr * root_block,
68327+ tree_level height, node_plugin * default_plugin);
68328+extern void reiser4_done_tree(reiser4_tree * tree);
68329+
68330+/* cbk flags: options for coord_by_key() */
68331+typedef enum {
68332+ /* coord_by_key() is called for insertion. This is necessary because
68333+ of extents being located at the twig level. For explanation, see
68334+ comment just above is_next_item_internal().
68335+ */
68336+ CBK_FOR_INSERT = (1 << 0),
68337+ /* coord_by_key() is called with key that is known to be unique */
68338+ CBK_UNIQUE = (1 << 1),
68339+ /* coord_by_key() can trust delimiting keys. This option is not user
68340+ accessible. coord_by_key() will set it automatically. It will only
68341+ be cleared by a special case in extents-on-the-twig-level handling
68342+ where it is necessary to insert item with a key smaller than
68343+ leftmost key in a node. This is necessary because of extents being
68344+ located at the twig level. For explanation, see comment just above
68345+ is_next_item_internal().
68346+ */
68347+ CBK_TRUST_DK = (1 << 2),
68348+ CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */
68349+ CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */
68350+ CBK_DKSET = (1 << 5),
68351+ CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */
68352+ CBK_IN_CACHE = (1 << 7), /* node is already in cache */
68353+ CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock instead of long term
68354+ * lock */
68355+} cbk_flags;
68356+
68357+/* insertion outcome. IBK = insert by key */
68358+typedef enum {
68359+ IBK_INSERT_OK = 0,
68360+ IBK_ALREADY_EXISTS = -EEXIST,
68361+ IBK_IO_ERROR = -EIO,
68362+ IBK_NO_SPACE = -E_NODE_FULL,
68363+ IBK_OOM = -ENOMEM
68364+} insert_result;
68365+
68366+#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
68367+
68368+typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
68369+ lock_handle * lh, void *arg);
68370+extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
68371+ lock_handle * lh,
68372+ tree_iterate_actor_t actor, void *arg,
68373+ znode_lock_mode mode, int through_units_p);
68374+extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
68375+ znode_lock_request pri, lock_handle * lh);
68376+
68377+/* return node plugin of @node */
68378+static inline node_plugin *node_plugin_by_node(const znode *
68379+ node /* node to query */ )
68380+{
68381+ assert("vs-213", node != NULL);
68382+ assert("vs-214", znode_is_loaded(node));
68383+
68384+ return node->nplug;
68385+}
68386+
68387+/* number of items in @node */
68388+static inline pos_in_node_t node_num_items(const znode * node)
68389+{
68390+ assert("nikita-2754", znode_is_loaded(node));
68391+ assert("nikita-2468",
68392+ node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
68393+
68394+ return node->nr_items;
68395+}
68396+
68397+/* Return the number of items at the present node. Asserts coord->node !=
68398+ NULL. */
68399+static inline unsigned coord_num_items(const coord_t * coord)
68400+{
68401+ assert("jmacd-9805", coord->node != NULL);
68402+
68403+ return node_num_items(coord->node);
68404+}
68405+
68406+/* true if @node is empty */
68407+static inline int node_is_empty(const znode * node)
68408+{
68409+ return node_num_items(node) == 0;
68410+}
68411+
68412+typedef enum {
68413+ SHIFTED_SOMETHING = 0,
68414+ SHIFT_NO_SPACE = -E_NODE_FULL,
68415+ SHIFT_IO_ERROR = -EIO,
68416+ SHIFT_OOM = -ENOMEM,
68417+} shift_result;
68418+
68419+extern node_plugin *node_plugin_by_coord(const coord_t * coord);
68420+extern int is_coord_in_node(const coord_t * coord);
68421+extern int key_in_node(const reiser4_key *, const coord_t *);
68422+extern void coord_item_move_to(coord_t * coord, int items);
68423+extern void coord_unit_move_to(coord_t * coord, int units);
68424+
68425+/* there are two types of repetitive accesses (ra): intra-syscall
68426+ (local) and inter-syscall (global). Local ra is used when,
68427+ during a single syscall, we add/delete several items and units in the
68428+ same place in the tree. Note that plan-A fragments local ra by
68429+ separating stat-data and file body in key-space. Global ra is
68430+ used when the user makes repetitive modifications in the same place in
68431+ the tree.
68432+
68433+ Our ra implementation serves the following purposes:
68434+ 1 it affects balancing decisions so that the next operation in a row
68435+ can be performed faster;
68436+ 2 it affects lower-level read-ahead in the page-cache;
68437+ 3 it allows us to avoid unnecessary lookups by maintaining some state
68438+ across several operations (this is only for local ra);
68439+ 4 it leaves room for lazy micro-balancing: when we start a sequence of
68440+ operations, they are performed without actually doing any intra-node
68441+ shifts, until we finish the sequence or its scope leaves the
68442+ current node; only then do we really pack the node (local ra only).
68443+*/
68444+
68445+/* another thing that can be useful is to keep a per-tree and/or
68446+ per-process cache of recent lookups. This cache can be organised as a
68447+ list of block numbers of formatted nodes, sorted by the starting key of
68448+ each node. Balancing should invalidate the appropriate parts of this
68449+ cache.
68450+*/
68451+
68452+lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
68453+ coord_t * coord, lock_handle * handle,
68454+ znode_lock_mode lock, lookup_bias bias,
68455+ tree_level lock_level, tree_level stop_level,
68456+ __u32 flags, ra_info_t *);
68457+
68458+lookup_result reiser4_object_lookup(struct inode *object,
68459+ const reiser4_key * key,
68460+ coord_t * coord,
68461+ lock_handle * lh,
68462+ znode_lock_mode lock_mode,
68463+ lookup_bias bias,
68464+ tree_level lock_level,
68465+ tree_level stop_level,
68466+ __u32 flags, ra_info_t * info);
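+/* Editorial sketch (not part of the original patch): a typical call of
+ * reiser4_object_lookup(), modelled on the invocation in
+ * reiser4_cut_tree_object() in tree.c. example_find_rightmost() is a
+ * hypothetical wrapper. */
+#if 0
+static lookup_result example_find_rightmost(struct inode *object,
+					    const reiser4_key * key,
+					    coord_t * coord, lock_handle * lh)
+{
+	return reiser4_object_lookup(object, key, coord, lh,
+				     ZNODE_WRITE_LOCK,
+				     FIND_MAX_NOT_MORE_THAN,
+				     TWIG_LEVEL, LEAF_LEVEL,
+				     CBK_UNIQUE, NULL /* ra_info */);
+}
+#endif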
68467+
68468+insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
68469+ reiser4_item_data * data, coord_t * coord,
68470+ lock_handle * lh,
68471+ tree_level stop_level, __u32 flags);
68472+insert_result insert_by_coord(coord_t * coord,
68473+ reiser4_item_data * data, const reiser4_key * key,
68474+ lock_handle * lh, __u32);
68475+insert_result insert_extent_by_coord(coord_t * coord,
68476+ reiser4_item_data * data,
68477+ const reiser4_key * key, lock_handle * lh);
68478+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
68479+ const reiser4_key * to_key,
68480+ reiser4_key * smallest_removed);
68481+int kill_node_content(coord_t * from, coord_t * to,
68482+ const reiser4_key * from_key, const reiser4_key * to_key,
68483+ reiser4_key * smallest_removed,
68484+ znode * locked_left_neighbor, struct inode *inode,
68485+ int truncate);
68486+
68487+int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
68488+ reiser4_key * key, lock_handle * lh, cop_insert_flag);
68489+int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
68490+ reiser4_item_data * data, unsigned);
68491+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
68492+int find_new_child_ptr(znode * parent, znode * child, znode * left,
68493+ coord_t * result);
68494+
68495+int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
68496+int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
68497+
68498+void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
68499+
68500+extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
68501+ const reiser4_key *, reiser4_key *,
68502+ struct inode *, int, int *);
68503+extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
68504+ const reiser4_key *, reiser4_key *,
68505+ struct inode *, int, int *);
68506+extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
68507+ const reiser4_key * to, struct inode *, int);
68508+
68509+extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
68510+extern int check_tree_pointer(const coord_t * pointer, const znode * child);
68511+extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
68512+ znode * left, coord_t * result);
68513+extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
68514+extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
68515+ znode * child);
68516+extern znode *child_znode(const coord_t * in_parent, znode * parent,
68517+ int incore_p, int setup_dkeys_p);
68518+
68519+extern int cbk_cache_init(cbk_cache * cache);
68520+extern void cbk_cache_done(cbk_cache * cache);
68521+extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
68522+
68523+extern char *sprint_address(const reiser4_block_nr * block);
68524+
68525+#if REISER4_DEBUG
68526+extern void print_coord_content(const char *prefix, coord_t * p);
68527+extern void reiser4_print_address(const char *prefix,
68528+ const reiser4_block_nr * block);
68529+extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
68530+ __u32 flags);
68531+extern void check_dkeys(znode *node);
68532+#else
68533+#define print_coord_content(p, c) noop
68534+#define reiser4_print_address(p, b) noop
68535+#endif
68536+
68537+extern void forget_znode(lock_handle * handle);
68538+extern int deallocate_znode(znode * node);
68539+
68540+extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
68541+
68542+/* struct used internally to pack the numerous arguments of a tree lookup.
68543+ Used to avoid passing a lot of arguments to helper functions. */
68544+typedef struct cbk_handle {
68545+ /* tree we are in */
68546+ reiser4_tree *tree;
68547+ /* key we are going after */
68548+ const reiser4_key *key;
68549+ /* coord we will store result in */
68550+ coord_t *coord;
68551+ /* type of lock to take on target node */
68552+ znode_lock_mode lock_mode;
68553+ /* lookup bias. See comments at the declaration of lookup_bias */
68554+ lookup_bias bias;
68555+ /* lock level: level starting from which the tree traversal takes
68556+ * write locks. */
68557+ tree_level lock_level;
68558+ /* level where search will stop. Either item will be found between
68559+ lock_level and stop_level, or CBK_COORD_NOTFOUND will be
68560+ returned.
68561+ */
68562+ tree_level stop_level;
68563+ /* level we are currently at */
68564+ tree_level level;
68565+ /* block number of @active node. Tree traversal operates on two
68566+ nodes: active and parent. */
68567+ reiser4_block_nr block;
68568+ /* error message to be printed by the caller is put here */
68569+ const char *error;
68570+ /* result passed back to caller */
68571+ lookup_result result;
68572+ /* lock handles for active and parent */
68573+ lock_handle *parent_lh;
68574+ lock_handle *active_lh;
68575+ reiser4_key ld_key;
68576+ reiser4_key rd_key;
68577+ /* flags, passed to the cbk routine. Bits of this bitmask are defined
68578+ in tree.h:cbk_flags enum. */
68579+ __u32 flags;
68580+ ra_info_t *ra_info;
68581+ struct inode *object;
68582+} cbk_handle;
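+/* Editorial sketch (not part of the original patch): how a lookup entry
+ * point might pack its arguments into a cbk_handle before descending. Field
+ * names match the struct above; the actual packing is done by the cbk code
+ * in search.c, and example_pack_handle() is a hypothetical name. */
+#if 0
+static void example_pack_handle(cbk_handle * h, reiser4_tree * tree,
+				const reiser4_key * key, coord_t * coord,
+				lock_handle * lh)
+{
+	memset(h, 0, sizeof *h);
+	h->tree = tree;
+	h->key = key;
+	h->coord = coord;
+	h->lock_mode = ZNODE_READ_LOCK;
+	h->bias = FIND_EXACT;
+	h->lock_level = LEAF_LEVEL;
+	h->stop_level = LEAF_LEVEL;
+	h->flags = CBK_UNIQUE;
+	h->active_lh = lh;
+}
+#endif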
68583+
68584+extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
68585+
68586+/* eottl.c */
68587+extern int handle_eottl(cbk_handle *h, int *outcome);
68588+
68589+int lookup_multikey(cbk_handle * handle, int nr_keys);
68590+int lookup_couple(reiser4_tree * tree,
68591+ const reiser4_key * key1, const reiser4_key * key2,
68592+ coord_t * coord1, coord_t * coord2,
68593+ lock_handle * lh1, lock_handle * lh2,
68594+ znode_lock_mode lock_mode, lookup_bias bias,
68595+ tree_level lock_level, tree_level stop_level, __u32 flags,
68596+ int *result1, int *result2);
68597+
68598+static inline void read_lock_tree(reiser4_tree *tree)
68599+{
68600+ /* check that tree is not locked */
68601+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
68602+ LOCK_CNT_NIL(read_locked_tree) &&
68603+ LOCK_CNT_NIL(write_locked_tree)));
68604+ /* check that spinlocks of lower priorities are not held */
68605+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
68606+ LOCK_CNT_NIL(rw_locked_dk) &&
68607+ LOCK_CNT_NIL(spin_locked_stack)));
68608+
68609+ read_lock(&(tree->tree_lock));
68610+
68611+ LOCK_CNT_INC(read_locked_tree);
68612+ LOCK_CNT_INC(rw_locked_tree);
68613+ LOCK_CNT_INC(spin_locked);
68614+}
68615+
68616+static inline void read_unlock_tree(reiser4_tree *tree)
68617+{
68618+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
68619+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
68620+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68621+
68622+ LOCK_CNT_DEC(read_locked_tree);
68623+ LOCK_CNT_DEC(rw_locked_tree);
68624+ LOCK_CNT_DEC(spin_locked);
68625+
68626+ read_unlock(&(tree->tree_lock));
68627+}
68628+
68629+static inline void write_lock_tree(reiser4_tree *tree)
68630+{
68631+ /* check that tree is not locked */
68632+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
68633+ LOCK_CNT_NIL(read_locked_tree) &&
68634+ LOCK_CNT_NIL(write_locked_tree)));
68635+ /* check that spinlocks of lower priorities are not held */
68636+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
68637+ LOCK_CNT_NIL(rw_locked_dk) &&
68638+ LOCK_CNT_NIL(spin_locked_stack)));
68639+
68640+ write_lock(&(tree->tree_lock));
68641+
68642+ LOCK_CNT_INC(write_locked_tree);
68643+ LOCK_CNT_INC(rw_locked_tree);
68644+ LOCK_CNT_INC(spin_locked);
68645+}
68646+
68647+static inline void write_unlock_tree(reiser4_tree *tree)
68648+{
68649+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
68650+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
68651+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68652+
68653+ LOCK_CNT_DEC(write_locked_tree);
68654+ LOCK_CNT_DEC(rw_locked_tree);
68655+ LOCK_CNT_DEC(spin_locked);
68656+
68657+ write_unlock(&(tree->tree_lock));
68658+}
68659+
68660+static inline void read_lock_dk(reiser4_tree *tree)
68661+{
68662+ /* check that dk is not locked */
68663+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
68664+ LOCK_CNT_NIL(read_locked_dk) &&
68665+ LOCK_CNT_NIL(write_locked_dk)));
68666+ /* check that spinlocks of lower priorities are not held */
68667+ assert("", LOCK_CNT_NIL(spin_locked_stack));
68668+
68669+ read_lock(&((tree)->dk_lock));
68670+
68671+ LOCK_CNT_INC(read_locked_dk);
68672+ LOCK_CNT_INC(rw_locked_dk);
68673+ LOCK_CNT_INC(spin_locked);
68674+}
68675+
68676+static inline void read_unlock_dk(reiser4_tree *tree)
68677+{
68678+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
68679+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
68680+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68681+
68682+ LOCK_CNT_DEC(read_locked_dk);
68683+ LOCK_CNT_DEC(rw_locked_dk);
68684+ LOCK_CNT_DEC(spin_locked);
68685+
68686+ read_unlock(&(tree->dk_lock));
68687+}
68688+
68689+static inline void write_lock_dk(reiser4_tree *tree)
68690+{
68691+ /* check that dk is not locked */
68692+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
68693+ LOCK_CNT_NIL(read_locked_dk) &&
68694+ LOCK_CNT_NIL(write_locked_dk)));
68695+ /* check that spinlocks of lower priorities are not held */
68696+ assert("", LOCK_CNT_NIL(spin_locked_stack));
68697+
68698+ write_lock(&((tree)->dk_lock));
68699+
68700+ LOCK_CNT_INC(write_locked_dk);
68701+ LOCK_CNT_INC(rw_locked_dk);
68702+ LOCK_CNT_INC(spin_locked);
68703+}
68704+
68705+static inline void write_unlock_dk(reiser4_tree *tree)
68706+{
68707+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
68708+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
68709+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68710+
68711+ LOCK_CNT_DEC(write_locked_dk);
68712+ LOCK_CNT_DEC(rw_locked_dk);
68713+ LOCK_CNT_DEC(spin_locked);
68714+
68715+ write_unlock(&(tree->dk_lock));
68716+}
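+/* Editorial sketch (not part of the original patch): the dk-lock pairing in
+ * use, modelled on add_child_ptr() in tree_mod.c below -- take the read
+ * lock, copy the delimiting key, drop the lock before doing anything that
+ * may block. */
+#if 0
+static inline void example_copy_ld_key(reiser4_tree * tree, znode * node,
+				       reiser4_key * key)
+{
+	read_lock_dk(tree);
+	*key = *znode_get_ld_key(node);
+	read_unlock_dk(tree);
+}
+#endif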
68717+
68718+/* estimate api. Implementation is in estimate.c */
68719+reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
68720+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
68721+reiser4_block_nr estimate_insert_flow(tree_level);
68722+reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
68723+reiser4_block_nr calc_estimate_one_insert(tree_level);
68724+reiser4_block_nr estimate_dirty_cluster(struct inode *);
68725+reiser4_block_nr estimate_insert_cluster(struct inode *);
68726+reiser4_block_nr estimate_update_cluster(struct inode *);
68727+
68728+/* __REISER4_TREE_H__ */
68729+#endif
68730+
68731+/* Make Linus happy.
68732+ Local variables:
68733+ c-indentation-style: "K&R"
68734+ mode-name: "LC"
68735+ c-basic-offset: 8
68736+ tab-width: 8
68737+ fill-column: 120
68738+ scroll-step: 1
68739+ End:
68740+*/
68741diff -urN linux-2.6.24.orig/fs/reiser4/tree_mod.c linux-2.6.24/fs/reiser4/tree_mod.c
68742--- linux-2.6.24.orig/fs/reiser4/tree_mod.c 1970-01-01 03:00:00.000000000 +0300
68743+++ linux-2.6.24/fs/reiser4/tree_mod.c 2008-01-25 11:39:07.100249935 +0300
68744@@ -0,0 +1,386 @@
68745+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68746+ * reiser4/README */
68747+
68748+/*
68749+ * Functions to add/delete new nodes to/from the tree.
68750+ *
68751+ * Functions from this file are used by carry (see carry*) to handle:
68752+ *
68753+ * . insertion of new formatted node into tree
68754+ *
68755+ * . addition of new tree root, increasing tree height
68756+ *
68757+ * . removing tree root, decreasing tree height
68758+ *
68759+ */
68760+
68761+#include "forward.h"
68762+#include "debug.h"
68763+#include "dformat.h"
68764+#include "key.h"
68765+#include "coord.h"
68766+#include "plugin/plugin.h"
68767+#include "jnode.h"
68768+#include "znode.h"
68769+#include "tree_mod.h"
68770+#include "block_alloc.h"
68771+#include "tree_walk.h"
68772+#include "tree.h"
68773+#include "super.h"
68774+
68775+#include <linux/err.h>
68776+
68777+static int add_child_ptr(znode * parent, znode * child);
68778+/* warning only issued if error is not -E_REPEAT */
68779+#define ewarning( error, ... ) \
68780+ do { if( ( error ) != -E_REPEAT ) \
68781+ warning( __VA_ARGS__ ); } while (0)
68782+
68783+/* allocate new node on the @level and immediately on the right of @brother. */
68784+znode * reiser4_new_node(znode * brother /* existing left neighbor
68785+ * of new node */,
68786+ tree_level level /* tree level at which new node is to
68787+ * be allocated */)
68788+{
68789+ znode *result;
68790+ int retcode;
68791+ reiser4_block_nr blocknr;
68792+
68793+ assert("nikita-930", brother != NULL);
68794+ assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
68795+
68796+ retcode = assign_fake_blocknr_formatted(&blocknr);
68797+ if (retcode == 0) {
68798+ result =
68799+ zget(znode_get_tree(brother), &blocknr, NULL, level,
68800+ reiser4_ctx_gfp_mask_get());
68801+ if (IS_ERR(result)) {
68802+ ewarning(PTR_ERR(result), "nikita-929",
68803+ "Cannot allocate znode for carry: %li",
68804+ PTR_ERR(result));
68805+ return result;
68806+ }
68807+ /* cheap test, can be executed even when debugging is off */
68808+ if (!znode_just_created(result)) {
68809+ warning("nikita-2213",
68810+ "Allocated already existing block: %llu",
68811+ (unsigned long long)blocknr);
68812+ zput(result);
68813+ return ERR_PTR(RETERR(-EIO));
68814+ }
68815+
68816+ assert("nikita-931", result != NULL);
68817+ result->nplug = znode_get_tree(brother)->nplug;
68818+ assert("nikita-933", result->nplug != NULL);
68819+
68820+ retcode = zinit_new(result, reiser4_ctx_gfp_mask_get());
68821+ if (retcode == 0) {
68822+ ZF_SET(result, JNODE_CREATED);
68823+ zrelse(result);
68824+ } else {
68825+ zput(result);
68826+ result = ERR_PTR(retcode);
68827+ }
68828+ } else {
68829+ /* failure to allocate new node during balancing.
68830+ This should never happen. Ever. Returning -E_REPEAT
68831+ is not a viable solution, because "out of disk space"
68832+ is not a transient error that will go away by itself.
68833+ */
68834+ ewarning(retcode, "nikita-928",
68835+ "Cannot allocate block for carry: %i", retcode);
68836+ result = ERR_PTR(retcode);
68837+ }
68838+ assert("nikita-1071", result != NULL);
68839+ return result;
68840+}
68841+
68842+/* allocate new root and add it to the tree
68843+
68844+ This helper function is called by add_new_root().
68845+
68846+*/
68847+znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ ,
68848+ znode * fake /* "fake" znode */ )
68849+{
68850+ reiser4_tree *tree = znode_get_tree(old_root);
68851+ znode *new_root = NULL; /* to shut gcc up */
68852+ int result;
68853+
68854+ assert("nikita-1069", old_root != NULL);
68855+ assert("umka-262", fake != NULL);
68856+ assert("umka-263", tree != NULL);
68857+
68858+ /* "fake" znode---one always hanging just above current root. This
68859+ node is locked when new root is created or existing root is
68860+ deleted. Downward tree traversal takes lock on it before taking
68861+ lock on a root node. This avoids race conditions with root
68862+ manipulations.
68863+
68864+ */
68865+ assert("nikita-1348", znode_above_root(fake));
68866+ assert("nikita-1211", znode_is_root(old_root));
68867+
68868+ result = 0;
68869+ if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
68870+ warning("nikita-1344", "Tree is too tall: %i", tree->height);
68871+ /* ext2 returns -ENOSPC when it runs out of free inodes with a
68872+ following comment (fs/ext2/ialloc.c:441): Is it really
68873+ ENOSPC?
68874+
68875+ -EXFULL? -EINVAL?
68876+ */
68877+ result = RETERR(-ENOSPC);
68878+ } else {
68879+ /* Allocate block for new root. It's not that
68880+ important where it will be allocated, as root is
68881+ almost always in memory. Moreover, allocate-on-flush
68882+ may be happening here.
68883+ */
68884+ assert("nikita-1448", znode_is_root(old_root));
68885+ new_root = reiser4_new_node(fake, tree->height + 1);
68886+ if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
68887+ lock_handle rlh;
68888+
68889+ init_lh(&rlh);
68890+ result =
68891+ longterm_lock_znode(&rlh, new_root,
68892+ ZNODE_WRITE_LOCK,
68893+ ZNODE_LOCK_LOPRI);
68894+ if (result == 0) {
68895+ parent_coord_t *in_parent;
68896+
68897+ znode_make_dirty(fake);
68898+
68899+ /* new root is a child of "fake" node */
68900+ write_lock_tree(tree);
68901+
68902+ ++tree->height;
68903+
68904+ /* recalculate max balance overhead */
68905+ tree->estimate_one_insert =
68906+ estimate_one_insert_item(tree);
68907+
68908+ tree->root_block = *znode_get_block(new_root);
68909+ in_parent = &new_root->in_parent;
68910+ init_parent_coord(in_parent, fake);
68911+ /* manually insert new root into sibling
68912+ * list. With this all nodes involved into
68913+ * balancing are connected after balancing is
68914+ * done---useful invariant to check. */
68915+ sibling_list_insert_nolock(new_root, NULL);
68916+ write_unlock_tree(tree);
68917+
68918+ /* insert into new root pointer to the
68919+ @old_root. */
68920+ assert("nikita-1110",
68921+ WITH_DATA(new_root,
68922+ node_is_empty(new_root)));
68923+ write_lock_dk(tree);
68924+ znode_set_ld_key(new_root, reiser4_min_key());
68925+ znode_set_rd_key(new_root, reiser4_max_key());
68926+ write_unlock_dk(tree);
68927+ if (REISER4_DEBUG) {
68928+ ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
68929+ ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
68930+ ZF_SET(old_root, JNODE_ORPHAN);
68931+ }
68932+ result = add_child_ptr(new_root, old_root);
68933+ done_lh(&rlh);
68934+ }
68935+ zrelse(new_root);
68936+ }
68937+ }
68938+ if (result != 0)
68939+ new_root = ERR_PTR(result);
68940+ return new_root;
68941+}
68942+
68943+/* build &reiser4_item_data for inserting child pointer
68944+
68945+ Build &reiser4_item_data that can later be used to insert a pointer to
68946+ @child in its parent.
68947+
68948+*/
68949+void build_child_ptr_data(znode * child /* node pointer to which will be
68950+ * inserted */ ,
68951+ reiser4_item_data * data /* where to store result */ )
68952+{
68953+ assert("nikita-1116", child != NULL);
68954+ assert("nikita-1117", data != NULL);
68955+
68956+ /*
68957+ * NOTE: use address of child's blocknr as address of data to be
68958+ * inserted. As a result, the data gets into the on-disk structure in
68959+ * CPU byte order; internal's create_hook converts it to little-endian
68960+ * byte order.
68961+ */
68962+ data->data = (char *)znode_get_block(child);
68963+ /* data -> data is kernel space */
68964+ data->user = 0;
68965+ data->length = sizeof(reiser4_block_nr);
68966+ /* FIXME-VS: hardcoded internal item? */
68967+
68968+ /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
68969+ data->iplug = item_plugin_by_id(NODE_POINTER_ID);
68970+}
68971+
68972+/* add a pointer to @child into the empty @parent.
68973+
68974+ This is used when the pointer to the old root is inserted into the new
68975+ root, which is empty.
68976+*/
68977+static int add_child_ptr(znode * parent, znode * child)
68978+{
68979+ coord_t coord;
68980+ reiser4_item_data data;
68981+ int result;
68982+ reiser4_key key;
68983+
68984+ assert("nikita-1111", parent != NULL);
68985+ assert("nikita-1112", child != NULL);
68986+ assert("nikita-1115",
68987+ znode_get_level(parent) == znode_get_level(child) + 1);
68988+
68989+ result = zload(parent);
68990+ if (result != 0)
68991+ return result;
68992+ assert("nikita-1113", node_is_empty(parent));
68993+ coord_init_first_unit(&coord, parent);
68994+
68995+ build_child_ptr_data(child, &data);
68996+ data.arg = NULL;
68997+
68998+ read_lock_dk(znode_get_tree(parent));
68999+ key = *znode_get_ld_key(child);
69000+ read_unlock_dk(znode_get_tree(parent));
69001+
69002+ result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
69003+ NULL);
69004+ znode_make_dirty(parent);
69005+ zrelse(parent);
69006+ return result;
69007+}
69008+
69009+/* actually remove tree root */
69010+static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is
69011+ * being removed */,
69012+ znode * old_root /* root node that is being
69013+ * removed */ ,
69014+ znode * new_root /* new root---sole child of
69015+ * @old_root */,
69016+ const reiser4_block_nr * new_root_blk /* disk address of
69017+ * @new_root */)
69018+{
69019+ znode *uber;
69020+ int result;
69021+ lock_handle handle_for_uber;
69022+
69023+ assert("umka-265", tree != NULL);
69024+ assert("nikita-1198", new_root != NULL);
69025+ assert("nikita-1199",
69026+ znode_get_level(new_root) + 1 == znode_get_level(old_root));
69027+
69028+ assert("nikita-1201", znode_is_write_locked(old_root));
69029+
69030+ assert("nikita-1203",
69031+ disk_addr_eq(new_root_blk, znode_get_block(new_root)));
69032+
69033+ init_lh(&handle_for_uber);
69034+ /* obtain and lock "fake" znode protecting changes in tree height. */
69035+ result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
69036+ &handle_for_uber);
69037+ if (result == 0) {
69038+ uber = handle_for_uber.node;
69039+
69040+ znode_make_dirty(uber);
69041+
69042+ /* don't take a long term lock on @new_root. Take the spinlock. */
69043+
69044+ write_lock_tree(tree);
69045+
69046+ tree->root_block = *new_root_blk;
69047+ --tree->height;
69048+
69049+ /* recalculate max balance overhead */
69050+ tree->estimate_one_insert = estimate_one_insert_item(tree);
69051+
69052+ assert("nikita-1202",
69053+ tree->height == znode_get_level(new_root));
69054+
69055+ /* new root is a child of the "fake" node */
69056+ init_parent_coord(&new_root->in_parent, uber);
69057+ ++uber->c_count;
69058+
69059+ /* sibling_list_insert_nolock(new_root, NULL); */
69060+ write_unlock_tree(tree);
69061+
69062+ /* reinitialise old root. */
69063+ result = node_plugin_by_node(old_root)->init(old_root);
69064+ znode_make_dirty(old_root);
69065+ if (result == 0) {
69066+ assert("nikita-1279", node_is_empty(old_root));
69067+ ZF_SET(old_root, JNODE_HEARD_BANSHEE);
69068+ old_root->c_count = 0;
69069+ }
69070+ }
69071+ done_lh(&handle_for_uber);
69072+
69073+ return result;
69074+}
69075+
69076+/* remove tree root
69077+
69078+ This function removes the tree root, decreasing the tree height by one. The
69079+ tree root and its only child (which is going to become the new tree root)
69080+ are write-locked on entry.
69081+
69082+ To remove the tree root we need to take a lock on the special "fake" znode
69083+ that protects changes of the tree height. See comments in
69084+ reiser4_add_tree_root() for more on this.
69085+
69086+ Also, parent pointers have to be updated in the
69087+ old and new root. To simplify the code, the function is split in two: the outer
69088+ reiser4_kill_tree_root() collects all necessary arguments and calls
69089+ reiser4_kill_root() to do the actual job.
69090+
69091+*/
69092+int reiser4_kill_tree_root(znode * old_root /* tree root that we are
69093+ removing*/)
69094+{
69095+ int result;
69096+ coord_t down_link;
69097+ znode *new_root;
69098+ reiser4_tree *tree;
69099+
69100+ assert("umka-266", current_tree != NULL);
69101+ assert("nikita-1194", old_root != NULL);
69102+ assert("nikita-1196", znode_is_root(old_root));
69103+ assert("nikita-1200", node_num_items(old_root) == 1);
69104+ assert("nikita-1401", znode_is_write_locked(old_root));
69105+
69106+ coord_init_first_unit(&down_link, old_root);
69107+
69108+ tree = znode_get_tree(old_root);
69109+ new_root = child_znode(&down_link, old_root, 0, 1);
69110+ if (!IS_ERR(new_root)) {
69111+ result =
69112+ reiser4_kill_root(tree, old_root, new_root,
69113+ znode_get_block(new_root));
69114+ zput(new_root);
69115+ } else
69116+ result = PTR_ERR(new_root);
69117+
69118+ return result;
69119+}
69120+
69121+/* Make Linus happy.
69122+ Local variables:
69123+ c-indentation-style: "K&R"
69124+ mode-name: "LC"
69125+ c-basic-offset: 8
69126+ tab-width: 8
69127+ fill-column: 120
69128+ scroll-step: 1
69129+ End:
69130+*/
69131diff -urN linux-2.6.24.orig/fs/reiser4/tree_mod.h linux-2.6.24/fs/reiser4/tree_mod.h
69132--- linux-2.6.24.orig/fs/reiser4/tree_mod.h 1970-01-01 03:00:00.000000000 +0300
69133+++ linux-2.6.24/fs/reiser4/tree_mod.h 2008-01-25 11:39:07.100249935 +0300
69134@@ -0,0 +1,29 @@
69135+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69136+ * reiser4/README */
69137+
69138+/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
69139+ * comments. */
69140+
69141+#if !defined( __REISER4_TREE_MOD_H__ )
69142+#define __REISER4_TREE_MOD_H__
69143+
69144+#include "forward.h"
69145+
69146+znode *reiser4_new_node(znode * brother, tree_level level);
69147+znode *reiser4_add_tree_root(znode * old_root, znode * fake);
69148+int reiser4_kill_tree_root(znode * old_root);
69149+void build_child_ptr_data(znode * child, reiser4_item_data * data);
69150+
69151+/* __REISER4_TREE_MOD_H__ */
69152+#endif
69153+
69154+/* Make Linus happy.
69155+ Local variables:
69156+ c-indentation-style: "K&R"
69157+ mode-name: "LC"
69158+ c-basic-offset: 8
69159+ tab-width: 8
69160+ fill-column: 120
69161+ scroll-step: 1
69162+ End:
69163+*/
69164diff -urN linux-2.6.24.orig/fs/reiser4/tree_walk.c linux-2.6.24/fs/reiser4/tree_walk.c
69165--- linux-2.6.24.orig/fs/reiser4/tree_walk.c 1970-01-01 03:00:00.000000000 +0300
69166+++ linux-2.6.24/fs/reiser4/tree_walk.c 2008-01-25 11:39:07.100249935 +0300
69167@@ -0,0 +1,927 @@
69168+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69169+ * reiser4/README */
69170+
69171+/* Routines and macros to:
69172+
69173+ get_left_neighbor()
69174+
69175+ get_right_neighbor()
69176+
69177+ get_parent()
69178+
69179+ get_first_child()
69180+
69181+ get_last_child()
69182+
69183+ various routines to walk the whole tree and do things to it like
69184+ repack it, or move it to tertiary storage. Please make them as
69185+ generic as is reasonable.
69186+
69187+*/
69188+
69189+#include "forward.h"
69190+#include "debug.h"
69191+#include "dformat.h"
69192+#include "coord.h"
69193+#include "plugin/item/item.h"
69194+#include "jnode.h"
69195+#include "znode.h"
69196+#include "tree_walk.h"
69197+#include "tree.h"
69198+#include "super.h"
69199+
69200+/* These macros are used internally in tree_walk.c in an attempt to make
69201+ lock_neighbor() code usable to build lock_parent(), lock_right_neighbor,
69202+ lock_left_neighbor */
69203+#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
69204+#define FIELD_OFFSET(name) offsetof(znode, name)
69205+#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
69206+#define LEFT_PTR_OFFSET FIELD_OFFSET(left)
69207+#define RIGHT_PTR_OFFSET FIELD_OFFSET(right)
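+/* Editorial note (not part of the original patch): with the definitions
+ * above, for example
+ *
+ *	GET_NODE_BY_PTR_OFFSET(node, PARENT_PTR_OFFSET)
+ *
+ * evaluates to node->in_parent.node, and LEFT_PTR_OFFSET/RIGHT_PTR_OFFSET
+ * select node->left and node->right the same way. This is what lets one
+ * lock_neighbor() body serve the parent, left and right cases. */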
69208+
69209+/* This is the generic procedure to get and lock a `generic' neighbor (left or
69210+ right neighbor, or parent). It implements the common algorithm for all cases
69211+ of taking a lock on a neighbor node; only the znode structure field differs
69212+ in each case. This is parameterized by the ptr_offset argument, which is the
69213+ byte offset of the pointer to the desired neighbor within the current node's
69214+ znode structure. This function should be called with the tree lock held */
69215+static int lock_neighbor(
69216+ /* resulting lock handle */
69217+ lock_handle * result,
69218+ /* znode to lock */
69219+ znode * node,
69220+ /* pointer to neighbor (or parent) znode field offset, in bytes from
69221+ the base address of znode structure */
69222+ int ptr_offset,
69223+ /* lock mode for longterm_lock_znode call */
69224+ znode_lock_mode mode,
69225+ /* lock request for longterm_lock_znode call */
69226+ znode_lock_request req,
69227+ /* GN_* flags */
69228+ int flags, int rlocked)
69229+{
69230+ reiser4_tree *tree = znode_get_tree(node);
69231+ znode *neighbor;
69232+ int ret;
69233+
69234+ assert("umka-236", node != NULL);
69235+ assert("umka-237", tree != NULL);
69236+ assert_rw_locked(&(tree->tree_lock));
69237+
69238+ if (flags & GN_TRY_LOCK)
69239+ req |= ZNODE_LOCK_NONBLOCK;
69240+ if (flags & GN_SAME_ATOM)
69241+ req |= ZNODE_LOCK_DONT_FUSE;
69242+
69243+ /* get the neighbor's address by using the sibling link; quit the while
69244+ loop (and return) if the link is not available. */
69245+ while (1) {
69246+ neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
69247+
69248+ /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
69249+ * node pointed by it is not connected.
69250+ *
69251+ * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
69252+ * check and allows passing reference to not connected znode to
69253+ * subsequent longterm_lock_znode() call. This kills possible
69254+ * busy loop if we are trying to get longterm lock on locked but
69255+ * not yet connected parent node. */
69256+ if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
69257+ || znode_is_connected(neighbor))) {
69258+ return RETERR(-E_NO_NEIGHBOR);
69259+ }
69260+
69261+ /* protect it from deletion. */
69262+ zref(neighbor);
69263+
69264+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
69265+
69266+ ret = longterm_lock_znode(result, neighbor, mode, req);
69267+
69268+ /* The lock handle obtains its own reference, release the one from above. */
69269+ zput(neighbor);
69270+
69271+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
69272+
69273+ /* restart if the node we got a reference to is being
69274+ invalidated. We should not get a reference to this node
69275+ again. */
69276+ if (ret == -EINVAL)
69277+ continue;
69278+ if (ret)
69279+ return ret;
69280+
69281+ /* check if neighbor link still points to just locked znode;
69282+ the link could have been changed while the process slept. */
69283+ if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
69284+ return 0;
69285+
69286+ /* znode was locked by mistake; unlock it and restart locking
69287+ process from beginning. */
69288+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
69289+ longterm_unlock_znode(result);
69290+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
69291+ }
69292+}
69293+
69294+/* get parent node with longterm lock, accepts GN* flags. */
69295+int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
69296+ znode * node /* child node */ ,
69297+ znode_lock_mode mode
69298+ /* type of lock: read or write */ ,
69299+ int flags /* GN_* flags */ )
69300+{
69301+ int result;
69302+
69303+ read_lock_tree(znode_get_tree(node));
69304+ result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
69305+ ZNODE_LOCK_HIPRI, flags, 1);
69306+ read_unlock_tree(znode_get_tree(node));
69307+ return result;
69308+}
69309+
69310+/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT
69311+ bit in @flags parameter */
69312+/* Audited by: umka (2002.06.14) */
69313+static inline int
69314+lock_side_neighbor(lock_handle * result,
69315+ znode * node, znode_lock_mode mode, int flags, int rlocked)
69316+{
69317+ int ret;
69318+ int ptr_offset;
69319+ znode_lock_request req;
69320+
69321+ if (flags & GN_GO_LEFT) {
69322+ ptr_offset = LEFT_PTR_OFFSET;
69323+ req = ZNODE_LOCK_LOPRI;
69324+ } else {
69325+ ptr_offset = RIGHT_PTR_OFFSET;
69326+ req = ZNODE_LOCK_HIPRI;
69327+ }
69328+
69329+ ret =
69330+ lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
69331+
69332+ if (ret == -E_NO_NEIGHBOR) /* if we walk left or right, -E_NO_NEIGHBOR does
69333+ * not guarantee that the neighbor is absent in
69334+ * the tree; in this case we return -ENOENT,
69335+ * meaning the neighbor was at least not found
69336+ * in the cache */
69337+ return RETERR(-ENOENT);
69338+
69339+ return ret;
69340+}
69341+
69342+#if REISER4_DEBUG
69343+
69344+int check_sibling_list(znode * node)
69345+{
69346+ znode *scan;
69347+ znode *next;
69348+
69349+ assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
69350+
69351+ if (node == NULL)
69352+ return 1;
69353+
69354+ if (ZF_ISSET(node, JNODE_RIP))
69355+ return 1;
69356+
69357+ assert("nikita-3270", node != NULL);
69358+ assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
69359+
69360+ for (scan = node; znode_is_left_connected(scan); scan = next) {
69361+ next = scan->left;
69362+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
69363+ assert("nikita-3271", znode_is_right_connected(next));
69364+ assert("nikita-3272", next->right == scan);
69365+ } else
69366+ break;
69367+ }
69368+ for (scan = node; znode_is_right_connected(scan); scan = next) {
69369+ next = scan->right;
69370+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
69371+ assert("nikita-3273", znode_is_left_connected(next));
69372+ assert("nikita-3274", next->left == scan);
69373+ } else
69374+ break;
69375+ }
69376+ return 1;
69377+}
69378+
69379+#endif
69380+
69381+/* Znode sibling pointer maintenance. */
69382+
69383+/* Znode sibling pointers are established between any neighboring nodes which
69384+ are in cache. There are two znode state bits (JNODE_LEFT_CONNECTED,
69385+ JNODE_RIGHT_CONNECTED): if the left or right sibling pointer contains an
69386+ actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
69387+
69388+ Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
69389+ take care of searching for znode neighbors (a hash table lookup may be
69390+ required), establishing sibling pointers between them and setting the
69391+ JNODE_*_CONNECTED state bits. */
69392+
69393+/* adjusting of sibling pointers and `connected' states for two
69394+ neighbors; works if one neighbor is NULL (was not found). */
69395+
69396+/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
69397+void link_left_and_right(znode * left, znode * right)
69398+{
69399+ assert("nikita-3275", check_sibling_list(left));
69400+ assert("nikita-3275", check_sibling_list(right));
69401+
69402+ if (left != NULL) {
69403+ if (left->right == NULL) {
69404+ left->right = right;
69405+ ZF_SET(left, JNODE_RIGHT_CONNECTED);
69406+
69407+ ON_DEBUG(left->right_version =
69408+ atomic_inc_return(&delim_key_version);
69409+ );
69410+
69411+ } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
69412+ && left->right != right) {
69413+
69414+ ON_DEBUG(left->right->left_version =
69415+ atomic_inc_return(&delim_key_version);
69416+ left->right_version =
69417+ atomic_inc_return(&delim_key_version););
69418+
69419+ left->right->left = NULL;
69420+ left->right = right;
69421+ ZF_SET(left, JNODE_RIGHT_CONNECTED);
69422+ } else
69423+ /*
69424+ * there is a race condition in renew_sibling_link()
69425+ * and the assertions below check that it is the only
69426+ * one there. Thread T1 calls renew_sibling_link() without
69427+ * GN_NO_ALLOC flag. zlook() doesn't find neighbor
69428+ * node, but before T1 gets to the
69429+ * link_left_and_right(), another thread T2 creates
69430+ * the neighbor node and connects it. The check for
69431+ * left->right == NULL above protects T1 from
69432+ * overwriting correct left->right pointer installed
69433+ * by T2.
69434+ */
69435+ assert("nikita-3302",
69436+ right == NULL || left->right == right);
69437+ }
69438+ if (right != NULL) {
69439+ if (right->left == NULL) {
69440+ right->left = left;
69441+ ZF_SET(right, JNODE_LEFT_CONNECTED);
69442+
69443+ ON_DEBUG(right->left_version =
69444+ atomic_inc_return(&delim_key_version);
69445+ );
69446+
69447+ } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
69448+ && right->left != left) {
69449+
69450+ ON_DEBUG(right->left->right_version =
69451+ atomic_inc_return(&delim_key_version);
69452+ right->left_version =
69453+ atomic_inc_return(&delim_key_version););
69454+
69455+ right->left->right = NULL;
69456+ right->left = left;
69457+ ZF_SET(right, JNODE_LEFT_CONNECTED);
69458+
69459+ } else
69460+ assert("nikita-3303",
69461+ left == NULL || right->left == left);
69462+ }
69463+ assert("nikita-3275", check_sibling_list(left));
69464+ assert("nikita-3275", check_sibling_list(right));
69465+}
69466+
69467+/* Audited by: umka (2002.06.14) */
69468+static void link_znodes(znode * first, znode * second, int to_left)
69469+{
69470+ if (to_left)
69471+ link_left_and_right(second, first);
69472+ else
69473+ link_left_and_right(first, second);
69474+}
69475+
69476+/* get the next coord unit position (to the left or to the right, depending
69477+ on the GN_GO_LEFT bit in flags) in the horizontal direction, even across a
69478+ node boundary. Should be called under the tree lock; it protects the
69479+ nonexistence of a sibling link on the parent level if lock_side_neighbor()
69480+ fails with -ENOENT. */
69481+static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
69482+{
69483+ int ret;
69484+ znode *node;
69485+ reiser4_tree *tree;
69486+
69487+ assert("umka-243", coord != NULL);
69488+ assert("umka-244", handle != NULL);
69489+ assert("zam-1069", handle->node == NULL);
69490+
69491+ ret =
69492+ (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
69493+ coord_next_unit(coord);
69494+ if (!ret)
69495+ return 0;
69496+
69497+ ret =
69498+ lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
69499+ if (ret)
69500+ return ret;
69501+
69502+ node = handle->node;
69503+ tree = znode_get_tree(node);
69504+ write_unlock_tree(tree);
69505+
69506+ coord_init_zero(coord);
69507+
69508+ /* We avoid a synchronous read here if the GN_ASYNC flag is specified. */
69509+ if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
69510+ ret = jstartio(ZJNODE(handle->node));
69511+ if (!ret)
69512+ ret = -E_REPEAT;
69513+ goto error_locked;
69514+ }
69515+
69516+ /* the corresponding zrelse() should be called by the clients of
69517+ far_next_coord(), at the place where this node gets unlocked. */
69518+ ret = zload(handle->node);
69519+ if (ret)
69520+ goto error_locked;
69521+
69522+ if (flags & GN_GO_LEFT)
69523+ coord_init_last_unit(coord, node);
69524+ else
69525+ coord_init_first_unit(coord, node);
69526+
69527+ if (0) {
69528+ error_locked:
69529+ longterm_unlock_znode(handle);
69530+ }
69531+ write_lock_tree(tree);
69532+ return ret;
69533+}
69534+
69535+/* Very significant function which performs a step in the horizontal direction
69536+ when a sibling pointer is not available. In fact, it is the only function
69537+ which does so.
69538+ Note: this function does not restore locking status at exit; the
69539+ caller should take care of proper unlocking and zrelse-ing */
69540+static int
69541+renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
69542+ tree_level level, int flags, int *nr_locked)
69543+{
69544+ int ret;
69545+ int to_left = flags & GN_GO_LEFT;
69546+ reiser4_block_nr da;
69547+ /* parent of the neighbor node; it is set to the child's parent until
69548+ we detect that the child and the neighbor do not share a parent */
69549+ znode *side_parent = coord->node;
69550+ reiser4_tree *tree = znode_get_tree(child);
69551+ znode *neighbor = NULL;
69552+
69553+ assert("umka-245", coord != NULL);
69554+ assert("umka-246", handle != NULL);
69555+ assert("umka-247", child != NULL);
69556+ assert("umka-303", tree != NULL);
69557+
69558+ init_lh(handle);
69559+ write_lock_tree(tree);
69560+ ret = far_next_coord(coord, handle, flags);
69561+
69562+ if (ret) {
69563+ if (ret != -ENOENT) {
69564+ write_unlock_tree(tree);
69565+ return ret;
69566+ }
69567+ } else {
69568+ item_plugin *iplug;
69569+
69570+ if (handle->node != NULL) {
69571+ (*nr_locked)++;
69572+ side_parent = handle->node;
69573+ }
69574+
69575+ /* does the coord object point to an internal item? We do not
69576+ support sibling pointers between znodes for formatted and
69577+ unformatted nodes, and return -E_NO_NEIGHBOR in that case. */
69578+ iplug = item_plugin_by_coord(coord);
69579+ if (!item_is_internal(coord)) {
69580+ link_znodes(child, NULL, to_left);
69581+ write_unlock_tree(tree);
69582+ /* we know there can't be formatted neighbor */
69583+ return RETERR(-E_NO_NEIGHBOR);
69584+ }
69585+ write_unlock_tree(tree);
69586+
69587+ iplug->s.internal.down_link(coord, NULL, &da);
69588+
69589+ if (flags & GN_NO_ALLOC) {
69590+ neighbor = zlook(tree, &da);
69591+ } else {
69592+ neighbor =
69593+ zget(tree, &da, side_parent, level,
69594+ reiser4_ctx_gfp_mask_get());
69595+ }
69596+
69597+ if (IS_ERR(neighbor)) {
69598+ ret = PTR_ERR(neighbor);
69599+ return ret;
69600+ }
69601+
69602+ if (neighbor)
69603+ /* update delimiting keys */
69604+ set_child_delimiting_keys(coord->node, coord, neighbor);
69605+
69606+ write_lock_tree(tree);
69607+ }
69608+
69609+ if (likely(neighbor == NULL ||
69610+ (znode_get_level(child) == znode_get_level(neighbor)
69611+ && child != neighbor)))
69612+ link_znodes(child, neighbor, to_left);
69613+ else {
69614+ warning("nikita-3532",
69615+ "Sibling nodes on the different levels: %i != %i\n",
69616+ znode_get_level(child), znode_get_level(neighbor));
69617+ ret = RETERR(-EIO);
69618+ }
69619+
69620+ write_unlock_tree(tree);
69621+
69622+ /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */
69623+ if (neighbor != NULL && (flags & GN_NO_ALLOC))
69624+ /* atomic_dec(&ZJNODE(neighbor)->x_count); */
69625+ zput(neighbor);
69626+
69627+ return ret;
69628+}
69629+
69630+/* This function is for establishing of one side relation. */
69631+/* Audited by: umka (2002.06.14) */
69632+static int connect_one_side(coord_t * coord, znode * node, int flags)
69633+{
69634+ coord_t local;
69635+ lock_handle handle;
69636+ int nr_locked;
69637+ int ret;
69638+
69639+ assert("umka-248", coord != NULL);
69640+ assert("umka-249", node != NULL);
69641+
69642+ coord_dup_nocheck(&local, coord);
69643+
69644+ init_lh(&handle);
69645+
69646+ ret =
69647+ renew_sibling_link(&local, &handle, node, znode_get_level(node),
69648+ flags | GN_NO_ALLOC, &nr_locked);
69649+
69650+ if (handle.node != NULL) {
69651+ /* complementary operations for zload() and lock() in far_next_coord() */
69652+ zrelse(handle.node);
69653+ longterm_unlock_znode(&handle);
69654+ }
69655+
69656+ /* we swallow error codes which are not interesting to us, because we
69657+ run renew_sibling_link() only for znode connection. */
69658+ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
69659+ return 0;
69660+
69661+ return ret;
69662+}
69663+
69664+/* if @child is not in `connected' state, performs hash searches for left and
69665+ right neighbor nodes and establishes horizontal sibling links */
69666+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69667+int connect_znode(coord_t * parent_coord, znode * child)
69668+{
69669+ reiser4_tree *tree = znode_get_tree(child);
69670+ int ret = 0;
69671+
69672+ assert("zam-330", parent_coord != NULL);
69673+ assert("zam-331", child != NULL);
69674+ assert("zam-332", parent_coord->node != NULL);
69675+ assert("umka-305", tree != NULL);
69676+
69677+ /* it is trivial to `connect' root znode because it can't have
69678+ neighbors */
69679+ if (znode_above_root(parent_coord->node)) {
69680+ child->left = NULL;
69681+ child->right = NULL;
69682+ ZF_SET(child, JNODE_LEFT_CONNECTED);
69683+ ZF_SET(child, JNODE_RIGHT_CONNECTED);
69684+
69685+ ON_DEBUG(child->left_version =
69686+ atomic_inc_return(&delim_key_version);
69687+ child->right_version =
69688+ atomic_inc_return(&delim_key_version););
69689+
69690+ return 0;
69691+ }
69692+
69693+ /* load parent node */
69694+ coord_clear_iplug(parent_coord);
69695+ ret = zload(parent_coord->node);
69696+
69697+ if (ret != 0)
69698+ return ret;
69699+
69700+ /* protect `connected' state check by tree_lock */
69701+ read_lock_tree(tree);
69702+
69703+ if (!znode_is_right_connected(child)) {
69704+ read_unlock_tree(tree);
69705+ /* connect right (default is right) */
69706+ ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
69707+ if (ret)
69708+ goto zrelse_and_ret;
69709+
69710+ read_lock_tree(tree);
69711+ }
69712+
69713+ ret = znode_is_left_connected(child);
69714+
69715+ read_unlock_tree(tree);
69716+
69717+ if (!ret) {
69718+ ret =
69719+ connect_one_side(parent_coord, child,
69720+ GN_NO_ALLOC | GN_GO_LEFT);
69721+ } else
69722+ ret = 0;
69723+
69724+ zrelse_and_ret:
69725+ zrelse(parent_coord->node);
69726+
69727+ return ret;
69728+}
69729+
69730+/* this function is like renew_sibling_link() but allocates the neighbor node
69731+ if it doesn't exist and `connects' it. It may require making two steps in
69732+ the horizontal direction: the first to find/allocate the neighbor node,
69733+ the second to find the neighbor's neighbor in order to connect the freshly
69734+ allocated znode. */
69735+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69736+static int
69737+renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
69738+{
69739+ coord_t local;
69740+ lock_handle empty[2];
69741+ reiser4_tree *tree = znode_get_tree(node);
69742+ znode *neighbor = NULL;
69743+ int nr_locked = 0;
69744+ int ret;
69745+
69746+ assert("umka-250", coord != NULL);
69747+ assert("umka-251", node != NULL);
69748+ assert("umka-307", tree != NULL);
69749+ assert("umka-308", level <= tree->height);
69750+
69751+ /* umka (2002.06.14)
69752+	   There should probably be a check here that the given "level" is valid.
69753+ Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
69754+ */
69755+
69756+ coord_dup(&local, coord);
69757+
69758+ ret =
69759+ renew_sibling_link(&local, &empty[0], node, level,
69760+ flags & ~GN_NO_ALLOC, &nr_locked);
69761+ if (ret)
69762+ goto out;
69763+
69764+ /* tree lock is not needed here because we keep parent node(s) locked
69765+ and reference to neighbor znode incremented */
69766+ neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
69767+
69768+ read_lock_tree(tree);
69769+ ret = znode_is_connected(neighbor);
69770+ read_unlock_tree(tree);
69771+ if (ret) {
69772+ ret = 0;
69773+ goto out;
69774+ }
69775+
69776+ ret =
69777+ renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
69778+ flags | GN_NO_ALLOC, &nr_locked);
69779+ /* second renew_sibling_link() call is used for znode connection only,
69780+ so we can live with these errors */
69781+ if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
69782+ ret = 0;
69783+
69784+ out:
69785+
69786+ for (--nr_locked; nr_locked >= 0; --nr_locked) {
69787+ zrelse(empty[nr_locked].node);
69788+ longterm_unlock_znode(&empty[nr_locked]);
69789+ }
69790+
69791+ if (neighbor != NULL)
69792+ /* decrement znode reference counter without actually
69793+ releasing it. */
69794+ atomic_dec(&ZJNODE(neighbor)->x_count);
69795+
69796+ return ret;
69797+}
69798+
69799+/*
69800+ reiser4_get_neighbor() -- lock node's neighbor.
69801+
69802+ reiser4_get_neighbor() locks a node's neighbor (left or right one, depending
69803+ on the given parameter) using the sibling link to it. If the sibling link is
69804+ not available (i.e. the neighbor znode is not in cache) and the flags allow
69805+ reading blocks, we go one level up for information about the neighbor's disk
69806+ address. We lock the node's parent; if it is the common parent of both 'node'
69807+ and its neighbor, the neighbor's disk address is in the next (left or right)
69808+ downlink from the link that points to the original node. If not, we need to
69809+ lock the parent's neighbor, read its content and take the first (last)
69810+ downlink with the neighbor's disk address. That locking can be done using the
69811+ sibling link and the lock_neighbor() function, if a sibling link exists.
69812+ Otherwise we have to go up a level again until we find a common parent or a
69813+ valid sibling link, then go down allocating/connecting/locking/reading nodes
69814+ until the neighbor of the first one is locked.
69815+
69816+ @neighbor: result lock handle,
69817+ @node: a node which we lock neighbor of,
69818+ @lock_mode: lock mode {LM_READ, LM_WRITE},
69819+ @flags: logical OR of {GN_*} (see description above) subset.
69820+
69821+ @return: 0 on success, a negative value if locking was impossible due to an
69822+ error or to the lack of a neighbor node.
69823+*/
69824+
69825+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69826+int
69827+reiser4_get_neighbor(lock_handle * neighbor, znode * node,
69828+ znode_lock_mode lock_mode, int flags)
69829+{
69830+ reiser4_tree *tree = znode_get_tree(node);
69831+ lock_handle path[REAL_MAX_ZTREE_HEIGHT];
69832+
69833+ coord_t coord;
69834+
69835+ tree_level base_level;
69836+ tree_level h = 0;
69837+ int ret;
69838+
69839+ assert("umka-252", tree != NULL);
69840+ assert("umka-253", neighbor != NULL);
69841+ assert("umka-254", node != NULL);
69842+
69843+ base_level = znode_get_level(node);
69844+
69845+ assert("umka-310", base_level <= tree->height);
69846+
69847+ coord_init_zero(&coord);
69848+
69849+ again:
69850+	/* first, we try the simple lock_side_neighbor(), which requires the
69851+	   sibling link to exist */
69852+ read_lock_tree(tree);
69853+ ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
69854+ read_unlock_tree(tree);
69855+ if (!ret) {
69856+ /* load znode content if it was specified */
69857+ if (flags & GN_LOAD_NEIGHBOR) {
69858+ ret = zload(node);
69859+ if (ret)
69860+ longterm_unlock_znode(neighbor);
69861+ }
69862+ return ret;
69863+ }
69864+
69865+ /* only -ENOENT means we may look upward and try to connect
69866+ @node with its neighbor (if @flags allow us to do it) */
69867+ if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
69868+ return ret;
69869+
69870+	/* before establishing a sibling link we lock the parent node;
69871+	   renew_neighbor() requires this to work. */
69872+ init_lh(&path[0]);
69873+ ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
69874+ if (ret)
69875+ return ret;
69876+ if (znode_above_root(path[0].node)) {
69877+ longterm_unlock_znode(&path[0]);
69878+ return RETERR(-E_NO_NEIGHBOR);
69879+ }
69880+
69881+ while (1) {
69882+ znode *child = (h == 0) ? node : path[h - 1].node;
69883+ znode *parent = path[h].node;
69884+
69885+ ret = zload(parent);
69886+ if (ret)
69887+ break;
69888+
69889+ ret = find_child_ptr(parent, child, &coord);
69890+
69891+ if (ret) {
69892+ zrelse(parent);
69893+ break;
69894+ }
69895+
69896+ /* try to establish missing sibling link */
69897+ ret = renew_neighbor(&coord, child, h + base_level, flags);
69898+
69899+ zrelse(parent);
69900+
69901+ switch (ret) {
69902+ case 0:
69903+			/* unlocking the parent znode prevents a simple
69904+			   deadlock situation */
69905+ done_lh(&path[h]);
69906+
69907+			/* depending on the tree level we are at, we either
69908+			   repeat the first locking attempt ... */
69909+ if (h == 0)
69910+ goto again;
69911+
69912+ /* ... or repeat establishing of sibling link at
69913+ one level below. */
69914+ --h;
69915+ break;
69916+
69917+ case -ENOENT:
69918+ /* sibling link is not available -- we go
69919+ upward. */
69920+ init_lh(&path[h + 1]);
69921+ ret =
69922+ reiser4_get_parent(&path[h + 1], parent,
69923+ ZNODE_READ_LOCK);
69924+ if (ret)
69925+ goto fail;
69926+ ++h;
69927+ if (znode_above_root(path[h].node)) {
69928+ ret = RETERR(-E_NO_NEIGHBOR);
69929+ goto fail;
69930+ }
69931+ break;
69932+
69933+ case -E_DEADLOCK:
69934+			/* there was a lock request from a hi-pri locker.
69935+			   if possible, we unlock the last parent node(s)
69936+			   and then take the lock again. */
69937+ for (; reiser4_check_deadlock(); h--) {
69938+ done_lh(&path[h]);
69939+ if (h == 0)
69940+ goto fail;
69941+ }
69942+
69943+ break;
69944+
69945+ default: /* other errors. */
69946+ goto fail;
69947+ }
69948+ }
69949+ fail:
69950+ ON_DEBUG(check_lock_node_data(node));
69951+ ON_DEBUG(check_lock_data());
69952+
69953+ /* unlock path */
69954+ do {
69955+ /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
69956+ fail; path[0] is already done_lh-ed, therefore
69957+ longterm_unlock_znode(&path[h]); is not applicable */
69958+ done_lh(&path[h]);
69959+ --h;
69960+ } while (h + 1 != 0);
69961+
69962+ return ret;
69963+}
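To make the calling convention above concrete, here is a minimal hedged sketch (illustrative only, not part of the patch; it assumes a caller running inside a reiser4 context for which treating "no neighbor" as non-fatal is appropriate):

	/* Sketch: lock the left neighbor of @node, walking up levels if the
	 * sibling link is missing. Returns 1 with *result locked on success,
	 * 0 if @node is the leftmost node on its level, negative on error. */
	static int example_lock_left_neighbor(znode * node, lock_handle * result)
	{
		int ret;

		init_lh(result);
		ret = reiser4_get_neighbor(result, node, ZNODE_READ_LOCK,
					   GN_GO_LEFT | GN_CAN_USE_UPPER_LEVELS);
		if (ret == -E_NO_NEIGHBOR)
			return 0;	/* leftmost node: no neighbor exists */
		if (ret != 0)
			return ret;	/* e.g. -E_DEADLOCK or an I/O error */

		/* ... use result->node here, then release it with
		 * done_lh(result) when finished ... */
		return 1;
	}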
69964+
69965+/* remove node from sibling list */
69966+/* Audited by: umka (2002.06.14) */
69967+void sibling_list_remove(znode * node)
69968+{
69969+ reiser4_tree *tree;
69970+
69971+ tree = znode_get_tree(node);
69972+ assert("umka-255", node != NULL);
69973+ assert_rw_write_locked(&(tree->tree_lock));
69974+ assert("nikita-3275", check_sibling_list(node));
69975+
69976+ write_lock_dk(tree);
69977+ if (znode_is_right_connected(node) && node->right != NULL &&
69978+ znode_is_left_connected(node) && node->left != NULL) {
69979+ assert("zam-32245",
69980+ keyeq(znode_get_rd_key(node),
69981+ znode_get_ld_key(node->right)));
69982+ znode_set_rd_key(node->left, znode_get_ld_key(node->right));
69983+ }
69984+ write_unlock_dk(tree);
69985+
69986+ if (znode_is_right_connected(node) && node->right != NULL) {
69987+ assert("zam-322", znode_is_left_connected(node->right));
69988+ node->right->left = node->left;
69989+ ON_DEBUG(node->right->left_version =
69990+ atomic_inc_return(&delim_key_version);
69991+ );
69992+ }
69993+ if (znode_is_left_connected(node) && node->left != NULL) {
69994+ assert("zam-323", znode_is_right_connected(node->left));
69995+ node->left->right = node->right;
69996+ ON_DEBUG(node->left->right_version =
69997+ atomic_inc_return(&delim_key_version);
69998+ );
69999+ }
70000+
70001+ ZF_CLR(node, JNODE_LEFT_CONNECTED);
70002+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
70003+ ON_DEBUG(node->left = node->right = NULL;
70004+ node->left_version = atomic_inc_return(&delim_key_version);
70005+ node->right_version = atomic_inc_return(&delim_key_version););
70006+ assert("nikita-3276", check_sibling_list(node));
70007+}
70008+
70009+/* disconnect node from sibling list */
70010+void sibling_list_drop(znode * node)
70011+{
70012+ znode *right;
70013+ znode *left;
70014+
70015+ assert("nikita-2464", node != NULL);
70016+ assert("nikita-3277", check_sibling_list(node));
70017+
70018+ right = node->right;
70019+ if (right != NULL) {
70020+ assert("nikita-2465", znode_is_left_connected(right));
70021+ right->left = NULL;
70022+ ON_DEBUG(right->left_version =
70023+ atomic_inc_return(&delim_key_version);
70024+ );
70025+ }
70026+ left = node->left;
70027+ if (left != NULL) {
70028+ assert("zam-323", znode_is_right_connected(left));
70029+ left->right = NULL;
70030+ ON_DEBUG(left->right_version =
70031+ atomic_inc_return(&delim_key_version);
70032+ );
70033+ }
70034+ ZF_CLR(node, JNODE_LEFT_CONNECTED);
70035+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
70036+ ON_DEBUG(node->left = node->right = NULL;
70037+ node->left_version = atomic_inc_return(&delim_key_version);
70038+ node->right_version = atomic_inc_return(&delim_key_version););
70039+}
70040+
70041+/* Insert a new node into the sibling list. Regular balancing inserts the new
70042+ node after (at the right side of) an existing, locked node (@before), except
70043+ in the one case of adding a new tree root node; @before should be NULL then. */
70044+void sibling_list_insert_nolock(znode * new, znode * before)
70045+{
70046+ assert("zam-334", new != NULL);
70047+ assert("nikita-3298", !znode_is_left_connected(new));
70048+ assert("nikita-3299", !znode_is_right_connected(new));
70049+ assert("nikita-3300", new->left == NULL);
70050+ assert("nikita-3301", new->right == NULL);
70051+ assert("nikita-3278", check_sibling_list(new));
70052+ assert("nikita-3279", check_sibling_list(before));
70053+
70054+ if (before != NULL) {
70055+ assert("zam-333", znode_is_connected(before));
70056+ new->right = before->right;
70057+ new->left = before;
70058+ ON_DEBUG(new->right_version =
70059+ atomic_inc_return(&delim_key_version);
70060+ new->left_version =
70061+ atomic_inc_return(&delim_key_version););
70062+ if (before->right != NULL) {
70063+ before->right->left = new;
70064+ ON_DEBUG(before->right->left_version =
70065+ atomic_inc_return(&delim_key_version);
70066+ );
70067+ }
70068+ before->right = new;
70069+ ON_DEBUG(before->right_version =
70070+ atomic_inc_return(&delim_key_version);
70071+ );
70072+ } else {
70073+ new->right = NULL;
70074+ new->left = NULL;
70075+ ON_DEBUG(new->right_version =
70076+ atomic_inc_return(&delim_key_version);
70077+ new->left_version =
70078+ atomic_inc_return(&delim_key_version););
70079+ }
70080+ ZF_SET(new, JNODE_LEFT_CONNECTED);
70081+ ZF_SET(new, JNODE_RIGHT_CONNECTED);
70082+ assert("nikita-3280", check_sibling_list(new));
70083+ assert("nikita-3281", check_sibling_list(before));
70084+}
70085+
70086+/*
70087+ Local variables:
70088+ c-indentation-style: "K&R"
70089+ mode-name: "LC"
70090+ c-basic-offset: 8
70091+ tab-width: 8
70092+ fill-column: 80
70093+ End:
70094+*/
70095diff -urN linux-2.6.24.orig/fs/reiser4/tree_walk.h linux-2.6.24/fs/reiser4/tree_walk.h
70096--- linux-2.6.24.orig/fs/reiser4/tree_walk.h 1970-01-01 03:00:00.000000000 +0300
70097+++ linux-2.6.24/fs/reiser4/tree_walk.h 2008-01-25 11:39:07.100249935 +0300
70098@@ -0,0 +1,125 @@
70099+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
70100+
70101+/* definitions of reiser4 tree walk functions */
70102+
70103+#ifndef __FS_REISER4_TREE_WALK_H__
70104+#define __FS_REISER4_TREE_WALK_H__
70105+
70106+#include "debug.h"
70107+#include "forward.h"
70108+
70109+/* establishes horizontal links between cached znodes */
70110+int connect_znode(coord_t * coord, znode * node);
70111+
70112+/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
70113+ share the following return code conventions:
70114+
70115+ @return : 0 - OK,
70116+
70117+ZAM-FIXME-HANS: wrong return code name. Change them all.
70118+ -ENOENT - neighbor is not in cache, which is detected by the
70119+ absence of a sibling link.
70120+
70121+ -E_NO_NEIGHBOR - we are sure that the neighbor (or parent) node cannot
70122+ be found (because we are the left-/right-most node of
70123+ the tree, for example). This return code is also used
70124+ by reiser4_get_parent() when we see no parent link --
70125+ it means that our node is the root node.
70126+
70127+ -E_DEADLOCK - deadlock detected (a request from a high-priority
70128+ process was received); other error codes conform to
70129+ /usr/include/asm/errno.h .
70130+*/
70133+
70134+int
70135+reiser4_get_parent_flags(lock_handle * result, znode * node,
70136+ znode_lock_mode mode, int flags);
70137+
70138+/* bits definition for reiser4_get_neighbor function `flags' arg. */
70139+typedef enum {
70140+	/* If the sibling pointer is NULL, this flag allows get_neighbor() to
70141+	 * try to find a not-allocated, not-connected neighbor by going
70142+	 * through upper levels */
70143+ GN_CAN_USE_UPPER_LEVELS = 0x1,
70144+ /* locking left neighbor instead of right one */
70145+ GN_GO_LEFT = 0x2,
70146+ /* automatically load neighbor node content */
70147+ GN_LOAD_NEIGHBOR = 0x4,
70148+ /* return -E_REPEAT if can't lock */
70149+ GN_TRY_LOCK = 0x8,
70150+	/* used internally in tree_walk.c; causes renew_sibling_link() not to
70151+	   allocate a neighbor znode, but only to search for it in the znode cache */
70152+ GN_NO_ALLOC = 0x10,
70153+ /* do not go across atom boundaries */
70154+ GN_SAME_ATOM = 0x20,
70155+	/* allow locking of not-connected nodes */
70156+ GN_ALLOW_NOT_CONNECTED = 0x40,
70157+ /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */
70158+ GN_ASYNC = 0x80
70159+} znode_get_neigbor_flags;
70160+
70161+/* A commonly used wrapper for reiser4_get_parent_flags(). */
70162+static inline int reiser4_get_parent(lock_handle * result, znode * node,
70163+ znode_lock_mode mode)
70164+{
70165+ return reiser4_get_parent_flags(result, node, mode,
70166+ GN_ALLOW_NOT_CONNECTED);
70167+}
70168+
70169+int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
70170+ znode_lock_mode lock_mode, int flags);
70171+
70172+/* there are wrappers for most common usages of reiser4_get_neighbor() */
70173+static inline int
70174+reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
70175+ int flags)
70176+{
70177+ return reiser4_get_neighbor(result, node, lock_mode,
70178+ flags | GN_GO_LEFT);
70179+}
70180+
70181+static inline int
70182+reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
70183+ int flags)
70184+{
70185+ ON_DEBUG(check_lock_node_data(node));
70186+ ON_DEBUG(check_lock_data());
70187+ return reiser4_get_neighbor(result, node, lock_mode,
70188+ flags & (~GN_GO_LEFT));
70189+}
70190+
70191+extern void sibling_list_remove(znode * node);
70192+extern void sibling_list_drop(znode * node);
70193+extern void sibling_list_insert_nolock(znode * new, znode * before);
70194+extern void link_left_and_right(znode * left, znode * right);
70195+
70196+/* Functions called by tree_walk() when tree_walk() ... */
70197+struct tree_walk_actor {
70198+ /* ... meets a formatted node, */
70199+ int (*process_znode) (tap_t *, void *);
70200+ /* ... meets an extent, */
70201+ int (*process_extent) (tap_t *, void *);
70202+ /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
70203+ * node or extent processing functions. */
70204+ int (*before) (void *);
70205+};
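As a hedged illustration of how these callbacks fit together (a hypothetical actor, not part of the patch; the traversal driver that consumes a tree_walk_actor lives elsewhere in reiser4):

	/* Hypothetical actor that merely counts what the walk visits. */
	struct walk_stats {
		long znodes;
		long extents;
	};

	static int count_znode(tap_t * tap, void *arg)
	{
		((struct walk_stats *)arg)->znodes++;
		return 0;	/* returning 0 lets the traversal continue */
	}

	static int count_extent(tap_t * tap, void *arg)
	{
		((struct walk_stats *)arg)->extents++;
		return 0;
	}

	static int count_before(void *arg)
	{
		/* invoked when the traversal (re)starts, e.g. after -E_REPEAT */
		return 0;
	}

	static struct tree_walk_actor count_actor = {
		.process_znode = count_znode,
		.process_extent = count_extent,
		.before = count_before,
	};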
70206+
70207+#if REISER4_DEBUG
70208+int check_sibling_list(znode * node);
70209+#else
70210+#define check_sibling_list(n) (1)
70211+#endif
70212+
70213+#endif /* __FS_REISER4_TREE_WALK_H__ */
70214+
70215+/*
70216+ Local variables:
70217+ c-indentation-style: "K&R"
70218+ mode-name: "LC"
70219+ c-basic-offset: 8
70220+ tab-width: 8
70221+ fill-column: 120
70222+ End:
70223+*/
70224diff -urN linux-2.6.24.orig/fs/reiser4/txnmgr.c linux-2.6.24/fs/reiser4/txnmgr.c
70225--- linux-2.6.24.orig/fs/reiser4/txnmgr.c 1970-01-01 03:00:00.000000000 +0300
70226+++ linux-2.6.24/fs/reiser4/txnmgr.c 2008-01-25 11:39:07.108251996 +0300
70227@@ -0,0 +1,3164 @@
70228+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70229+ * reiser4/README */
70230+
70231+/* Joshua MacDonald wrote the first draft of this code. */
70232+
70233+/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
70234+filesystem scales only as well as its worst locking design. You need to
70235+substantially restructure this code. Josh was not as experienced a programmer
70236+as you. Particularly review how the locking style differs from what you did
70237+for znodes using hi-lo priority locking, and present to me an opinion on
70238+whether the differences are well founded. */
70239+
70240+/* I cannot help but to disagree with the sentiment above. Locking of
70241+ * transaction manager is _not_ badly designed, and, at the very least, is not
70242+ * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
70243+ * locking on znodes, especially on the root node of the tree. --nikita,
70244+ * 2003.10.13 */
70245+
70246+/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The
70247+ txnmgr processes capture_block requests and manages the relationship between jnodes and
70248+ atoms through the various stages of a transcrash, and it also oversees the fusion and
70249+ capture-on-copy processes. The main difficulty with this task is maintaining a
70250+ deadlock-free lock ordering between atoms and jnodes/handles. The reason for the
70251+ difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
70252+ must be broken. The main requirement is that atom-fusion be deadlock free, so once you
70253+ hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies
70254+ that any time you check the atom-pointer of a jnode or handle and then try to lock that
70255+ atom, you must use trylock() and possibly reverse the order.
70256+
70257+ This code implements the design documented at:
70258+
70259+ http://namesys.com/txn-doc.html
70260+
70261+ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
70262+above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this
70263+topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12
70264+year old --- define all technical terms used.
70265+
70266+*/
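The trylock-and-possibly-reverse rule described above is exactly what txnh_get_atom() and jnode_get_atom() implement further down in this file. Stripped of reiser4 specifics, a hedged sketch of the idiom (illustrative types and names only; the real versions below additionally take an atom reference so the atom cannot be freed while temporarily unlocked) looks like:

	/* Sketch: lock obj and obj->atom when lock ordering demands atom first. */
	struct toy_atom { spinlock_t alock; };
	struct toy_obj { spinlock_t guard; struct toy_atom *atom; };

	static struct toy_atom *toy_get_atom(struct toy_obj *obj)
	{
		struct toy_atom *atom;

		for (;;) {
			spin_lock(&obj->guard);
			atom = obj->atom;
			if (atom == NULL || spin_trylock(&atom->alock))
				break;	/* no atom, or both locks taken */
			/* trylock failed: drop the object lock, take the
			 * locks in the correct order, then revalidate. */
			spin_unlock(&obj->guard);
			spin_lock(&atom->alock);
			spin_lock(&obj->guard);
			if (obj->atom == atom)
				break;	/* pointer unchanged: done */
			spin_unlock(&obj->guard);
			spin_unlock(&atom->alock);
		}
		return atom;	/* obj locked; atom locked iff non-NULL */
	}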
70267+
70268+/* Thoughts on the external transaction interface:
70269+
70270+ In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which
70271+ creates state that lasts for the duration of a system call and is called at the start
70272+ of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
70273+ occupying the scope of a single system call. We wish to give certain applications an
70274+ interface to begin and close (commit) transactions. Since our implementation of
70275+ transactions does not yet support isolation, allowing an application to open a
70276+ transaction implies trusting it to later close the transaction. Part of the
70277+ transaction interface will be aimed at enabling that trust, but the interface for
70278+ actually using transactions is fairly narrow.
70279+
70280+ BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate
70281+ this identifier into a string that a shell-script could use, allowing you to start a
70282+ transaction by issuing a command. Once open, the transcrash should be set in the task
70283+ structure, and there should be options (I suppose) to allow it to be carried across
70284+ fork/exec. A transcrash has several options:
70285+
70286+ - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
70287+ on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to
70288+ capture on reads as well, it should set READ_FUSING.
70289+
70290+ - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
70291+ eventually close (or else the machine must crash). If the application dies an
70292+ unexpected death with an open transcrash, for example, or if it hangs for a long
70293+ duration, one solution (to avoid crashing the machine) is to simply close it anyway.
70294+ This is a dangerous option, but it is one way to solve the problem until isolated
70295+ transcrashes are available for untrusted applications.
70296+
70297+ It seems to be what databases do, though it is unclear how one avoids a DoS attack
70298+ creating a vulnerability based on resource starvation. Guaranteeing that some
70299+ minimum amount of computational resources are made available would seem more correct
70300+ than guaranteeing some amount of time. When we again have someone to code the work,
70301+ this issue should be considered carefully. -Hans
70302+
70303+ RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
70304+ many dirty blocks it expects. The reserve_blocks interface should be called at a point
70305+ where it is safe for the application to fail, because the system may not be able to
70306+ grant the allocation and the application must be able to back-out. For this reason,
70307+ the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
70308+ the application may also wish to extend the allocation after beginning its transcrash.
70309+
70310+ CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
70311+ modifications that require transaction protection. When isolated transactions are
70312+ supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a
70313+ RESERVE_BLOCKS call fails for the application, it should "abort" by calling
70314+ CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
70315+ why, for safety, the application should call RESERVE_BLOCKS before making any changes).
70316+
70317+ For actually implementing these out-of-system-call-scoped transcrashes, the
70318+ reiser4_context has a "txn_handle *trans" pointer that may be set to an open
70319+ transcrash. Currently there are no dynamically-allocated transcrashes, but there is a
70320+ "struct kmem_cache *_txnh_slab" created for that purpose in this file.
70321+*/
70322+
70323+/* Extending the other system call interfaces for future transaction features:
70324+
70325+ Specialized applications may benefit from passing flags to the ordinary system call
70326+ interface such as read(), write(), or stat(). For example, the application specifies
70327+ WRITE_FUSING by default but wishes to add that a certain read() command should be
70328+ treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data
70329+ read, or the file-data read? These issues are straight-forward, but there are a lot of
70330+ them and adding the necessary flags-passing code will be tedious.
70331+
70332+ When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
70333+ flag, which specifies that although it is a read operation being requested, a
70334+ write-lock should be taken. The reason is that read-locks are shared while write-locks
70335+ are exclusive, so taking a read-lock when a later-write is known in advance will often
70336+ leads to deadlock. If a reader knows it will write later, it should issue read
70337+ requests with the RMW flag set.
70338+*/
70339+
70340+/*
70341+ The znode/atom deadlock avoidance.
70342+
70343+ FIXME(Zam): writing of this comment is in progress.
70344+
70345+ The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of long-term
70346+ locking of atoms, which makes the reiser4 locking scheme more complex. It
70347+ had deadlocks until we implemented deadlock avoidance algorithms. Those
70348+ deadlocks looked like the following: one stopped thread waits for a long-term
70349+ lock on a znode, while the thread that owns that lock waits until fusion with
70350+ another atom is allowed.
70351+
70352+ The source of the deadlocks is an optimization of not capturing index nodes
70353+ for read. Let's prove it. Suppose we have a dumb node-capturing scheme which
70354+ unconditionally captures each block before locking it.
70355+
70356+ That scheme has no deadlocks. Let's begin with a thread whose atom's stage is
70357+ ASTAGE_CAPTURE_WAIT and which waits for a znode lock. The thread can't wait
70358+ for a capture because its stage allows fusion with any atom except those that
70359+ are currently being committed. The atom commit process can't deadlock because
70360+ the commit procedure does not acquire locks and does not fuse with other
70361+ atoms. Reiser4 does the capturing right before going to sleep inside the
70362+ longterm_lock_znode() function, which means that the znode we want to lock is
70363+ already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage. If we
70364+ continue the analysis we see that no process in the sequence may wait for
70365+ atom fusion. Thereby there are no deadlocks of the described kind.
70366+
70367+ The capturing optimization makes such deadlocks possible. A thread can wait
70368+ for a lock whose owner did not capture that node. The lock owner's current
70369+ atom is not fused with the first atom, so it does not reach the
70370+ ASTAGE_CAPTURE_WAIT state. A deadlock is possible when that atom meets
70371+ another one which is already in ASTAGE_CAPTURE_WAIT.
70372+
70373+ The deadlock avoidance scheme includes two algorithms:
70374+
70375+ The first algorithm is used when a thread captures a node which is locked but
70376+ not captured by another thread. Such nodes are marked MISSED_IN_CAPTURE at
70377+ the moment we skip their capturing. If such a node (marked MISSED_IN_CAPTURE)
70378+ is later captured by a thread whose current atom is in ASTAGE_CAPTURE_WAIT,
70379+ the routine which forces all lock owners to join the current atom is executed.
70380+
70381+ The second algorithm does not allow skipping the capture of already-captured nodes.
70382+
70383+ Both algorithms together prevent waiting for a long-term lock without fusing
70384+ with the atoms of all lock owners, which is the key ingredient of the
70385+ atom/znode locking deadlocks described above.
70386+*/
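Condensed into a toy decision function (illustrative pseudocode only; all names here are made up, and the real logic is spread across the capture path below), the two rules read:

	/* Toy model of the two deadlock-avoidance rules above. */
	struct toy_jnode { int captured; int locked; int missed_in_capture; };

	static int toy_may_skip_capture(struct toy_jnode *n)
	{
		/* Rule 2: capturing of an already captured node is never
		 * skipped (the capturer must fuse with the node's atom). */
		if (n->captured)
			return 0;
		if (n->locked) {
			/* Rule 1: the skipped node is marked, so that a
			 * later capture by an ASTAGE_CAPTURE_WAIT atom
			 * forces all lock owners to join that atom. */
			n->missed_in_capture = 1;
			return 1;
		}
		return 0;
	}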
70387+
70388+/*
70389+ * Transactions and mmap(2).
70390+ *
70391+ * 1. Transactions are not supported for accesses through mmap(2), because
70392+ * this would effectively amount to user-level transactions whose duration
70393+ * is beyond control of the kernel.
70394+ *
70395+ * 2. That said, we still want to preserve some decency with regard to
70396+ * mmap(2). During normal write(2) call, following sequence of events
70397+ * happens:
70398+ *
70399+ * 1. page is created;
70400+ *
70401+ * 2. jnode is created, dirtied and captured into current atom.
70402+ *
70403+ * 3. extent is inserted and modified.
70404+ *
70405+ * Steps (2) and (3) take place under long term lock on the twig node.
70406+ *
70407+ * When file is accessed through mmap(2) page is always created during
70408+ * page fault.
70409+ * After this (in reiser4_readpage()->reiser4_readpage_extent()):
70410+ *
70411+ * 1. if access is made to non-hole page new jnode is created, (if
70412+ * necessary)
70413+ *
70414+ * 2. if access is made to the hole page, jnode is not created (XXX
70415+ * not clear why).
70416+ *
70417+ * Also, even if page is created by write page fault it is not marked
70418+ * dirty immediately by handle_mm_fault(). Probably this is to avoid races
70419+ * with page write-out.
70420+ *
70421+ * Dirty bit installed by hardware is only transferred to the struct page
70422+ * later, when page is unmapped (in zap_pte_range(), or
70423+ * try_to_unmap_one()).
70424+ *
70425+ * So, with mmap(2) we have to handle following irksome situations:
70426+ *
70427+ * 1. there exists modified page (clean or dirty) without jnode
70428+ *
70429+ * 2. there exists modified page (clean or dirty) with clean jnode
70430+ *
70431+ * 3. clean page which is a part of atom can be transparently modified
70432+ * at any moment through mapping without becoming dirty.
70433+ *
70434+ * (1) and (2) can lead to the out-of-memory situation: ->writepage()
70435+ * doesn't know what to do with such pages and ->sync_sb()/->writepages()
70436+ * don't see them, because these methods operate on atoms.
70437+ *
70438+ * (3) can lead to the loss of data: suppose we have dirty page with dirty
70439+ * captured jnode captured by some atom. As part of early flush (for
70440+ * example) page was written out. Dirty bit was cleared on both page and
70441+ * jnode. After this page is modified through mapping, but kernel doesn't
70442+ * notice and just discards page and jnode as part of commit. (XXX
70443+ * actually it doesn't, because to reclaim page ->releasepage() has to be
70444+ * called and before this dirty bit will be transferred to the struct
70445+ * page).
70446+ *
70447+ */
70448+
70449+#include "debug.h"
70450+#include "txnmgr.h"
70451+#include "jnode.h"
70452+#include "znode.h"
70453+#include "block_alloc.h"
70454+#include "tree.h"
70455+#include "wander.h"
70456+#include "ktxnmgrd.h"
70457+#include "super.h"
70458+#include "page_cache.h"
70459+#include "reiser4.h"
70460+#include "vfs_ops.h"
70461+#include "inode.h"
70462+#include "flush.h"
70463+
70464+#include <asm/atomic.h>
70465+#include <linux/types.h>
70466+#include <linux/fs.h>
70467+#include <linux/mm.h>
70468+#include <linux/slab.h>
70469+#include <linux/pagemap.h>
70470+#include <linux/writeback.h>
70471+#include <linux/swap.h> /* for totalram_pages */
70472+
70473+static void atom_free(txn_atom * atom);
70474+
70475+static int commit_txnh(txn_handle * txnh);
70476+
70477+static void wakeup_atom_waitfor_list(txn_atom * atom);
70478+static void wakeup_atom_waiting_list(txn_atom * atom);
70479+
70480+static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
70481+
70482+static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
70483+
70484+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
70485+
70486+static int capture_init_fusion(jnode * node, txn_handle * txnh,
70487+ txn_capture mode);
70488+
70489+static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
70490+
70491+static void capture_fuse_into(txn_atom * small, txn_atom * large);
70492+
70493+void reiser4_invalidate_list(struct list_head *);
70494+
70495+/* GENERIC STRUCTURES */
70496+
70497+typedef struct _txn_wait_links txn_wait_links;
70498+
70499+struct _txn_wait_links {
70500+ lock_stack *_lock_stack;
70501+ struct list_head _fwaitfor_link;
70502+ struct list_head _fwaiting_link;
70503+ int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
70504+ int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
70505+};
70506+
70507+/* FIXME: In theory, we should be using the slab cache init & destructor
70508+ methods instead of, e.g., jnode_init, etc. */
70509+static struct kmem_cache *_atom_slab = NULL;
70510+/* this is for user-visible, cross system-call transactions. */
70511+static struct kmem_cache *_txnh_slab = NULL;
70512+
70513+/**
70514+ * init_txnmgr_static - create transaction manager slab caches
70515+ *
70516+ * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module
70517+ * initialization.
70518+ */
70519+int init_txnmgr_static(void)
70520+{
70521+ assert("jmacd-600", _atom_slab == NULL);
70522+ assert("jmacd-601", _txnh_slab == NULL);
70523+
70524+ ON_DEBUG(atomic_set(&flush_cnt, 0));
70525+
70526+ _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
70527+ SLAB_HWCACHE_ALIGN |
70528+ SLAB_RECLAIM_ACCOUNT, NULL);
70529+ if (_atom_slab == NULL)
70530+ return RETERR(-ENOMEM);
70531+
70532+ _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
70533+ SLAB_HWCACHE_ALIGN, NULL);
70534+ if (_txnh_slab == NULL) {
70535+ kmem_cache_destroy(_atom_slab);
70536+ _atom_slab = NULL;
70537+ return RETERR(-ENOMEM);
70538+ }
70539+
70540+ return 0;
70541+}
70542+
70543+/**
70544+ * done_txnmgr_static - delete txn_atom and txn_handle caches
70545+ *
70546+ * This is called on reiser4 module unloading or system shutdown.
70547+ */
70548+void done_txnmgr_static(void)
70549+{
70550+ destroy_reiser4_cache(&_atom_slab);
70551+ destroy_reiser4_cache(&_txnh_slab);
70552+}
70553+
70554+/**
70555+ * reiser4_init_txnmgr - initialize a new transaction manager
70556+ * @mgr: pointer to transaction manager embedded in reiser4 super block
70557+ *
70558+ * This is called on mount. Makes necessary initializations.
70559+ */
70560+void reiser4_init_txnmgr(txn_mgr *mgr)
70561+{
70562+ assert("umka-169", mgr != NULL);
70563+
70564+ mgr->atom_count = 0;
70565+ mgr->id_count = 1;
70566+ INIT_LIST_HEAD(&mgr->atoms_list);
70567+ spin_lock_init(&mgr->tmgr_lock);
70568+ mutex_init(&mgr->commit_mutex);
70569+}
70570+
70571+/**
70572+ * reiser4_done_txnmgr - stop transaction manager
70573+ * @mgr: pointer to transaction manager embedded in reiser4 super block
70574+ *
70575+ * This is called on umount. Does sanity checks.
70576+ */
70577+void reiser4_done_txnmgr(txn_mgr *mgr)
70578+{
70579+ assert("umka-170", mgr != NULL);
70580+ assert("umka-1701", list_empty_careful(&mgr->atoms_list));
70581+ assert("umka-1702", mgr->atom_count == 0);
70582+}
70583+
70584+/* Initialize a transaction handle. */
70585+/* Audited by: umka (2002.06.13) */
70586+static void txnh_init(txn_handle * txnh, txn_mode mode)
70587+{
70588+ assert("umka-171", txnh != NULL);
70589+
70590+ txnh->mode = mode;
70591+ txnh->atom = NULL;
70592+ reiser4_ctx_gfp_mask_set();
70593+ txnh->flags = 0;
70594+ spin_lock_init(&txnh->hlock);
70595+ INIT_LIST_HEAD(&txnh->txnh_link);
70596+}
70597+
70598+#if REISER4_DEBUG
70599+/* Check if a transaction handle is clean. */
70600+static int txnh_isclean(txn_handle * txnh)
70601+{
70602+ assert("umka-172", txnh != NULL);
70603+ return txnh->atom == NULL &&
70604+ LOCK_CNT_NIL(spin_locked_txnh);
70605+}
70606+#endif
70607+
70608+/* Initialize an atom. */
70609+static void atom_init(txn_atom * atom)
70610+{
70611+ int level;
70612+
70613+ assert("umka-173", atom != NULL);
70614+
70615+ memset(atom, 0, sizeof(txn_atom));
70616+
70617+ atom->stage = ASTAGE_FREE;
70618+ atom->start_time = jiffies;
70619+
70620+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
70621+ INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
70622+
70623+ INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
70624+ INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
70625+ INIT_LIST_HEAD(ATOM_WB_LIST(atom));
70626+ INIT_LIST_HEAD(&atom->inodes);
70627+ spin_lock_init(&(atom->alock));
70628+ /* list of transaction handles */
70629+ INIT_LIST_HEAD(&atom->txnh_list);
70630+ /* link to transaction manager's list of atoms */
70631+ INIT_LIST_HEAD(&atom->atom_link);
70632+ INIT_LIST_HEAD(&atom->fwaitfor_list);
70633+ INIT_LIST_HEAD(&atom->fwaiting_list);
70634+ blocknr_set_init(&atom->delete_set);
70635+ blocknr_set_init(&atom->wandered_map);
70636+
70637+ init_atom_fq_parts(atom);
70638+}
70639+
70640+#if REISER4_DEBUG
70641+/* Check if an atom is clean. */
70642+static int atom_isclean(txn_atom * atom)
70643+{
70644+ int level;
70645+
70646+ assert("umka-174", atom != NULL);
70647+
70648+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
70649+ if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
70650+ return 0;
70651+ }
70652+ }
70653+
70654+ return atom->stage == ASTAGE_FREE &&
70655+ atom->txnh_count == 0 &&
70656+ atom->capture_count == 0 &&
70657+ atomic_read(&atom->refcount) == 0 &&
70658+ (&atom->atom_link == atom->atom_link.next &&
70659+ &atom->atom_link == atom->atom_link.prev) &&
70660+ list_empty_careful(&atom->txnh_list) &&
70661+ list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
70662+ list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
70663+ list_empty_careful(ATOM_WB_LIST(atom)) &&
70664+ list_empty_careful(&atom->fwaitfor_list) &&
70665+ list_empty_careful(&atom->fwaiting_list) &&
70666+ atom_fq_parts_are_clean(atom);
70667+}
70668+#endif
70669+
70670+/* Begin a transaction in this context. Currently this uses the reiser4_context's
70671+ trans_in_ctx, which means that transaction handles are stack-allocated. Eventually
70672+ this will be extended to allow transaction handles to span several contexts. */
70673+/* Audited by: umka (2002.06.13) */
70674+void reiser4_txn_begin(reiser4_context * context)
70675+{
70676+ assert("jmacd-544", context->trans == NULL);
70677+
70678+ context->trans = &context->trans_in_ctx;
70679+
70680+ /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
70681+ transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is
70682+ stack allocated right now, but we would like to allow for dynamically allocated
70683+ transcrashes that span multiple system calls.
70684+ */
70685+ txnh_init(context->trans, TXN_WRITE_FUSING);
70686+}
70687+
70688+/* Finish a transaction handle context. */
70689+int reiser4_txn_end(reiser4_context * context)
70690+{
70691+ long ret = 0;
70692+ txn_handle *txnh;
70693+
70694+ assert("umka-283", context != NULL);
70695+ assert("nikita-3012", reiser4_schedulable());
70696+ assert("vs-24", context == get_current_context());
70697+ assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
70698+
70699+ txnh = context->trans;
70700+ if (txnh != NULL) {
70701+ if (txnh->atom != NULL)
70702+ ret = commit_txnh(txnh);
70703+ assert("jmacd-633", txnh_isclean(txnh));
70704+ context->trans = NULL;
70705+ }
70706+ return ret;
70707+}
70708+
70709+void reiser4_txn_restart(reiser4_context * context)
70710+{
70711+ reiser4_txn_end(context);
70712+ reiser4_preempt_point();
70713+ reiser4_txn_begin(context);
70714+}
70715+
70716+void reiser4_txn_restart_current(void)
70717+{
70718+ reiser4_txn_restart(get_current_context());
70719+}
70720+
70721+/* TXN_ATOM */
70722+
70723+/* Get the atom belonging to a txnh; the txnh must not be locked on entry. Returns with the txnh
70724+ locked, and with the atom locked if it is not NULL. This performs the necessary spin_trylock to
70725+ break the lock-ordering cycle. May return NULL. */
70726+static txn_atom *txnh_get_atom(txn_handle * txnh)
70727+{
70728+ txn_atom *atom;
70729+
70730+ assert("umka-180", txnh != NULL);
70731+ assert_spin_not_locked(&(txnh->hlock));
70732+
70733+ while (1) {
70734+ spin_lock_txnh(txnh);
70735+ atom = txnh->atom;
70736+
70737+ if (atom == NULL)
70738+ break;
70739+
70740+ if (spin_trylock_atom(atom))
70741+ break;
70742+
70743+ atomic_inc(&atom->refcount);
70744+
70745+ spin_unlock_txnh(txnh);
70746+ spin_lock_atom(atom);
70747+ spin_lock_txnh(txnh);
70748+
70749+ if (txnh->atom == atom) {
70750+ atomic_dec(&atom->refcount);
70751+ break;
70752+ }
70753+
70754+ spin_unlock_txnh(txnh);
70755+ atom_dec_and_unlock(atom);
70756+ }
70757+
70758+ return atom;
70759+}
70760+
70761+/* Get the current atom and spinlock it if the current atom is present. May return NULL. */
70762+txn_atom *get_current_atom_locked_nocheck(void)
70763+{
70764+ reiser4_context *cx;
70765+ txn_atom *atom;
70766+ txn_handle *txnh;
70767+
70768+ cx = get_current_context();
70769+ assert("zam-437", cx != NULL);
70770+
70771+ txnh = cx->trans;
70772+ assert("zam-435", txnh != NULL);
70773+
70774+ atom = txnh_get_atom(txnh);
70775+
70776+ spin_unlock_txnh(txnh);
70777+ return atom;
70778+}
70779+
70780+/* Get the atom belonging to a jnode, which is initially locked. Return with
70781+ both jnode and atom locked. This performs the necessary spin_trylock to
70782+ break the lock-ordering cycle. Assumes the jnode is already locked, and
70783+ returns NULL if atom is not set. */
70784+txn_atom *jnode_get_atom(jnode * node)
70785+{
70786+ txn_atom *atom;
70787+
70788+ assert("umka-181", node != NULL);
70789+
70790+ while (1) {
70791+ assert_spin_locked(&(node->guard));
70792+
70793+ atom = node->atom;
70794+ /* node is not in any atom */
70795+ if (atom == NULL)
70796+ break;
70797+
70798+ /* If atom is not locked, grab the lock and return */
70799+ if (spin_trylock_atom(atom))
70800+ break;
70801+
70802+		/* At least one jnode belongs to this atom; that guarantees that
70803+		 * atom->refcount > 0, so we can safely increment the refcount. */
70804+ atomic_inc(&atom->refcount);
70805+ spin_unlock_jnode(node);
70806+
70807+ /* re-acquire spin locks in the right order */
70808+ spin_lock_atom(atom);
70809+ spin_lock_jnode(node);
70810+
70811+ /* check if node still points to the same atom. */
70812+ if (node->atom == atom) {
70813+ atomic_dec(&atom->refcount);
70814+ break;
70815+ }
70816+
70817+ /* releasing of atom lock and reference requires not holding
70818+ * locks on jnodes. */
70819+ spin_unlock_jnode(node);
70820+
70821+		/* We are not sure that this atom has extra references besides
70822+		 * our own, so we should call the proper function, which may
70823+		 * free the atom if the last reference is released. */
70824+ atom_dec_and_unlock(atom);
70825+
70826+ /* lock jnode again for getting valid node->atom pointer
70827+ * value. */
70828+ spin_lock_jnode(node);
70829+ }
70830+
70831+ return atom;
70832+}
70833+
70834+/* Returns true if @node is dirty and part of the same atom as one of its neighbors. Used
70835+ by flush code to indicate whether the next node (in some direction) is suitable for
70836+ flushing. */
70837+int
70838+same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
70839+{
70840+ int compat;
70841+ txn_atom *atom;
70842+
70843+ assert("umka-182", node != NULL);
70844+ assert("umka-183", check != NULL);
70845+
70846+ /* Not sure what this function is supposed to do if supplied with @check that is
70847+ neither formatted nor unformatted (bitmap or so). */
70848+ assert("nikita-2373", jnode_is_znode(check)
70849+ || jnode_is_unformatted(check));
70850+
70851+ /* Need a lock on CHECK to get its atom and to check various state bits.
70852+ Don't need a lock on NODE once we get the atom lock. */
70853+	/* It is not enough to lock two nodes and check (node->atom ==
70854+	   check->atom), because the atom could be locked and being fused at
70855+	   that moment; jnodes of an atom in that state (being fused) can point
70856+	   to different objects, even though the atom is the same. */
70857+ spin_lock_jnode(check);
70858+
70859+ atom = jnode_get_atom(check);
70860+
70861+ if (atom == NULL) {
70862+ compat = 0;
70863+ } else {
70864+ compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
70865+
70866+ if (compat && jnode_is_znode(check)) {
70867+ compat &= znode_is_connected(JZNODE(check));
70868+ }
70869+
70870+ if (compat && alloc_check) {
70871+ compat &= (alloc_value == jnode_is_flushprepped(check));
70872+ }
70873+
70874+ spin_unlock_atom(atom);
70875+ }
70876+
70877+ spin_unlock_jnode(check);
70878+
70879+ return compat;
70880+}
70881+
70882+/* Decrement the atom's reference count and if it falls to zero, free it. */
70883+void atom_dec_and_unlock(txn_atom * atom)
70884+{
70885+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
70886+
70887+ assert("umka-186", atom != NULL);
70888+ assert_spin_locked(&(atom->alock));
70889+ assert("zam-1039", atomic_read(&atom->refcount) > 0);
70890+
70891+ if (atomic_dec_and_test(&atom->refcount)) {
70892+ /* take txnmgr lock and atom lock in proper order. */
70893+ if (!spin_trylock_txnmgr(mgr)) {
70894+ /* This atom should exist after we re-acquire its
70895+ * spinlock, so we increment its reference counter. */
70896+ atomic_inc(&atom->refcount);
70897+ spin_unlock_atom(atom);
70898+ spin_lock_txnmgr(mgr);
70899+ spin_lock_atom(atom);
70900+
70901+ if (!atomic_dec_and_test(&atom->refcount)) {
70902+ spin_unlock_atom(atom);
70903+ spin_unlock_txnmgr(mgr);
70904+ return;
70905+ }
70906+ }
70907+ assert_spin_locked(&(mgr->tmgr_lock));
70908+ atom_free(atom);
70909+ spin_unlock_txnmgr(mgr);
70910+ } else
70911+ spin_unlock_atom(atom);
70912+}
70913+
70914+/* Create new atom and connect it to given transaction handle. This adds the
70915+ atom to the transaction manager's list and sets its reference count to 1, an
70916+ artificial reference which is kept until it commits. We play strange games
70917+ to avoid allocation under jnode & txnh spinlocks.*/
70918+
70919+static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
70920+{
70921+ txn_atom *atom;
70922+ txn_mgr *mgr;
70923+
70924+ if (REISER4_DEBUG && rofs_tree(current_tree)) {
70925+ warning("nikita-3366", "Creating atom on rofs");
70926+ dump_stack();
70927+ }
70928+
70929+ if (*atom_alloc == NULL) {
70930+ (*atom_alloc) = kmem_cache_alloc(_atom_slab,
70931+ reiser4_ctx_gfp_mask_get());
70932+
70933+ if (*atom_alloc == NULL)
70934+ return RETERR(-ENOMEM);
70935+ }
70936+
70937+ /* and, also, txnmgr spin lock should be taken before jnode and txnh
70938+ locks. */
70939+ mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
70940+ spin_lock_txnmgr(mgr);
70941+ spin_lock_txnh(txnh);
70942+
70943+	/* Check whether the new atom is still needed */
70944+ if (txnh->atom != NULL) {
70945+ /* NOTE-NIKITA probably it is rather better to free
70946+ * atom_alloc here than thread it up to reiser4_try_capture() */
70947+
70948+ spin_unlock_txnh(txnh);
70949+ spin_unlock_txnmgr(mgr);
70950+
70951+ return -E_REPEAT;
70952+ }
70953+
70954+ atom = *atom_alloc;
70955+ *atom_alloc = NULL;
70956+
70957+ atom_init(atom);
70958+
70959+ assert("jmacd-17", atom_isclean(atom));
70960+
70961+ /*
70962+ * lock ordering is broken here. It is ok, as long as @atom is new
70963+	 * and inaccessible to others. We can't use spin_lock_atom or
70964+	 * spin_lock(&atom->alock) because they care about locking
70965+	 * dependencies. spin_trylock_atom() doesn't.
70966+ */
70967+ check_me("", spin_trylock_atom(atom));
70968+
70969+ /* add atom to the end of transaction manager's list of atoms */
70970+ list_add_tail(&atom->atom_link, &mgr->atoms_list);
70971+ atom->atom_id = mgr->id_count++;
70972+ mgr->atom_count += 1;
70973+
70974+ /* Release txnmgr lock */
70975+ spin_unlock_txnmgr(mgr);
70976+
70977+ /* One reference until it commits. */
70978+ atomic_inc(&atom->refcount);
70979+ atom->stage = ASTAGE_CAPTURE_FUSE;
70980+ atom->super = reiser4_get_current_sb();
70981+ capture_assign_txnh_nolock(atom, txnh);
70982+
70983+ spin_unlock_atom(atom);
70984+ spin_unlock_txnh(txnh);
70985+
70986+ return -E_REPEAT;
70987+}
70988+
70989+/* Return true if an atom is currently "open". */
70990+static int atom_isopen(const txn_atom * atom)
70991+{
70992+ assert("umka-185", atom != NULL);
70993+
70994+ return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
70995+}
70996+
70997+/* Return the number of pointers to this atom that must be updated during fusion. This
70998+ approximates the amount of work to be done. Fusion chooses the atom with fewer
70999+ pointers to fuse into the atom with more pointers. */
71000+static int atom_pointer_count(const txn_atom * atom)
71001+{
71002+ assert("umka-187", atom != NULL);
71003+
71004+ /* This is a measure of the amount of work needed to fuse this atom
71005+ * into another. */
71006+ return atom->txnh_count + atom->capture_count;
71007+}
71008+
71009+/* Called holding the atom lock, this removes the atom from the transaction manager list
71010+ and frees it. */
71011+static void atom_free(txn_atom * atom)
71012+{
71013+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
71014+
71015+ assert("umka-188", atom != NULL);
71016+ assert_spin_locked(&(atom->alock));
71017+
71018+ /* Remove from the txn_mgr's atom list */
71019+ assert_spin_locked(&(mgr->tmgr_lock));
71020+ mgr->atom_count -= 1;
71021+ list_del_init(&atom->atom_link);
71022+
71023+ /* Clean the atom */
71024+ assert("jmacd-16",
71025+ (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
71026+ atom->stage = ASTAGE_FREE;
71027+
71028+ blocknr_set_destroy(&atom->delete_set);
71029+ blocknr_set_destroy(&atom->wandered_map);
71030+
71031+ assert("jmacd-16", atom_isclean(atom));
71032+
71033+ spin_unlock_atom(atom);
71034+
71035+ kmem_cache_free(_atom_slab, atom);
71036+}
71037+
71038+static int atom_is_dotard(const txn_atom * atom)
71039+{
71040+ return time_after(jiffies, atom->start_time +
71041+ get_current_super_private()->tmgr.atom_max_age);
71042+}
71043+
71044+static int atom_can_be_committed(txn_atom * atom)
71045+{
71046+ assert_spin_locked(&(atom->alock));
71047+ assert("zam-885", atom->txnh_count > atom->nr_waiters);
71048+ return atom->txnh_count == atom->nr_waiters + 1;
71049+}
71050+
71051+/* Return true if an atom should commit now. This is determined by aging, atom
71052+ size or atom flags. */
71053+static int atom_should_commit(const txn_atom * atom)
71054+{
71055+ assert("umka-189", atom != NULL);
71056+ return
71057+ (atom->flags & ATOM_FORCE_COMMIT) ||
71058+ ((unsigned)atom_pointer_count(atom) >
71059+ get_current_super_private()->tmgr.atom_max_size)
71060+ || atom_is_dotard(atom);
71061+}
71062+
71063+/* return 1 if current atom exists and requires commit. */
71064+int current_atom_should_commit(void)
71065+{
71066+ txn_atom *atom;
71067+ int result = 0;
71068+
71069+ atom = get_current_atom_locked_nocheck();
71070+ if (atom) {
71071+ result = atom_should_commit(atom);
71072+ spin_unlock_atom(atom);
71073+ }
71074+ return result;
71075+}
71076+
71077+static int atom_should_commit_asap(const txn_atom * atom)
71078+{
71079+ unsigned int captured;
71080+ unsigned int pinnedpages;
71081+
71082+ assert("nikita-3309", atom != NULL);
71083+
71084+ captured = (unsigned)atom->capture_count;
71085+ pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
71086+
71087+ return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100);
71088+}
71089+
71090+static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
71091+{
71092+ jnode *first_dirty;
71093+
71094+ list_for_each_entry(first_dirty, head, capture_link) {
71095+ if (!(flags & JNODE_FLUSH_COMMIT)) {
71096+ /*
71097+			 * skip jnodes which "heard banshee" or have active
71098+			 * I/O
71099+ */
71100+ if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
71101+ JF_ISSET(first_dirty, JNODE_WRITEBACK))
71102+ continue;
71103+ }
71104+ return first_dirty;
71105+ }
71106+ return NULL;
71107+}
71108+
71109+/* Get the first dirty node from the atom's dirty_nodes[n] lists; return NULL if the atom has no
71110+ dirty nodes on its lists */
71111+jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
71112+{
71113+ jnode *first_dirty;
71114+ tree_level level;
71115+
71116+ assert_spin_locked(&(atom->alock));
71117+
71118+ /* The flush starts from LEAF_LEVEL (=1). */
71119+ for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
71120+ if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
71121+ continue;
71122+
71123+ first_dirty =
71124+ find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
71125+ flags);
71126+ if (first_dirty)
71127+ return first_dirty;
71128+ }
71129+
71130+ /* znode-above-root is on the list #0. */
71131+ return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
71132+}
71133+
71134+static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
71135+{
71136+ jnode *cur;
71137+
71138+ assert("zam-905", atom_is_protected(atom));
71139+
71140+ cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
71141+ while (ATOM_WB_LIST(atom) != &cur->capture_link) {
71142+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
71143+
71144+ spin_lock_jnode(cur);
71145+ if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
71146+ if (JF_ISSET(cur, JNODE_DIRTY)) {
71147+ queue_jnode(fq, cur);
71148+ } else {
71149+ /* move jnode to atom's clean list */
71150+ list_move_tail(&cur->capture_link,
71151+ ATOM_CLEAN_LIST(atom));
71152+ }
71153+ }
71154+ spin_unlock_jnode(cur);
71155+
71156+ cur = next;
71157+ }
71158+}
71159+
71160+/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback
71161+ * jnodes to disk. */
71162+static int submit_wb_list(void)
71163+{
71164+ int ret;
71165+ flush_queue_t *fq;
71166+
71167+ fq = get_fq_for_current_atom();
71168+ if (IS_ERR(fq))
71169+ return PTR_ERR(fq);
71170+
71171+ dispatch_wb_list(fq->atom, fq);
71172+ spin_unlock_atom(fq->atom);
71173+
71174+ ret = reiser4_write_fq(fq, NULL, 1);
71175+ reiser4_fq_put(fq);
71176+
71177+ return ret;
71178+}
71179+
71180+/* Wait completion of all writes, re-submit atom writeback list if needed. */
71181+static int current_atom_complete_writes(void)
71182+{
71183+ int ret;
71184+
71185+	/* Each jnode on that list was modified and dirtied while it already
71186+	 * had an i/o request running. After i/o completion we have to
71187+	 * resubmit them to disk. */
71188+ ret = submit_wb_list();
71189+ if (ret < 0)
71190+ return ret;
71191+
71192+	/* Wait for all i/o to complete */
71193+ ret = current_atom_finish_all_fq();
71194+ if (ret)
71195+ return ret;
71196+
71197+	/* Scan the wb list again; all i/o should be completed, so we re-submit
71198+	 * dirty nodes to disk */
71199+ ret = submit_wb_list();
71200+ if (ret < 0)
71201+ return ret;
71202+
71203+	/* Wait for all nodes we just submitted */
71204+ return current_atom_finish_all_fq();
71205+}
71206+
71207+#if REISER4_DEBUG
71208+
71209+static void reiser4_info_atom(const char *prefix, const txn_atom * atom)
71210+{
71211+ if (atom == NULL) {
71212+ printk("%s: no atom\n", prefix);
71213+ return;
71214+ }
71215+
71216+ printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
71217+ " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
71218+ atomic_read(&atom->refcount), atom->atom_id, atom->flags,
71219+ atom->txnh_count, atom->capture_count, atom->stage,
71220+ atom->start_time, atom->flushed);
71221+}
71222+
71223+#else /* REISER4_DEBUG */
71224+
71225+static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {}
71226+
71227+#endif /* REISER4_DEBUG */
71228+
71229+#define TOOMANYFLUSHES (1 << 13)
71230+
71231+/* Called with the atom locked and no open "active" transaction handles except
71232+ ours, this function calls flush_current_atom() until all dirty nodes are
71233+ processed. Then it initiates commit processing.
71234+
71235+ Called by the single remaining open "active" txnh, which is closing. Other
71236+ open txnhs belong to processes which wait for atom commit in the
71237+ commit_txnh() routine; they are counted as "waiters" in atom->nr_waiters. As
71238+ long as we hold the atom lock none of the jnodes can be captured and/or
71239+ locked.
71240+
71241+ Return value is an error code if commit fails.
71242+*/
71243+static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
71244+{
71245+ reiser4_super_info_data *sbinfo = get_current_super_private();
71246+ long ret = 0;
71247+ /* how many times jnode_flush() was called as a part of attempt to
71248+ * commit this atom. */
71249+ int flushiters;
71250+
71251+ assert("zam-888", atom != NULL && *atom != NULL);
71252+ assert_spin_locked(&((*atom)->alock));
71253+ assert("zam-887", get_current_context()->trans->atom == *atom);
71254+ assert("jmacd-151", atom_isopen(*atom));
71255+
71256+ assert("nikita-3184",
71257+ get_current_super_private()->delete_mutex_owner != current);
71258+
71259+ for (flushiters = 0;; ++flushiters) {
71260+ ret =
71261+ flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
71262+ JNODE_FLUSH_COMMIT,
71263+ LONG_MAX /* nr_to_write */ ,
71264+ nr_submitted, atom, NULL);
71265+ if (ret != -E_REPEAT)
71266+ break;
71267+
71268+ /* if atom's dirty list contains one znode which is
71269+ HEARD_BANSHEE and is locked we have to allow lock owner to
71270+ continue and uncapture that znode */
71271+ reiser4_preempt_point();
71272+
71273+ *atom = get_current_atom_locked();
71274+ if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
71275+ warning("nikita-3176",
71276+ "Flushing like mad: %i", flushiters);
71277+ reiser4_info_atom("atom", *atom);
71278+ DEBUGON(flushiters > (1 << 20));
71279+ }
71280+ }
71281+
71282+ if (ret)
71283+ return ret;
71284+
71285+ assert_spin_locked(&((*atom)->alock));
71286+
71287+ if (!atom_can_be_committed(*atom)) {
71288+ spin_unlock_atom(*atom);
71289+ return RETERR(-E_REPEAT);
71290+ }
71291+
71292+ if ((*atom)->capture_count == 0)
71293+ goto done;
71294+
71295+	/* Up to this point we have been flushing, and after each flush call we
71296+	   returned -E_REPEAT. Now we can commit. We cannot return -E_REPEAT
71297+	   at this point; the commit should be successful. */
71298+ reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
71299+ ON_DEBUG(((*atom)->committer = current));
71300+ spin_unlock_atom(*atom);
71301+
71302+ ret = current_atom_complete_writes();
71303+ if (ret)
71304+ return ret;
71305+
71306+ assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
71307+
71308+	/* isolate the critical code path, which must be executed by only
71309+	 * one thread at a time, using the tmgr commit mutex */
71310+ mutex_lock(&sbinfo->tmgr.commit_mutex);
71311+
71312+ ret = reiser4_write_logs(nr_submitted);
71313+ if (ret < 0)
71314+ reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
71315+
71316+	/* The atom->ovrwr_nodes list is processed with the commit mutex held
71317+	   because of bitmap nodes, which are captured in a special way in
71318+	   reiser4_pre_commit_hook_bitmap(): that path does not go through
71319+	   capture_fuse_wait() as the capturing of other nodes does -- the
71320+	   commit mutex is used for transaction isolation instead. */
71321+ reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom));
71322+ mutex_unlock(&sbinfo->tmgr.commit_mutex);
71323+
71324+ reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom));
71325+ reiser4_invalidate_list(ATOM_WB_LIST(*atom));
71326+ assert("zam-927", list_empty(&(*atom)->inodes));
71327+
71328+ spin_lock_atom(*atom);
71329+ done:
71330+ reiser4_atom_set_stage(*atom, ASTAGE_DONE);
71331+ ON_DEBUG((*atom)->committer = NULL);
71332+
71333+ /* Atom's state changes, so wake up everybody waiting for this
71334+ event. */
71335+ wakeup_atom_waiting_list(*atom);
71336+
71337+	/* Decrement the "until commit" reference; at least one txnh (the
71338+	   caller) is still open. */
71339+ atomic_dec(&(*atom)->refcount);
71340+
71341+ assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
71342+ assert("jmacd-1062", (*atom)->capture_count == 0);
71343+ BUG_ON((*atom)->capture_count != 0);
71344+ assert_spin_locked(&((*atom)->alock));
71345+
71346+ return ret;
71347+}
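+
+/* A condensed sketch of the commit path above (illustration only; the
+ * authoritative sequence is the function body itself):
+ *
+ *	do
+ *		ret = flush_current_atom(..., atom, NULL);
+ *	while (ret == -E_REPEAT);
+ *	reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
+ *	current_atom_complete_writes();
+ *	mutex_lock(&sbinfo->tmgr.commit_mutex);
+ *	reiser4_write_logs(nr_submitted);
+ *	reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom));
+ *	mutex_unlock(&sbinfo->tmgr.commit_mutex);
+ *	reiser4_atom_set_stage(*atom, ASTAGE_DONE);
+ *	wakeup_atom_waiting_list(*atom);
+ */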
71348+
71349+/* TXN_TXNH */
71350+
71351+/**
71352+ * force_commit_atom - commit current atom and wait for commit completion
71353+ * @txnh: transaction handle whose atom is to be committed
71354+ *
71355+ * Commits the current atom and waits for commit completion; the current
71356+ * atom and @txnh must be spinlocked before the call, and are unlocked on exit.
71357+ */
71358+int force_commit_atom(txn_handle *txnh)
71359+{
71360+ txn_atom *atom;
71361+
71362+ assert("zam-837", txnh != NULL);
71363+ assert_spin_locked(&(txnh->hlock));
71364+ assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
71365+
71366+ atom = txnh->atom;
71367+
71368+ assert("zam-834", atom != NULL);
71369+ assert_spin_locked(&(atom->alock));
71370+
71371+ /*
71372+ * Set flags for atom and txnh: forcing atom commit and waiting for
71373+ * commit completion
71374+ */
71375+ txnh->flags |= TXNH_WAIT_COMMIT;
71376+ atom->flags |= ATOM_FORCE_COMMIT;
71377+
71378+ spin_unlock_txnh(txnh);
71379+ spin_unlock_atom(atom);
71380+
71381+ /* commit is here */
71382+ reiser4_txn_restart_current();
71383+ return 0;
71384+}
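+
+/* A minimal caller sketch for force_commit_atom() (the same pattern that
+ * reiser4_sync_atom() uses below); the atom is assumed to be already
+ * spinlocked:
+ *
+ *	spin_lock_txnh(txnh);
+ *	capture_assign_txnh_nolock(atom, txnh);
+ *	ret = force_commit_atom(txnh);	(unlocks txnh and atom)
+ */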
71385+
71386+/* Called to force commit of any outstanding atoms. @commit_all_atoms controls
71387+ * whether we commit all atoms, including new ones created after this
71388+ * function is called. */
71389+int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
71390+{
71391+ int ret;
71392+ txn_atom *atom;
71393+ txn_mgr *mgr;
71394+ txn_handle *txnh;
71395+ unsigned long start_time = jiffies;
71396+ reiser4_context *ctx = get_current_context();
71397+
71398+ assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
71399+ assert("nikita-3058", reiser4_commit_check_locks());
71400+
71401+ reiser4_txn_restart_current();
71402+
71403+ mgr = &get_super_private(super)->tmgr;
71404+
71405+ txnh = ctx->trans;
71406+
71407+ again:
71408+
71409+ spin_lock_txnmgr(mgr);
71410+
71411+ list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
71412+ spin_lock_atom(atom);
71413+
71414+		/* Commit any atom which can be committed. If @commit_all_atoms
71415+		 * is not set, we commit only atoms which were created before
71416+		 * this call was started. */
71417+ if (commit_all_atoms
71418+ || time_before_eq(atom->start_time, start_time)) {
71419+ if (atom->stage <= ASTAGE_POST_COMMIT) {
71420+ spin_unlock_txnmgr(mgr);
71421+
71422+ if (atom->stage < ASTAGE_PRE_COMMIT) {
71423+ spin_lock_txnh(txnh);
71424+ /* Add force-context txnh */
71425+ capture_assign_txnh_nolock(atom, txnh);
71426+ ret = force_commit_atom(txnh);
71427+ if (ret)
71428+ return ret;
71429+ } else
71430+					/* wait for atom commit */
71431+ reiser4_atom_wait_event(atom);
71432+
71433+ goto again;
71434+ }
71435+ }
71436+
71437+ spin_unlock_atom(atom);
71438+ }
71439+
71440+#if REISER4_DEBUG
71441+ if (commit_all_atoms) {
71442+ reiser4_super_info_data *sbinfo = get_super_private(super);
71443+ spin_lock_reiser4_super(sbinfo);
71444+ assert("zam-813",
71445+ sbinfo->blocks_fake_allocated_unformatted == 0);
71446+ assert("zam-812", sbinfo->blocks_fake_allocated == 0);
71447+ spin_unlock_reiser4_super(sbinfo);
71448+ }
71449+#endif
71450+
71451+ spin_unlock_txnmgr(mgr);
71452+
71453+ return 0;
71454+}
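+
+/* Restart discipline in txnmgr_force_commit_all(), condensed: whenever the
+ * txnmgr lock is dropped to commit or to wait on one atom, the atoms list may
+ * change under us, so the scan restarts from "again:". The loop makes forward
+ * progress because an atom's stage only advances (see reiser4_atom_set_stage()
+ * below). */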
71455+
71456+/* check whether commit_some_atoms() can commit @atom. Locking is up to the
71457+ * caller */
71458+static int atom_is_committable(txn_atom * atom)
71459+{
71460+ return
71461+ atom->stage < ASTAGE_PRE_COMMIT &&
71462+ atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
71463+}
71464+
71465+/* called periodically from ktxnmgrd to commit old atoms. Releases the ktxnmgrd
71466+ * spin lock on exit */
71467+int commit_some_atoms(txn_mgr * mgr)
71468+{
71469+ int ret = 0;
71470+ txn_atom *atom;
71471+ txn_handle *txnh;
71472+ reiser4_context *ctx;
71473+ struct list_head *pos, *tmp;
71474+
71475+ ctx = get_current_context();
71476+ assert("nikita-2444", ctx != NULL);
71477+
71478+ txnh = ctx->trans;
71479+ spin_lock_txnmgr(mgr);
71480+
71481+ /*
71482+	 * this is to avoid a gcc warning that atom might be used
71483+ * uninitialized
71484+ */
71485+ atom = NULL;
71486+
71487+ /* look for atom to commit */
71488+ list_for_each_safe(pos, tmp, &mgr->atoms_list) {
71489+ atom = list_entry(pos, txn_atom, atom_link);
71490+ /*
71491+ * first test without taking atom spin lock, whether it is
71492+ * eligible for committing at all
71493+ */
71494+ if (atom_is_committable(atom)) {
71495+ /* now, take spin lock and re-check */
71496+ spin_lock_atom(atom);
71497+ if (atom_is_committable(atom))
71498+ break;
71499+ spin_unlock_atom(atom);
71500+ }
71501+ }
71502+
71503+ ret = (&mgr->atoms_list == pos);
71504+ spin_unlock_txnmgr(mgr);
71505+
71506+ if (ret) {
71507+ /* nothing found */
71508+ spin_unlock(&mgr->daemon->guard);
71509+ return 0;
71510+ }
71511+
71512+ spin_lock_txnh(txnh);
71513+
71514+ BUG_ON(atom == NULL);
71515+ /* Set the atom to force committing */
71516+ atom->flags |= ATOM_FORCE_COMMIT;
71517+
71518+ /* Add force-context txnh */
71519+ capture_assign_txnh_nolock(atom, txnh);
71520+
71521+ spin_unlock_txnh(txnh);
71522+ spin_unlock_atom(atom);
71523+
71524+	/* we are about to release the daemon spin lock; notify the
71525+	   daemon that it has to rescan atoms */
71526+ mgr->daemon->rescan = 1;
71527+ spin_unlock(&mgr->daemon->guard);
71528+ reiser4_txn_restart_current();
71529+ return 0;
71530+}
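+
+/* The hand-off with ktxnmgrd, condensed: the daemon calls commit_some_atoms()
+ * with mgr->daemon->guard held. Either no committable atom is found (the
+ * guard is dropped and we return), or one atom is marked for commit and the
+ * actual work is driven by closing our handle:
+ *
+ *	atom->flags |= ATOM_FORCE_COMMIT;
+ *	capture_assign_txnh_nolock(atom, txnh);
+ *	mgr->daemon->rescan = 1;	(daemon re-examines the list later)
+ *	reiser4_txn_restart_current();	(this performs the commit)
+ */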
71531+
71532+static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
71533+{
71534+ int atom_stage;
71535+ txn_atom *atom_2;
71536+ int repeat;
71537+
71538+ assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
71539+
71540+ atom_stage = atom->stage;
71541+ repeat = 0;
71542+
71543+ if (!spin_trylock_txnmgr(tmgr)) {
71544+ atomic_inc(&atom->refcount);
71545+ spin_unlock_atom(atom);
71546+ spin_lock_txnmgr(tmgr);
71547+ spin_lock_atom(atom);
71548+ repeat = 1;
71549+ if (atom->stage != atom_stage) {
71550+ spin_unlock_txnmgr(tmgr);
71551+ atom_dec_and_unlock(atom);
71552+ return -E_REPEAT;
71553+ }
71554+ atomic_dec(&atom->refcount);
71555+ }
71556+
71557+ list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
71558+ if (atom == atom_2)
71559+ continue;
71560+ /*
71561+ * if trylock does not succeed we just do not fuse with that
71562+ * atom.
71563+ */
71564+ if (spin_trylock_atom(atom_2)) {
71565+ if (atom_2->stage < ASTAGE_PRE_COMMIT) {
71566+ spin_unlock_txnmgr(tmgr);
71567+ capture_fuse_into(atom_2, atom);
71568+				/* all locks are lost; we can only repeat here */
71569+ return -E_REPEAT;
71570+ }
71571+ spin_unlock_atom(atom_2);
71572+ }
71573+ }
71574+ atom->flags |= ATOM_CANCEL_FUSION;
71575+ spin_unlock_txnmgr(tmgr);
71576+ if (repeat) {
71577+ spin_unlock_atom(atom);
71578+ return -E_REPEAT;
71579+ }
71580+ return 0;
71581+}
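+
+/* Lock-ordering note for txn_try_to_fuse_small_atom(): the txnmgr lock nests
+ * outside the atom lock, so when the trylock fails we pin the atom with a
+ * reference, drop its lock, and re-take both locks in the proper order:
+ *
+ *	atomic_inc(&atom->refcount);
+ *	spin_unlock_atom(atom);
+ *	spin_lock_txnmgr(tmgr);
+ *	spin_lock_atom(atom);
+ *	(bail out with -E_REPEAT if atom->stage changed meanwhile)
+ */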
71582+
71583+/* Calls jnode_flush() for the current atom if it exists; if not, just takes
71584+   another atom and calls jnode_flush() for it. If the current transaction
71585+   handle already has an assigned atom (the current atom), we have to close
71586+   the current transaction prior to switching to another atom or otherwise
71587+   act on the current atom. This code tries to flush the current atom.
71588+
71589+   flush_some_atom() is called as part of the memory reclaim process. It is
71590+   invoked from balance_dirty_pages(), pdflushd, and entd.
71591+
71592+   If we can flush no nodes, the atom is committed, because this frees memory.
71593+
71594+   If the atom is too large or too old, it is committed as well.
71595+*/
71596+int
71597+flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
71598+ int flags)
71599+{
71600+ reiser4_context *ctx = get_current_context();
71601+ txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
71602+ txn_handle *txnh = ctx->trans;
71603+ txn_atom *atom;
71604+ int ret;
71605+
71606+ BUG_ON(wbc->nr_to_write == 0);
71607+ BUG_ON(*nr_submitted != 0);
71608+ assert("zam-1042", txnh != NULL);
71609+ repeat:
71610+ if (txnh->atom == NULL) {
71611+ /* current atom is not available, take first from txnmgr */
71612+ spin_lock_txnmgr(tmgr);
71613+
71614+ /* traverse the list of all atoms */
71615+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
71616+ /* lock atom before checking its state */
71617+ spin_lock_atom(atom);
71618+
71619+ /*
71620+ * we need an atom which is not being committed and
71621+			 * which has no flushers (jnode_flush() adds one flusher
71622+			 * at the beginning and subtracts one at the end).
71623+ */
71624+ if (atom->stage < ASTAGE_PRE_COMMIT &&
71625+ atom->nr_flushers == 0) {
71626+ spin_lock_txnh(txnh);
71627+ capture_assign_txnh_nolock(atom, txnh);
71628+ spin_unlock_txnh(txnh);
71629+
71630+ goto found;
71631+ }
71632+
71633+ spin_unlock_atom(atom);
71634+ }
71635+
71636+ /*
71637+		 * Write throttling: this is the case when no atom can be
71638+		 * flushed or committed.
71639+ */
71640+ if (!current_is_pdflush() && !wbc->nonblocking) {
71641+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
71642+ spin_lock_atom(atom);
71643+				/* Repeat the check from above. */
71644+ if (atom->stage < ASTAGE_PRE_COMMIT
71645+ && atom->nr_flushers == 0) {
71646+ spin_lock_txnh(txnh);
71647+ capture_assign_txnh_nolock(atom, txnh);
71648+ spin_unlock_txnh(txnh);
71649+
71650+ goto found;
71651+ }
71652+ if (atom->stage <= ASTAGE_POST_COMMIT) {
71653+ spin_unlock_txnmgr(tmgr);
71654+ /*
71655+					 * we just wait until the atom's flusher
71656+					 * makes progress in flushing or
71657+ * committing the atom
71658+ */
71659+ reiser4_atom_wait_event(atom);
71660+ goto repeat;
71661+ }
71662+ spin_unlock_atom(atom);
71663+ }
71664+ }
71665+ spin_unlock_txnmgr(tmgr);
71666+ return 0;
71667+ found:
71668+ spin_unlock_txnmgr(tmgr);
71669+ } else
71670+ atom = get_current_atom_locked();
71671+
71672+ BUG_ON(atom->super != ctx->super);
71673+ assert("vs-35", atom->super == ctx->super);
71674+ if (start) {
71675+ spin_lock_jnode(start);
71676+ ret = (atom == start->atom) ? 1 : 0;
71677+ spin_unlock_jnode(start);
71678+ if (ret == 0)
71679+ start = NULL;
71680+ }
71681+ ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
71682+ if (ret == 0) {
71683+		/* flush_current_atom() returns 0 only if it submitted nothing
71684+		   for write */
71685+ BUG_ON(*nr_submitted != 0);
71686+ if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
71687+ if (atom->capture_count < tmgr->atom_min_size &&
71688+ !(atom->flags & ATOM_CANCEL_FUSION)) {
71689+ ret = txn_try_to_fuse_small_atom(tmgr, atom);
71690+ if (ret == -E_REPEAT) {
71691+ reiser4_preempt_point();
71692+ goto repeat;
71693+ }
71694+ }
71695+			/* if early flushing could not make more nodes clean,
71696+			 * or the atom is too old or too large,
71697+			 * we force the current atom to commit */
71698+			/* wait for commit completion, but only if this
71699+			 * wouldn't stall pdflushd and the ent thread. */
71700+ if (!wbc->nonblocking && !ctx->entd)
71701+ txnh->flags |= TXNH_WAIT_COMMIT;
71702+ atom->flags |= ATOM_FORCE_COMMIT;
71703+ }
71704+ spin_unlock_atom(atom);
71705+ } else if (ret == -E_REPEAT) {
71706+ if (*nr_submitted == 0) {
71707+			/* let others who hamper flushing (by holding long-term
71708+			   locks, for instance) free the way for flush */
71709+ reiser4_preempt_point();
71710+ goto repeat;
71711+ }
71712+ ret = 0;
71713+ }
71714+/*
71715+ if (*nr_submitted > wbc->nr_to_write)
71716+ warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
71717+*/
71718+ reiser4_txn_restart(ctx);
71719+
71720+ return ret;
71721+}
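+
+/* A sketch of how a writeback path might call flush_some_atom(); the real
+ * callers (balance_dirty_pages(), pdflush, entd) live elsewhere, so the
+ * values below are only illustrative. The contract is that wbc->nr_to_write
+ * is non-zero and *nr_submitted starts at zero:
+ *
+ *	long nr_submitted = 0;
+ *	struct writeback_control wbc = { .nr_to_write = 32, };
+ *
+ *	ret = flush_some_atom(NULL, &nr_submitted, &wbc,
+ *			      JNODE_FLUSH_WRITE_BLOCKS);
+ *	(on return, nr_submitted is the number of nodes sent to disk)
+ */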
71722+
71723+/* Remove processed nodes from the given atom list (thereby removing them from the transaction). */
71724+void reiser4_invalidate_list(struct list_head *head)
71725+{
71726+ while (!list_empty(head)) {
71727+ jnode *node;
71728+
71729+ node = list_entry(head->next, jnode, capture_link);
71730+ spin_lock_jnode(node);
71731+ reiser4_uncapture_block(node);
71732+ jput(node);
71733+ }
71734+}
71735+
71736+static void init_wlinks(txn_wait_links * wlinks)
71737+{
71738+ wlinks->_lock_stack = get_current_lock_stack();
71739+ INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
71740+ INIT_LIST_HEAD(&wlinks->_fwaiting_link);
71741+ wlinks->waitfor_cb = NULL;
71742+ wlinks->waiting_cb = NULL;
71743+}
71744+
71745+/* Add the current thread to the atom's waitfor list and wait for somebody to wake us up. */
71746+void reiser4_atom_wait_event(txn_atom * atom)
71747+{
71748+ txn_wait_links _wlinks;
71749+
71750+ assert_spin_locked(&(atom->alock));
71751+ assert("nikita-3156",
71752+ lock_stack_isclean(get_current_lock_stack()) ||
71753+ atom->nr_running_queues > 0);
71754+
71755+ init_wlinks(&_wlinks);
71756+ list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
71757+ atomic_inc(&atom->refcount);
71758+ spin_unlock_atom(atom);
71759+
71760+ reiser4_prepare_to_sleep(_wlinks._lock_stack);
71761+ reiser4_go_to_sleep(_wlinks._lock_stack);
71762+
71763+ spin_lock_atom(atom);
71764+ list_del(&_wlinks._fwaitfor_link);
71765+ atom_dec_and_unlock(atom);
71766+}
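+
+/* The wait/wake pairing, condensed: a waiter queues its wait links on
+ * atom->fwaitfor_list and sleeps with the atom unlocked; a state change in
+ * reiser4_atom_set_stage() below calls reiser4_atom_send_event(), which wakes
+ * the whole fwaitfor list, so waiters must re-check their condition:
+ *
+ *	waiter:				committer:
+ *	spin_lock_atom(atom);
+ *	reiser4_atom_wait_event(atom);	reiser4_atom_set_stage(atom, ...);
+ *	(re-check condition, repeat)	  -> reiser4_atom_send_event(atom)
+ */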
71767+
71768+void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage)
71769+{
71770+ assert("nikita-3535", atom != NULL);
71771+ assert_spin_locked(&(atom->alock));
71772+ assert("nikita-3536", stage <= ASTAGE_INVALID);
71773+ /* Excelsior! */
71774+ assert("nikita-3537", stage >= atom->stage);
71775+ if (atom->stage != stage) {
71776+ atom->stage = stage;
71777+ reiser4_atom_send_event(atom);
71778+ }
71779+}
71780+
71781+/* wake all threads which wait for an event */
71782+void reiser4_atom_send_event(txn_atom * atom)
71783+{
71784+ assert_spin_locked(&(atom->alock));
71785+ wakeup_atom_waitfor_list(atom);
71786+}
71787+
71788+/* Informs the txn manager code that the owner of this txn_handle should wait
71789+   for atom commit completion (for example, because it does fsync(2)) */
71790+static int should_wait_commit(txn_handle * h)
71791+{
71792+ return h->flags & TXNH_WAIT_COMMIT;
71793+}
71794+
71795+typedef struct commit_data {
71796+ txn_atom *atom;
71797+ txn_handle *txnh;
71798+ long nr_written;
71799+	/* as an optimization we start committing the atom by first trying to
71800+	 * flush it a few times without switching into ASTAGE_CAPTURE_WAIT.
71801+	 * This reduces stalls due to other threads waiting for the atom in
71802+	 * the ASTAGE_CAPTURE_WAIT stage. ->preflush is a counter of these
71803+	 * preliminary flushes. */
71804+	int preflush;
71805+	/* have we waited on the atom? */
71806+ int wait;
71807+ int failed;
71808+ int wake_ktxnmgrd_up;
71809+} commit_data;
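+
+/* Initialization pattern for commit_data (this is what commit_txnh() below
+ * does): zero the structure, attach the handle, and allow a handful of
+ * preliminary flushes before escalating to ASTAGE_CAPTURE_WAIT:
+ *
+ *	memset(&cd, 0, sizeof cd);
+ *	cd.txnh = txnh;
+ *	cd.preflush = 10;
+ */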
71810+
71811+/*
71812+ * Called from commit_txnh() repeatedly, until either an error happens or
71813+ * the atom commits successfully.
71814+ */
71815+static int try_commit_txnh(commit_data * cd)
71816+{
71817+ int result;
71818+
71819+ assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
71820+
71821+ /* Get the atom and txnh locked. */
71822+ cd->atom = txnh_get_atom(cd->txnh);
71823+ assert("jmacd-309", cd->atom != NULL);
71824+ spin_unlock_txnh(cd->txnh);
71825+
71826+ if (cd->wait) {
71827+ cd->atom->nr_waiters--;
71828+ cd->wait = 0;
71829+ }
71830+
71831+ if (cd->atom->stage == ASTAGE_DONE)
71832+ return 0;
71833+
71834+ if (cd->failed)
71835+ return 0;
71836+
71837+ if (atom_should_commit(cd->atom)) {
71838+ /* if atom is _very_ large schedule it for commit as soon as
71839+ * possible. */
71840+ if (atom_should_commit_asap(cd->atom)) {
71841+			/*
71842+			 * When the atom is in PRE_COMMIT or a later stage, the
71843+			 * following invariant (encoded in atom_can_be_committed())
71844+			 * holds: there is exactly one non-waiter transaction
71845+			 * handle opened on this atom. When a thread wants to
71846+			 * wait until the atom commits (for example sync()), it
71847+			 * waits on the atom event after increasing
71848+			 * atom->nr_waiters (see below in this function). It
71849+			 * cannot be guaranteed that the atom has already
71850+			 * committed after receiving the event, so the loop has
71851+			 * to be restarted. But if the atom switched into the
71852+			 * PRE_COMMIT stage and became too large, we cannot change
71853+			 * its state back to CAPTURE_WAIT (the atom stage can only
71854+			 * increase monotonically), hence this check.
71855+			 */
71856+ if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
71857+ reiser4_atom_set_stage(cd->atom,
71858+ ASTAGE_CAPTURE_WAIT);
71859+ cd->atom->flags |= ATOM_FORCE_COMMIT;
71860+ }
71861+ if (cd->txnh->flags & TXNH_DONT_COMMIT) {
71862+ /*
71863+ * this thread (transaction handle that is) doesn't
71864+ * want to commit atom. Notify waiters that handle is
71865+ * closed. This can happen, for example, when we are
71866+ * under VFS directory lock and don't want to commit
71867+ * atom right now to avoid stalling other threads
71868+ * working in the same directory.
71869+ */
71870+
71871+ /* Wake the ktxnmgrd up if the ktxnmgrd is needed to
71872+ * commit this atom: no atom waiters and only one
71873+ * (our) open transaction handle. */
71874+ cd->wake_ktxnmgrd_up =
71875+ cd->atom->txnh_count == 1 &&
71876+ cd->atom->nr_waiters == 0;
71877+ reiser4_atom_send_event(cd->atom);
71878+ result = 0;
71879+ } else if (!atom_can_be_committed(cd->atom)) {
71880+ if (should_wait_commit(cd->txnh)) {
71881+ /* sync(): wait for commit */
71882+ cd->atom->nr_waiters++;
71883+ cd->wait = 1;
71884+ reiser4_atom_wait_event(cd->atom);
71885+ result = RETERR(-E_REPEAT);
71886+ } else {
71887+ result = 0;
71888+ }
71889+ } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
71890+ /*
71891+ * optimization: flush atom without switching it into
71892+ * ASTAGE_CAPTURE_WAIT.
71893+ *
71894+ * But don't do this for ktxnmgrd, because ktxnmgrd
71895+ * should never block on atom fusion.
71896+ */
71897+ result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
71898+ LONG_MAX, &cd->nr_written,
71899+ &cd->atom, NULL);
71900+ if (result == 0) {
71901+ spin_unlock_atom(cd->atom);
71902+ cd->preflush = 0;
71903+ result = RETERR(-E_REPEAT);
71904+			} else	/* Atom wasn't flushed
71905+ * completely. Rinse. Repeat. */
71906+ --cd->preflush;
71907+ } else {
71908+			/* We change the atom state to ASTAGE_CAPTURE_WAIT to
71909+			   prevent atom fusion and count ourselves as an active
71910+ flusher */
71911+ reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
71912+ cd->atom->flags |= ATOM_FORCE_COMMIT;
71913+
71914+ result =
71915+ commit_current_atom(&cd->nr_written, &cd->atom);
71916+ if (result != 0 && result != -E_REPEAT)
71917+ cd->failed = 1;
71918+ }
71919+ } else
71920+ result = 0;
71921+
71922+#if REISER4_DEBUG
71923+ if (result == 0)
71924+ assert_spin_locked(&(cd->atom->alock));
71925+#endif
71926+
71927+ /* perfectly valid assertion, except that when atom/txnh is not locked
71928+ * fusion can take place, and cd->atom points nowhere. */
71929+ /*
71930+ assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
71931+ */
71932+ return result;
71933+}
71934+
71935+/* Called to commit a transaction handle. This decrements the atom's number of
71936+   open handles and, if it is the last handle to commit and the atom should
71937+   commit, initiates atom commit. Returns 0. */
71938+static int commit_txnh(txn_handle * txnh)
71939+{
71940+ commit_data cd;
71941+ assert("umka-192", txnh != NULL);
71942+
71943+ memset(&cd, 0, sizeof cd);
71944+ cd.txnh = txnh;
71945+ cd.preflush = 10;
71946+
71947+ /* calls try_commit_txnh() until either atom commits, or error
71948+ * happens */
71949+ while (try_commit_txnh(&cd) != 0)
71950+ reiser4_preempt_point();
71951+
71952+ spin_lock_txnh(txnh);
71953+
71954+ cd.atom->txnh_count -= 1;
71955+ txnh->atom = NULL;
71956+ /* remove transaction handle from atom's list of transaction handles */
71957+ list_del_init(&txnh->txnh_link);
71958+
71959+ spin_unlock_txnh(txnh);
71960+ atom_dec_and_unlock(cd.atom);
71961+	/* if the current thread doesn't want to do a commit (TXNH_DONT_COMMIT
71962+	 * is set, probably because it takes time), that work is done
71963+	 * asynchronously by the ktxnmgrd daemon. */
71964+ if (cd.wake_ktxnmgrd_up)
71965+ ktxnmgrd_kick(&get_current_super_private()->tmgr);
71966+
71967+ return 0;
71968+}
71969+
71970+/* TRY_CAPTURE */
71971+
71972+/* This routine attempts a single block-capture request. It may return -E_REPEAT if some
71973+ condition indicates that the request should be retried, and it may block if the
71974+ txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
71975+
71976+ This routine encodes the basic logic of block capturing described by:
71977+
71978+ http://namesys.com/v4/v4.html
71979+
71980+   Our goal here is to ensure that any two blocks that contain dependent modifications
71981+   commit at the same time. This function enforces this discipline by initiating
71982+ fusion whenever a transaction handle belonging to one atom requests to read or write a
71983+ block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
71984+
71985+ In addition, this routine handles the initial assignment of atoms to blocks and
71986+ transaction handles. These are possible outcomes of this function:
71987+
71988+ 1. The block and handle are already part of the same atom: return immediate success
71989+
71990+ 2. The block is assigned but the handle is not: call capture_assign_txnh to assign
71991+ the handle to the block's atom.
71992+
71993+ 3. The handle is assigned but the block is not: call capture_assign_block to assign
71994+ the block to the handle's atom.
71995+
71996+ 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
71997+ to fuse atoms.
71998+
71999+ 5. Neither block nor handle are assigned: create a new atom and assign them both.
72000+
72001+ 6. A read request for a non-captured block: return immediate success.
72002+
72003+ This function acquires and releases the handle's spinlock. This function is called
72004+ under the jnode lock and if the return value is 0, it returns with the jnode lock still
72005+ held. If the return is -E_REPEAT or some other error condition, the jnode lock is
72006+   released. The external interface (reiser4_try_capture) manages re-acquiring the jnode
72007+ lock in the failure case.
72008+*/
72009+static int try_capture_block(
72010+ txn_handle * txnh, jnode * node, txn_capture mode,
72011+ txn_atom ** atom_alloc)
72012+{
72013+ txn_atom *block_atom;
72014+ txn_atom *txnh_atom;
72015+
72016+	/* Capture should not be called for READ_NONCOM requests; those are handled in reiser4_try_capture. */
72017+ assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
72018+
72019+ /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
72020+ * node->tree somewhere. */
72021+ assert("umka-194", txnh != NULL);
72022+ assert("umka-195", node != NULL);
72023+
72024+	/* The jnode is already locked! We are called from reiser4_try_capture(). */
72025+ assert_spin_locked(&(node->guard));
72026+ block_atom = node->atom;
72027+
72028+	/* Get the txnh spinlock; this allows us to compare txn_atom pointers, but it
72029+	   doesn't let us touch the atoms themselves. */
72030+ spin_lock_txnh(txnh);
72031+ txnh_atom = txnh->atom;
72032+	/* The capture process continues into one of four branches, depending on
72033+	   which of the two atoms (the block atom node->atom and the current atom
72034+	   txnh->atom) exist. */
72035+ if (txnh_atom == NULL) {
72036+ if (block_atom == NULL) {
72037+ spin_unlock_txnh(txnh);
72038+ spin_unlock_jnode(node);
72039+ /* assign empty atom to the txnh and repeat */
72040+ return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
72041+ } else {
72042+ atomic_inc(&block_atom->refcount);
72043+ /* node spin-lock isn't needed anymore */
72044+ spin_unlock_jnode(node);
72045+ if (!spin_trylock_atom(block_atom)) {
72046+ spin_unlock_txnh(txnh);
72047+ spin_lock_atom(block_atom);
72048+ spin_lock_txnh(txnh);
72049+ }
72050+ /* re-check state after getting txnh and the node
72051+ * atom spin-locked */
72052+ if (node->atom != block_atom || txnh->atom != NULL) {
72053+ spin_unlock_txnh(txnh);
72054+ atom_dec_and_unlock(block_atom);
72055+ return RETERR(-E_REPEAT);
72056+ }
72057+ atomic_dec(&block_atom->refcount);
72058+ if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
72059+ (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
72060+ block_atom->txnh_count != 0))
72061+ return capture_fuse_wait(txnh, block_atom, NULL, mode);
72062+ capture_assign_txnh_nolock(block_atom, txnh);
72063+ spin_unlock_txnh(txnh);
72064+ spin_unlock_atom(block_atom);
72065+ return RETERR(-E_REPEAT);
72066+ }
72067+ } else {
72068+		/* It is time to perform a deadlock prevention check over the
72069+		   node we want to capture.  It is possible this node was locked
72070+		   for read without capturing it. The optimization which allows
72071+		   this helps us keep atoms independent as long as possible,
72072+		   but it may cause lock/fuse deadlock problems.
72073+
72074+		   A number of similar deadlock situations with locked but not
72075+		   captured nodes were found. In each situation there are two
72076+		   or more threads: one of them does flushing while another one
72077+		   does routine balancing or tree lookup.  The flushing thread
72078+		   (F) sleeps in a long term locking request for node (N), while
72079+		   another thread (A) sleeps trying to capture some node already
72080+		   belonging to the atom of F, which is in a state that prevents
72081+		   immediate fusion.
72082+
72083+		   Deadlocks of this kind cannot happen if node N was properly
72084+		   captured by thread A. Thread F fuses atoms before locking;
72085+		   therefore the current atom of thread F and the current atom of
72086+		   thread A become the same atom and thread A may proceed.  This
72087+		   does not work if node N was not captured, because the fusion
72088+		   of atoms does not happen.
72089+
72090+ The following scheme solves the deadlock: If
72091+ longterm_lock_znode locks and does not capture a znode, that
72092+ znode is marked as MISSED_IN_CAPTURE. A node marked this way
72093+		   is processed by the code below, which restores the missed
72094+		   capture and fuses the current atoms of all the node's lock owners
72095+ by calling the fuse_not_fused_lock_owners() function. */
72096+ if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
72097+ JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
72098+ if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
72099+ spin_unlock_txnh(txnh);
72100+ spin_unlock_jnode(node);
72101+ fuse_not_fused_lock_owners(txnh, JZNODE(node));
72102+ return RETERR(-E_REPEAT);
72103+ }
72104+ }
72105+ if (block_atom == NULL) {
72106+ atomic_inc(&txnh_atom->refcount);
72107+ spin_unlock_txnh(txnh);
72108+ if (!spin_trylock_atom(txnh_atom)) {
72109+ spin_unlock_jnode(node);
72110+ spin_lock_atom(txnh_atom);
72111+ spin_lock_jnode(node);
72112+ }
72113+ if (txnh->atom != txnh_atom || node->atom != NULL
72114+ || JF_ISSET(node, JNODE_IS_DYING)) {
72115+ spin_unlock_jnode(node);
72116+ atom_dec_and_unlock(txnh_atom);
72117+ return RETERR(-E_REPEAT);
72118+ }
72119+ atomic_dec(&txnh_atom->refcount);
72120+ capture_assign_block_nolock(txnh_atom, node);
72121+ spin_unlock_atom(txnh_atom);
72122+ } else {
72123+ if (txnh_atom != block_atom) {
72124+ if (mode & TXN_CAPTURE_DONT_FUSE) {
72125+ spin_unlock_txnh(txnh);
72126+ spin_unlock_jnode(node);
72127+ /* we are in a "no-fusion" mode and @node is
72128+ * already part of transaction. */
72129+ return RETERR(-E_NO_NEIGHBOR);
72130+ }
72131+ return capture_init_fusion(node, txnh, mode);
72132+ }
72133+ spin_unlock_txnh(txnh);
72134+ }
72135+ }
72136+ return 0;
72137+}
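+
+/* Decision table for try_capture_block() above (B = block atom node->atom,
+ * H = handle atom txnh->atom):
+ *
+ *	B == NULL, H == NULL:	atom_begin_and_assign_to_txnh()
+ *	B != NULL, H == NULL:	capture_assign_txnh_nolock(), or
+ *				capture_fuse_wait() if B is past capturing
+ *	B == NULL, H != NULL:	capture_assign_block_nolock()
+ *	B != NULL, H != NULL:	success if B == H, capture_init_fusion()
+ *				otherwise (unless TXN_CAPTURE_DONT_FUSE)
+ */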
72138+
72139+static txn_capture
72140+build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
72141+{
72142+ txn_capture cap_mode;
72143+
72144+ assert_spin_locked(&(node->guard));
72145+
72146+ /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
72147+
72148+ if (lock_mode == ZNODE_WRITE_LOCK) {
72149+ cap_mode = TXN_CAPTURE_WRITE;
72150+ } else if (node->atom != NULL) {
72151+ cap_mode = TXN_CAPTURE_WRITE;
72152+ } else if (0 && /* txnh->mode == TXN_READ_FUSING && */
72153+ jnode_get_level(node) == LEAF_LEVEL) {
72154+ /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
72155+ /* We only need a READ_FUSING capture at the leaf level. This
72156+ is because the internal levels of the tree (twigs included)
72157+ are redundant from the point of the user that asked for a
72158+ read-fusing transcrash. The user only wants to read-fuse
72159+ atoms due to reading uncommitted data that another user has
72160+ written. It is the file system that reads/writes the
72161+ internal tree levels, the user only reads/writes leaves. */
72162+ cap_mode = TXN_CAPTURE_READ_ATOMIC;
72163+ } else {
72164+ /* In this case (read lock at a non-leaf) there's no reason to
72165+ * capture. */
72166+ /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
72167+ return 0;
72168+ }
72169+
72170+ cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
72171+ assert("nikita-3186", cap_mode != 0);
72172+ return cap_mode;
72173+}
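+
+/* Capture mode selection in build_capture_mode(), summarized:
+ *
+ *	write lock			-> TXN_CAPTURE_WRITE
+ *	read lock, node has an atom	-> TXN_CAPTURE_WRITE
+ *	read lock, no atom		-> 0 (no capture needed)
+ *
+ * plus whatever TXN_CAPTURE_NONBLOCKING / TXN_CAPTURE_DONT_FUSE bits the
+ * caller passed in @flags. */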
72174+
72175+/* This is an external interface to try_capture_block(); it calls
72176+   try_capture_block() repeatedly as long as -E_REPEAT is returned.
72177+
72178+   @node: node to capture,
72179+   @lock_mode: read or write lock, used in the capture mode calculation,
72180+   @flags: see the txn_capture flags enumeration,
72181+
72182+   @return: 0 - node was successfully captured, -E_REPEAT - the capture request
72183+            cannot be processed immediately, as was requested in flags,
72184+            < 0 - other errors.
72185+*/
72187+int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode,
72188+ txn_capture flags)
72189+{
72190+ txn_atom *atom_alloc = NULL;
72191+ txn_capture cap_mode;
72192+ txn_handle *txnh = get_current_context()->trans;
72193+ int ret;
72194+
72195+ assert_spin_locked(&(node->guard));
72196+
72197+ repeat:
72198+ if (JF_ISSET(node, JNODE_IS_DYING))
72199+ return RETERR(-EINVAL);
72200+ if (node->atom != NULL && txnh->atom == node->atom)
72201+ return 0;
72202+ cap_mode = build_capture_mode(node, lock_mode, flags);
72203+ if (cap_mode == 0 ||
72204+ (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
72205+ /* Mark this node as "MISSED". It helps in further deadlock
72206+ * analysis */
72207+ if (jnode_is_znode(node))
72208+ JF_SET(node, JNODE_MISSED_IN_CAPTURE);
72209+ return 0;
72210+ }
72211+ /* Repeat try_capture as long as -E_REPEAT is returned. */
72212+ ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
72213+ /* Regardless of non_blocking:
72214+
72215+ If ret == 0 then jnode is still locked.
72216+ If ret != 0 then jnode is unlocked.
72217+ */
72218+#if REISER4_DEBUG
72219+ if (ret == 0)
72220+ assert_spin_locked(&(node->guard));
72221+ else
72222+ assert_spin_not_locked(&(node->guard));
72223+#endif
72224+ assert_spin_not_locked(&(txnh->guard));
72225+
72226+ if (ret == -E_REPEAT) {
72227+ /* E_REPEAT implies all locks were released, therefore we need
72228+ to take the jnode's lock again. */
72229+ spin_lock_jnode(node);
72230+
72231+ /* Although this may appear to be a busy loop, it is not.
72232+ There are several conditions that cause E_REPEAT to be
72233+ returned by the call to try_capture_block, all cases
72234+ indicating some kind of state change that means you should
72235+ retry the request and will get a different result. In some
72236+ cases this could be avoided with some extra code, but
72237+ generally it is done because the necessary locks were
72238+ released as a result of the operation and repeating is the
72239+ simplest thing to do (less bug potential). The cases are:
72240+ atom fusion returns E_REPEAT after it completes (jnode and
72241+ txnh were unlocked); race conditions in assign_block,
72242+ assign_txnh, and init_fusion return E_REPEAT (trylock
72243+ failure); after going to sleep in capture_fuse_wait
72244+ (request was blocked but may now succeed). I'm not quite
72245+ sure how capture_copy works yet, but it may also return
72246+ E_REPEAT. When the request is legitimately blocked, the
72247+ requestor goes to sleep in fuse_wait, so this is not a busy
72248+ loop. */
72249+ /* NOTE-NIKITA: still don't understand:
72250+
72251+ try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
72252+
72253+ looks like busy loop?
72254+ */
72255+ goto repeat;
72256+ }
72257+
72258+ /* free extra atom object that was possibly allocated by
72259+ try_capture_block().
72260+
72261+ Do this before acquiring jnode spin lock to
72262+ minimize time spent under lock. --nikita */
72263+ if (atom_alloc != NULL) {
72264+ kmem_cache_free(_atom_slab, atom_alloc);
72265+ }
72266+
72267+ if (ret != 0) {
72268+ if (ret == -E_BLOCK) {
72269+ assert("nikita-3360",
72270+ cap_mode & TXN_CAPTURE_NONBLOCKING);
72271+ ret = -E_REPEAT;
72272+ }
72273+
72274+ /* Failure means jnode is not locked. FIXME_LATER_JMACD May
72275+ want to fix the above code to avoid releasing the lock and
72276+		   re-acquiring it, but there are cases where failure occurs
72277+ when the lock is not held, and those cases would need to be
72278+ modified to re-take the lock. */
72279+ spin_lock_jnode(node);
72280+ }
72281+
72282+ /* Jnode is still locked. */
72283+ assert_spin_locked(&(node->guard));
72284+ return ret;
72285+}
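+
+/* A minimal usage sketch for reiser4_try_capture() (essentially what
+ * try_capture_page_to_invalidate() below does): the jnode must be spinlocked
+ * on entry and is still spinlocked on return, whatever the return value:
+ *
+ *	spin_lock_jnode(node);
+ *	ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
+ *	spin_unlock_jnode(node);
+ */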
72286+
72287+static void release_two_atoms(txn_atom *one, txn_atom *two)
72288+{
72289+ spin_unlock_atom(one);
72290+ atom_dec_and_unlock(two);
72291+ spin_lock_atom(one);
72292+ atom_dec_and_unlock(one);
72293+}
72294+
72295+/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is
72296+ returned by that routine. The txn_capture request mode is computed here depending on
72297+ the transaction handle's type and the lock request. This is called from the depths of
72298+ the lock manager with the jnode lock held and it always returns with the jnode lock
72299+ held.
72300+*/
72301+
72302+/* fuse all 'active' atoms of lock owners of given node. */
72303+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
72304+{
72305+ lock_handle *lh;
72306+ int repeat;
72307+ txn_atom *atomh, *atomf;
72308+ reiser4_context *me = get_current_context();
72309+ reiser4_context *ctx = NULL;
72310+
72311+ assert_spin_not_locked(&(ZJNODE(node)->guard));
72312+ assert_spin_not_locked(&(txnh->hlock));
72313+
72314+ repeat:
72315+ repeat = 0;
72316+ atomh = txnh_get_atom(txnh);
72317+ spin_unlock_txnh(txnh);
72318+ assert("zam-692", atomh != NULL);
72319+
72320+ spin_lock_zlock(&node->lock);
72321+ /* inspect list of lock owners */
72322+ list_for_each_entry(lh, &node->lock.owners, owners_link) {
72323+ ctx = get_context_by_lock_stack(lh->owner);
72324+ if (ctx == me)
72325+ continue;
72326+		/* below we use two assumptions to avoid additional spin-locks
72327+		   while checking the condition:
72328+
72329+		   1) if the lock stack holds a lock, the transaction should be
72330+		   open, i.e. ctx->trans != NULL;
72331+
72332+		   2) reading the well-aligned ctx->trans->atom is atomic; if it
72333+		   equals the address of the spin-locked atomh, we take it that
72334+		   the atoms are the same and nothing has to be captured. */
72335+ if (atomh != ctx->trans->atom) {
72336+ reiser4_wake_up(lh->owner);
72337+ repeat = 1;
72338+ break;
72339+ }
72340+ }
72341+ if (repeat) {
72342+ if (!spin_trylock_txnh(ctx->trans)) {
72343+ spin_unlock_zlock(&node->lock);
72344+ spin_unlock_atom(atomh);
72345+ goto repeat;
72346+ }
72347+ atomf = ctx->trans->atom;
72348+ if (atomf == NULL) {
72349+ capture_assign_txnh_nolock(atomh, ctx->trans);
72350+ /* release zlock lock _after_ assigning the atom to the
72351+ * transaction handle, otherwise the lock owner thread
72352+ * may unlock all znodes, exit kernel context and here
72353+ * we would access an invalid transaction handle. */
72354+ spin_unlock_zlock(&node->lock);
72355+ spin_unlock_atom(atomh);
72356+ spin_unlock_txnh(ctx->trans);
72357+ goto repeat;
72358+ }
72359+ assert("zam-1059", atomf != atomh);
72360+ spin_unlock_zlock(&node->lock);
72361+ atomic_inc(&atomh->refcount);
72362+ atomic_inc(&atomf->refcount);
72363+ spin_unlock_txnh(ctx->trans);
72364+ if (atomf > atomh) {
72365+ spin_lock_atom_nested(atomf);
72366+ } else {
72367+ spin_unlock_atom(atomh);
72368+ spin_lock_atom(atomf);
72369+ spin_lock_atom_nested(atomh);
72370+ }
72371+ if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
72372+ release_two_atoms(atomf, atomh);
72373+ goto repeat;
72374+ }
72375+ atomic_dec(&atomh->refcount);
72376+ atomic_dec(&atomf->refcount);
72377+ capture_fuse_into(atomf, atomh);
72378+ goto repeat;
72379+ }
72380+ spin_unlock_zlock(&node->lock);
72381+ spin_unlock_atom(atomh);
72382+}
72383+
72384+/* This is the interface to capture unformatted nodes via their struct page
72385+   reference. Currently it is only used in reiser4_invalidatepage. */
72386+int try_capture_page_to_invalidate(struct page *pg)
72387+{
72388+ int ret;
72389+ jnode *node;
72390+
72391+ assert("umka-292", pg != NULL);
72392+ assert("nikita-2597", PageLocked(pg));
72393+
72394+ if (IS_ERR(node = jnode_of_page(pg))) {
72395+ return PTR_ERR(node);
72396+ }
72397+
72398+ spin_lock_jnode(node);
72399+ unlock_page(pg);
72400+
72401+ ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
72402+ spin_unlock_jnode(node);
72403+ jput(node);
72404+ lock_page(pg);
72405+ return ret;
72406+}
72407+
72408+/* This informs the transaction manager when a node is deleted. Add the block to the
72409+ atom's delete set and uncapture the block.
72410+
72411+VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
72412+explanations. find all the functions that use it, and unless there is some very
72413+good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
72414+move the loop to inside the function.
72415+
72416+VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times?
72417+ */
72418+void reiser4_uncapture_page(struct page *pg)
72419+{
72420+ jnode *node;
72421+ txn_atom *atom;
72422+
72423+ assert("umka-199", pg != NULL);
72424+ assert("nikita-3155", PageLocked(pg));
72425+
72426+ clear_page_dirty_for_io(pg);
72427+
72428+ reiser4_wait_page_writeback(pg);
72429+
72430+ node = jprivate(pg);
72431+ BUG_ON(node == NULL);
72432+
72433+ spin_lock_jnode(node);
72434+
72435+ atom = jnode_get_atom(node);
72436+ if (atom == NULL) {
72437+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
72438+ spin_unlock_jnode(node);
72439+ return;
72440+ }
72441+
72442+	/* We can remove a jnode from the transaction even if it is on a flush
72443+	 * queue prepped list; we only need to be sure that the flush queue is
72444+	 * not being written by reiser4_write_fq().  reiser4_write_fq() does not
72445+	 * use the atom spin lock to protect the prepped nodes list; instead,
72446+	 * it increments the atom's nr_running_queues counter for the time
72447+	 * when the prepped list is not protected by the spin lock.  Here we
72448+	 * check this counter if we want to remove the jnode from a flush queue
72449+	 * and, if the counter is not zero, wait for all reiser4_write_fq()
72450+	 * calls for this atom to complete. This is not significant overhead. */
72451+ while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
72452+ spin_unlock_jnode(node);
72453+ /*
72454+ * at this moment we want to wait for "atom event", viz. wait
72455+ * until @node can be removed from flush queue. But
72456+ * reiser4_atom_wait_event() cannot be called with page locked,
72457+ * because it deadlocks with jnode_extent_write(). Unlock page,
72458+ * after making sure (through page_cache_get()) that it cannot
72459+ * be released from memory.
72460+ */
72461+ page_cache_get(pg);
72462+ unlock_page(pg);
72463+ reiser4_atom_wait_event(atom);
72464+ lock_page(pg);
72465+ /*
72466+		 * the page may have been detached by ->writepage()->releasepage().
72467+ */
72468+ reiser4_wait_page_writeback(pg);
72469+ spin_lock_jnode(node);
72470+ page_cache_release(pg);
72471+ atom = jnode_get_atom(node);
72472+/* VS-FIXME-HANS: improve the commenting in this function */
72473+ if (atom == NULL) {
72474+ spin_unlock_jnode(node);
72475+ return;
72476+ }
72477+ }
72478+ reiser4_uncapture_block(node);
72479+ spin_unlock_atom(atom);
72480+ jput(node);
72481+}
72482+
72483+/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
72484+ * inode's tree of jnodes */
72485+void reiser4_uncapture_jnode(jnode * node)
72486+{
72487+ txn_atom *atom;
72488+
72489+ assert_spin_locked(&(node->guard));
72490+ assert("", node->pg == 0);
72491+
72492+ atom = jnode_get_atom(node);
72493+ if (atom == NULL) {
72494+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
72495+ spin_unlock_jnode(node);
72496+ return;
72497+ }
72498+
72499+ reiser4_uncapture_block(node);
72500+ spin_unlock_atom(atom);
72501+ jput(node);
72502+}
72503+
72504+/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer,
72505+ increases atom refcount and txnh_count, adds to txnh_list. */
72506+static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
72507+{
72508+ assert("umka-200", atom != NULL);
72509+ assert("umka-201", txnh != NULL);
72510+
72511+ assert_spin_locked(&(txnh->hlock));
72512+ assert_spin_locked(&(atom->alock));
72513+ assert("jmacd-824", txnh->atom == NULL);
72514+ assert("nikita-3540", atom_isopen(atom));
72515+ BUG_ON(txnh->atom != NULL);
72516+
72517+ atomic_inc(&atom->refcount);
72518+ txnh->atom = atom;
72519+ reiser4_ctx_gfp_mask_set();
72520+ list_add_tail(&txnh->txnh_link, &atom->txnh_list);
72521+ atom->txnh_count += 1;
72522+}
72523+
72524+/* No-locking version of assign_block. Sets the block's atom pointer, references the
72525+   block, adds it to the atom's clean capture list, and increments capture_count. */
72526+static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
72527+{
72528+ assert("umka-202", atom != NULL);
72529+ assert("umka-203", node != NULL);
72530+ assert_spin_locked(&(node->guard));
72531+ assert_spin_locked(&(atom->alock));
72532+ assert("jmacd-323", node->atom == NULL);
72533+ BUG_ON(!list_empty_careful(&node->capture_link));
72534+ assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
72535+
72536+ /* Pointer from jnode to atom is not counted in atom->refcount. */
72537+ node->atom = atom;
72538+
72539+ list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
72540+ atom->capture_count += 1;
72541+ /* reference to jnode is acquired by atom. */
72542+ jref(node);
72543+
72544+ ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
72545+
72546+ LOCK_CNT_INC(t_refs);
72547+}
72548+
72549+/* common code for dirtying both unformatted jnodes and formatted znodes. */
72550+static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
72551+{
72552+ assert_spin_locked(&(node->guard));
72553+ assert_spin_locked(&(atom->alock));
72554+ assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
72555+
72556+ JF_SET(node, JNODE_DIRTY);
72557+
72558+ get_current_context()->nr_marked_dirty++;
72559+
72560+	/* We grab2flush_reserve one additional block only if the node was
72561+	   not CREATED and jnode_flush did not sort it into either the
72562+	   relocate set or the overwrite set. If the node is in the overwrite
72563+	   or relocate set, we assume that the atom's flush reserved counter
72564+	   was already adjusted. */
72565+ if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
72566+ && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
72567+ && !jnode_is_cluster_page(node)) {
72568+ assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr));
72569+ assert("vs-1506", *jnode_get_block(node) != 0);
72570+ grabbed2flush_reserved_nolock(atom, (__u64) 1);
72571+ JF_SET(node, JNODE_FLUSH_RESERVED);
72572+ }
72573+
72574+ if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
72575+		/* Sometimes a node is set dirty before being captured -- the case for
72576+		   new jnodes. In that case the jnode will be added to the appropriate
72577+		   list in capture_assign_block_nolock. Another reason not to re-link the
72578+		   jnode is that the jnode is on a flush queue (see flush.c for
72579+		   details). */
72581+
72582+ int level = jnode_get_level(node);
72583+
72584+ assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
72585+ assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
72586+ assert("nikita-2607", 0 <= level);
72587+ assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
72588+
72589+ /* move node to atom's dirty list */
72590+ list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
72591+ ON_DEBUG(count_jnode
72592+ (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
72593+ }
72594+}
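+
+/* The flush-reserve rule in do_jnode_make_dirty(), condensed: one block moves
+ * from the "grabbed" counter to "flush reserved" only for a leaf that already
+ * has a real block number and was not pre-sorted by jnode_flush():
+ *
+ *	if (!CREATED && !RELOC && !OVRWR && leaf && !cluster_page)
+ *		grabbed2flush_reserved_nolock(atom, (__u64) 1);
+ */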
72595+
72596+/* Set the dirty status for this (spin locked) jnode. */
72597+void jnode_make_dirty_locked(jnode * node)
72598+{
72599+ assert("umka-204", node != NULL);
72600+ assert_spin_locked(&(node->guard));
72601+
72602+ if (REISER4_DEBUG && rofs_jnode(node)) {
72603+ warning("nikita-3365", "Dirtying jnode on rofs");
72604+ dump_stack();
72605+ }
72606+
72607+ /* Fast check for already dirty node */
72608+ if (!JF_ISSET(node, JNODE_DIRTY)) {
72609+ txn_atom *atom;
72610+
72611+ atom = jnode_get_atom(node);
72612+ assert("vs-1094", atom);
72613+		/* Check the jnode dirty status again because the node spin lock
72614+		 * might have been released inside jnode_get_atom(). */
72615+ if (likely(!JF_ISSET(node, JNODE_DIRTY)))
72616+ do_jnode_make_dirty(node, atom);
72617+ spin_unlock_atom(atom);
72618+ }
72619+}
72620+
72621+/* Set the dirty status for this znode. */
72622+void znode_make_dirty(znode * z)
72623+{
72624+ jnode *node;
72625+ struct page *page;
72626+
72627+ assert("umka-204", z != NULL);
72628+ assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
72629+ assert("nikita-3560", znode_is_write_locked(z));
72630+
72631+ node = ZJNODE(z);
72632+	/* the znode is long-term locked; we can check the dirty bit without the spinlock */
72633+ if (JF_ISSET(node, JNODE_DIRTY)) {
72634+		/* the znode is dirty already; all we have to do is bump the znode version */
72635+ z->version = znode_build_version(jnode_get_tree(node));
72636+ return;
72637+ }
72638+
72639+ spin_lock_jnode(node);
72640+ jnode_make_dirty_locked(node);
72641+ page = jnode_page(node);
72642+ if (page != NULL) {
72643+		/* this is a useful assertion (it allows one to check that no
72644+		 * modifications are lost due to an update of an in-flight page),
72645+		 * but it requires locking the page to check the PG_writeback
72646+		 * bit. */
72647+ /* assert("nikita-3292",
72648+ !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
72649+ page_cache_get(page);
72650+
72651+ /* jnode lock is not needed for the rest of
72652+ * znode_set_dirty(). */
72653+ spin_unlock_jnode(node);
72654+ /* reiser4 file write code calls set_page_dirty for
72655+ * unformatted nodes, for formatted nodes we do it here. */
72656+ reiser4_set_page_dirty_internal(page);
72657+ page_cache_release(page);
72658+ /* bump version counter in znode */
72659+ z->version = znode_build_version(jnode_get_tree(node));
72660+ } else {
72661+ assert("zam-596", znode_above_root(JZNODE(node)));
72662+ spin_unlock_jnode(node);
72663+ }
72664+
72665+ assert("nikita-1900", znode_is_write_locked(z));
72666+ assert("jmacd-9777", node->atom != NULL);
72667+}
72668+
72669+int reiser4_sync_atom(txn_atom * atom)
72670+{
72671+ int result;
72672+ txn_handle *txnh;
72673+
72674+ txnh = get_current_context()->trans;
72675+
72676+ result = 0;
72677+ if (atom != NULL) {
72678+ if (atom->stage < ASTAGE_PRE_COMMIT) {
72679+ spin_lock_txnh(txnh);
72680+ capture_assign_txnh_nolock(atom, txnh);
72681+ result = force_commit_atom(txnh);
72682+ } else if (atom->stage < ASTAGE_POST_COMMIT) {
72683+ /* wait atom commit */
72684+ reiser4_atom_wait_event(atom);
72685+ /* try once more */
72686+ result = RETERR(-E_REPEAT);
72687+ } else
72688+ spin_unlock_atom(atom);
72689+ }
72690+ return result;
72691+}
72692+
72693+#if REISER4_DEBUG
72694+
72695+/* move a jnode from one list to another;
72696+   call this after atom->capture_count is updated */
72697+void
72698+count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
72699+ atom_list new_list, int check_lists)
72700+{
72701+ struct list_head *pos;
72702+
72703+ assert("zam-1018", atom_is_protected(atom));
72704+ assert_spin_locked(&(node->guard));
72705+ assert("", NODE_LIST(node) == old_list);
72706+
72707+ switch (NODE_LIST(node)) {
72708+ case NOT_CAPTURED:
72709+ break;
72710+ case DIRTY_LIST:
72711+ assert("", atom->dirty > 0);
72712+ atom->dirty--;
72713+ break;
72714+ case CLEAN_LIST:
72715+ assert("", atom->clean > 0);
72716+ atom->clean--;
72717+ break;
72718+ case FQ_LIST:
72719+ assert("", atom->fq > 0);
72720+ atom->fq--;
72721+ break;
72722+ case WB_LIST:
72723+ assert("", atom->wb > 0);
72724+ atom->wb--;
72725+ break;
72726+ case OVRWR_LIST:
72727+ assert("", atom->ovrwr > 0);
72728+ atom->ovrwr--;
72729+ break;
72730+ default:
72731+ impossible("", "");
72732+ }
72733+
72734+ switch (new_list) {
72735+ case NOT_CAPTURED:
72736+ break;
72737+ case DIRTY_LIST:
72738+ atom->dirty++;
72739+ break;
72740+ case CLEAN_LIST:
72741+ atom->clean++;
72742+ break;
72743+ case FQ_LIST:
72744+ atom->fq++;
72745+ break;
72746+ case WB_LIST:
72747+ atom->wb++;
72748+ break;
72749+ case OVRWR_LIST:
72750+ atom->ovrwr++;
72751+ break;
72752+ default:
72753+ impossible("", "");
72754+ }
72755+ ASSIGN_NODE_LIST(node, new_list);
72756+ if (0 && check_lists) {
72757+ int count;
72758+ tree_level level;
72759+
72760+ count = 0;
72761+
72762+ /* flush queue list */
72763+ /* reiser4_check_fq(atom); */
72764+
72765+ /* dirty list */
72766+ count = 0;
72767+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
72768+ list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
72769+ count++;
72770+ }
72771+ if (count != atom->dirty)
72772+ warning("", "dirty counter %d, real %d\n", atom->dirty,
72773+ count);
72774+
72775+ /* clean list */
72776+ count = 0;
72777+ list_for_each(pos, ATOM_CLEAN_LIST(atom))
72778+ count++;
72779+ if (count != atom->clean)
72780+ warning("", "clean counter %d, real %d\n", atom->clean,
72781+ count);
72782+
72783+ /* wb list */
72784+ count = 0;
72785+ list_for_each(pos, ATOM_WB_LIST(atom))
72786+ count++;
72787+ if (count != atom->wb)
72788+ warning("", "wb counter %d, real %d\n", atom->wb,
72789+ count);
72790+
72791+ /* overwrite list */
72792+ count = 0;
72793+ list_for_each(pos, ATOM_OVRWR_LIST(atom))
72794+ count++;
72795+
72796+ if (count != atom->ovrwr)
72797+ warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
72798+ count);
72799+ }
72800+ assert("vs-1624", atom->num_queued == atom->fq);
72801+ if (atom->capture_count !=
72802+ atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
72803+ printk
72804+ ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
72805+ atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
72806+ atom->wb, atom->fq);
72807+ assert("vs-1622",
72808+ atom->capture_count ==
72809+ atom->dirty + atom->clean + atom->ovrwr + atom->wb +
72810+ atom->fq);
72811+ }
72812+}
72813+
72814+#endif
72815+
72816+/* Make the node OVRWR and put it on the atom->overwrite_nodes list. The atom
72817+ * lock and jnode lock should be taken before calling this function. */
72818+void jnode_make_wander_nolock(jnode * node)
72819+{
72820+ txn_atom *atom;
72821+
72822+ assert("nikita-2431", node != NULL);
72823+ assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
72824+ assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
72825+ assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
72826+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
72827+
72828+ atom = node->atom;
72829+
72830+ assert("zam-895", atom != NULL);
72831+ assert("zam-894", atom_is_protected(atom));
72832+
72833+ JF_SET(node, JNODE_OVRWR);
72834+ /* move node to atom's overwrite list */
72835+ list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
72836+ ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
72837+}
72838+
72839+/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
72840+ * this function. */
72841+void jnode_make_wander(jnode * node)
72842+{
72843+ txn_atom *atom;
72844+
72845+ spin_lock_jnode(node);
72846+ atom = jnode_get_atom(node);
72847+ assert("zam-913", atom != NULL);
72848+ assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
72849+
72850+ jnode_make_wander_nolock(node);
72851+ spin_unlock_atom(atom);
72852+ spin_unlock_jnode(node);
72853+}
72854+
72855+/* this just sets RELOC bit */
72856+static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
72857+{
72858+ assert_spin_locked(&(node->guard));
72859+ assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
72860+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
72861+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
72862+ assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
72863+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
72864+ jnode_set_reloc(node);
72865+}
72866+
72867+/* Make znode RELOC and put it on flush queue */
72868+void znode_make_reloc(znode * z, flush_queue_t * fq)
72869+{
72870+ jnode *node;
72871+ txn_atom *atom;
72872+
72873+ node = ZJNODE(z);
72874+ spin_lock_jnode(node);
72875+
72876+ atom = jnode_get_atom(node);
72877+ assert("zam-919", atom != NULL);
72878+
72879+ jnode_make_reloc_nolock(fq, node);
72880+ queue_jnode(fq, node);
72881+
72882+ spin_unlock_atom(atom);
72883+ spin_unlock_jnode(node);
72884+
72885+}
72886+
72887+/* Make unformatted node RELOC and put it on flush queue */
72888+void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
72889+{
72890+ assert("vs-1479", jnode_is_unformatted(node));
72891+
72892+ jnode_make_reloc_nolock(fq, node);
72893+ queue_jnode(fq, node);
72894+}
72895+
72896+int reiser4_capture_super_block(struct super_block *s)
72897+{
72898+ int result;
72899+ znode *uber;
72900+ lock_handle lh;
72901+
72902+ init_lh(&lh);
72903+ result = get_uber_znode(reiser4_get_tree(s),
72904+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
72905+ if (result)
72906+ return result;
72907+
72908+ uber = lh.node;
72909+ /* Grabbing one block for superblock */
72910+ result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
72911+	if (result == 0)
72912+		znode_make_dirty(uber);
72913+
72914+	/* release the uber znode even if grabbing space failed, so that
72915+	   the lock handle is not leaked on the error path */
72916+	done_lh(&lh);
72917+	return result;
72918+}
72919+
72920+/* Wake up every handle on the atom's WAITFOR list */
72921+static void wakeup_atom_waitfor_list(txn_atom * atom)
72922+{
72923+ txn_wait_links *wlinks;
72924+
72925+ assert("umka-210", atom != NULL);
72926+
72927+ /* atom is locked */
72928+ list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
72929+ if (wlinks->waitfor_cb == NULL ||
72930+ wlinks->waitfor_cb(atom, wlinks))
72931+ /* Wake up. */
72932+ reiser4_wake_up(wlinks->_lock_stack);
72933+ }
72934+}
72935+
72936+/* Wake up every handle on the atom's WAITING list */
72937+static void wakeup_atom_waiting_list(txn_atom * atom)
72938+{
72939+ txn_wait_links *wlinks;
72940+
72941+ assert("umka-211", atom != NULL);
72942+
72943+ /* atom is locked */
72944+ list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
72945+ if (wlinks->waiting_cb == NULL ||
72946+ wlinks->waiting_cb(atom, wlinks))
72947+ /* Wake up. */
72948+ reiser4_wake_up(wlinks->_lock_stack);
72949+ }
72950+}
72951+
72952+/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */
72953+static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
72954+{
72955+ assert("nikita-3330", atom != NULL);
72956+ assert_spin_locked(&(atom->alock));
72957+
72958+ /* atom->txnh_count == 1 is for waking waiters up if we are releasing
72959+ * last transaction handle. */
72960+ return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
72961+}
72962+
72963+/* The general purpose of this function is to wait on the first of two possible events.
72964+ The situation is that a handle (and its atom atomh) is blocked trying to capture a
72965+ block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The
72966+ handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with
72967+ another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
72968+ needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will
72969+ proceed and fuse the two atoms in the CAPTURE_WAIT state.
72970+
72971+   In other words, if either atomh or atomf changes state, the handle will be
72972+   awakened; thus there are two lists per atom: WAITING and WAITFOR.
72973+
72974+   This is also called by capture_assign_txnh with (atomh == NULL) when the handle
72975+   waits for atomf to close but is not yet assigned to an atom of its own.
72976+
72977+ Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK,
72978+ BOTH_ATOM_LOCKS. Result: all four locks are released.
72979+*/
72980+static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
72981+ txn_atom * atomh, txn_capture mode)
72982+{
72983+ int ret;
72984+ txn_wait_links wlinks;
72985+
72986+ assert("umka-213", txnh != NULL);
72987+ assert("umka-214", atomf != NULL);
72988+
72989+ if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
72990+ spin_unlock_txnh(txnh);
72991+ spin_unlock_atom(atomf);
72992+
72993+ if (atomh) {
72994+ spin_unlock_atom(atomh);
72995+ }
72996+
72997+ return RETERR(-E_BLOCK);
72998+ }
72999+
73000+ /* Initialize the waiting list links. */
73001+ init_wlinks(&wlinks);
73002+
73003+ /* Add txnh to atomf's waitfor list, unlock atomf. */
73004+ list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
73005+ wlinks.waitfor_cb = wait_for_fusion;
73006+ atomic_inc(&atomf->refcount);
73007+ spin_unlock_atom(atomf);
73008+
73009+ if (atomh) {
73010+ /* Add txnh to atomh's waiting list, unlock atomh. */
73011+ list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
73012+ atomic_inc(&atomh->refcount);
73013+ spin_unlock_atom(atomh);
73014+ }
73015+
73016+ /* Go to sleep. */
73017+ spin_unlock_txnh(txnh);
73018+
73019+ ret = reiser4_prepare_to_sleep(wlinks._lock_stack);
73020+ if (ret == 0) {
73021+ reiser4_go_to_sleep(wlinks._lock_stack);
73022+ ret = RETERR(-E_REPEAT);
73023+ }
73024+
73025+ /* Remove from the waitfor list. */
73026+ spin_lock_atom(atomf);
73027+
73028+ list_del(&wlinks._fwaitfor_link);
73029+ atom_dec_and_unlock(atomf);
73030+
73031+ if (atomh) {
73032+ /* Remove from the waiting list. */
73033+ spin_lock_atom(atomh);
73034+ list_del(&wlinks._fwaiting_link);
73035+ atom_dec_and_unlock(atomh);
73036+ }
73037+ return ret;
73038+}
73039+
73040+static void lock_two_atoms(txn_atom * one, txn_atom * two)
73041+{
73042+ assert("zam-1067", one != two);
73043+
73044+ /* lock the atom with lesser address first */
73045+ if (one < two) {
73046+ spin_lock_atom(one);
73047+ spin_lock_atom_nested(two);
73048+ } else {
73049+ spin_lock_atom(two);
73050+ spin_lock_atom_nested(one);
73051+ }
73052+}
73053+
73054+/* Perform the necessary work to prepare for fusing two atoms, which involves
73055+ * acquiring two atom locks in the proper order. If the node's atom is
73056+ * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's
73057+ * atom is not then the handle's request is put to sleep. If the node's atom
73058+ * is committing, then the node can be copy-on-captured. Otherwise, pick the
73059+ * atom with fewer pointers to be fused into the atom with more pointers and
73060+ * call capture_fuse_into.
73061+ */
73062+static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
73063+{
73064+ txn_atom * txnh_atom = txnh->atom;
73065+ txn_atom * block_atom = node->atom;
73066+
73067+ atomic_inc(&txnh_atom->refcount);
73068+ atomic_inc(&block_atom->refcount);
73069+
73070+ spin_unlock_txnh(txnh);
73071+ spin_unlock_jnode(node);
73072+
73073+ lock_two_atoms(txnh_atom, block_atom);
73074+
73075+ if (txnh->atom != txnh_atom || node->atom != block_atom ) {
73076+ release_two_atoms(txnh_atom, block_atom);
73077+ return RETERR(-E_REPEAT);
73078+ }
73079+
73080+ atomic_dec(&txnh_atom->refcount);
73081+ atomic_dec(&block_atom->refcount);
73082+
73083+ assert ("zam-1066", atom_isopen(txnh_atom));
73084+
73085+ if (txnh_atom->stage >= block_atom->stage ||
73086+ (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
73087+ capture_fuse_into(txnh_atom, block_atom);
73088+ return RETERR(-E_REPEAT);
73089+ }
73090+ spin_lock_txnh(txnh);
73091+ return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
73092+}
73093+
73094+/* This function splices together two jnode lists (small and large) and sets all jnodes in
73095+   the small list to point to the large atom. Returns the length of the small list. */
73096+static int
73097+capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
73098+ struct list_head *small_head)
73099+{
73100+ int count = 0;
73101+ jnode *node;
73102+
73103+ assert("umka-218", large != NULL);
73104+ assert("umka-219", large_head != NULL);
73105+ assert("umka-220", small_head != NULL);
73106+ /* small atom should be locked also. */
73107+ assert_spin_locked(&(large->alock));
73108+
73109+ /* For every jnode on small's capture list... */
73110+ list_for_each_entry(node, small_head, capture_link) {
73111+ count += 1;
73112+
73113+ /* With the jnode lock held, update atom pointer. */
73114+ spin_lock_jnode(node);
73115+ node->atom = large;
73116+ spin_unlock_jnode(node);
73117+ }
73118+
73119+ /* Splice the lists. */
73120+ list_splice_init(small_head, large_head->prev);
73121+
73122+ return count;
73123+}
73124+
73125+/* This function splices together two txnh lists (small and large) and sets all txn handles in
73126+   the small list to point to the large atom. Returns the length of the small list. */
73127+static int
73128+capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
73129+ struct list_head *small_head)
73130+{
73131+ int count = 0;
73132+ txn_handle *txnh;
73133+
73134+ assert("umka-221", large != NULL);
73135+ assert("umka-222", large_head != NULL);
73136+ assert("umka-223", small_head != NULL);
73137+
73138+ /* Adjust every txnh to the new atom. */
73139+ list_for_each_entry(txnh, small_head, txnh_link) {
73140+ count += 1;
73141+
73142+ /* With the txnh lock held, update atom pointer. */
73143+ spin_lock_txnh(txnh);
73144+ txnh->atom = large;
73145+ spin_unlock_txnh(txnh);
73146+ }
73147+
73148+ /* Splice the txn_handle list. */
73149+ list_splice_init(small_head, large_head->prev);
73150+
73151+ return count;
73152+}
73153+
73154+/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are
73155+ added to LARGE and their ->atom pointers are all updated. The associated counts are
73156+ updated as well, and any waiting handles belonging to either are awakened. Finally the
73157+ smaller atom's refcount is decremented.
73158+*/
73159+static void capture_fuse_into(txn_atom * small, txn_atom * large)
73160+{
73161+ int level;
73162+ unsigned zcount = 0;
73163+ unsigned tcount = 0;
73164+
73165+ assert("umka-224", small != NULL);
73166+	assert("umka-225", large != NULL);
73167+
73168+ assert_spin_locked(&(large->alock));
73169+ assert_spin_locked(&(small->alock));
73170+
73171+ assert("jmacd-201", atom_isopen(small));
73172+ assert("jmacd-202", atom_isopen(large));
73173+
73174+ /* Splice and update the per-level dirty jnode lists */
73175+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73176+ zcount +=
73177+ capture_fuse_jnode_lists(large,
73178+ ATOM_DIRTY_LIST(large, level),
73179+ ATOM_DIRTY_LIST(small, level));
73180+ }
73181+
73182+ /* Splice and update the [clean,dirty] jnode and txnh lists */
73183+ zcount +=
73184+ capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
73185+ ATOM_CLEAN_LIST(small));
73186+ zcount +=
73187+ capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
73188+ ATOM_OVRWR_LIST(small));
73189+ zcount +=
73190+ capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
73191+ ATOM_WB_LIST(small));
73192+ zcount +=
73193+ capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
73194+ tcount +=
73195+ capture_fuse_txnh_lists(large, &large->txnh_list,
73196+ &small->txnh_list);
73197+
73198+ /* Check our accounting. */
73199+ assert("jmacd-1063",
73200+ zcount + small->num_queued == small->capture_count);
73201+ assert("jmacd-1065", tcount == small->txnh_count);
73202+
73203+ /* sum the numbers of waiting threads */
73204+ large->nr_waiters += small->nr_waiters;
73205+ small->nr_waiters = 0;
73206+
73207+ /* splice flush queues */
73208+ reiser4_fuse_fq(large, small);
73209+
73210+ /* update the counters of jnodes on each of the atom's lists */
73211+ ON_DEBUG(large->dirty += small->dirty;
73212+ small->dirty = 0;
73213+ large->clean += small->clean;
73214+ small->clean = 0;
73215+ large->ovrwr += small->ovrwr;
73216+ small->ovrwr = 0;
73217+ large->wb += small->wb;
73218+ small->wb = 0;
73219+ large->fq += small->fq;
73220+ small->fq = 0;);
73221+
73222+ /* count flushers in result atom */
73223+ large->nr_flushers += small->nr_flushers;
73224+ small->nr_flushers = 0;
73225+
73226+ /* update counts of flushed nodes */
73227+ large->flushed += small->flushed;
73228+ small->flushed = 0;
73229+
73230+ /* Transfer list counts to large. */
73231+ large->txnh_count += small->txnh_count;
73232+ large->capture_count += small->capture_count;
73233+
73234+ /* Add all txnh references to large. */
73235+ atomic_add(small->txnh_count, &large->refcount);
73236+ atomic_sub(small->txnh_count, &small->refcount);
73237+
73238+ /* Reset small counts */
73239+ small->txnh_count = 0;
73240+ small->capture_count = 0;
73241+
73242+ /* Assign the oldest start_time, merge flags. */
73243+ large->start_time = min(large->start_time, small->start_time);
73244+ large->flags |= small->flags;
73245+
73246+ /* Merge blocknr sets. */
73247+ blocknr_set_merge(&small->delete_set, &large->delete_set);
73248+ blocknr_set_merge(&small->wandered_map, &large->wandered_map);
73249+
73250+ /* Merge allocated/deleted file counts */
73251+ large->nr_objects_deleted += small->nr_objects_deleted;
73252+ large->nr_objects_created += small->nr_objects_created;
73253+
73254+ small->nr_objects_deleted = 0;
73255+ small->nr_objects_created = 0;
73256+
73257+ /* Merge allocated blocks counts */
73258+ large->nr_blocks_allocated += small->nr_blocks_allocated;
73259+
73260+ large->nr_running_queues += small->nr_running_queues;
73261+ small->nr_running_queues = 0;
73262+
73263+ /* Merge blocks reserved for overwrite set. */
73264+ large->flush_reserved += small->flush_reserved;
73265+ small->flush_reserved = 0;
73266+
73267+ if (large->stage < small->stage) {
73268+ /* Large only needs to notify if it has changed state. */
73269+ reiser4_atom_set_stage(large, small->stage);
73270+ wakeup_atom_waiting_list(large);
73271+ }
73272+
73273+ reiser4_atom_set_stage(small, ASTAGE_INVALID);
73274+
73275+ /* Notify any waiters--small needs to unload its wait lists. Waiters
73276+ actually remove themselves from the list before returning from the
73277+ fuse_wait function. */
73278+ wakeup_atom_waiting_list(small);
73279+
73280+ /* Unlock atoms */
73281+ spin_unlock_atom(large);
73282+ atom_dec_and_unlock(small);
73283+}
73284+
73285+/* TXNMGR STUFF */
73286+
73287+/* Release a block from the atom, reversing the effects of being captured;
73288+   the atom's reference to the jnode is not released here since spin-locks are held.
73289+ Currently this is only called when the atom commits.
73290+
73291+ NOTE: this function does not release a (journal) reference to jnode
73292+ due to locking optimizations, you should call jput() somewhere after
73293+ calling reiser4_uncapture_block(). */
73294+void reiser4_uncapture_block(jnode * node)
73295+{
73296+ txn_atom *atom;
73297+
73298+ assert("umka-226", node != NULL);
73299+ atom = node->atom;
73300+ assert("umka-228", atom != NULL);
73301+
73302+ assert("jmacd-1021", node->atom == atom);
73303+ assert_spin_locked(&(node->guard));
73304+ assert("jmacd-1023", atom_is_protected(atom));
73305+
73306+ JF_CLR(node, JNODE_DIRTY);
73307+ JF_CLR(node, JNODE_RELOC);
73308+ JF_CLR(node, JNODE_OVRWR);
73309+ JF_CLR(node, JNODE_CREATED);
73310+ JF_CLR(node, JNODE_WRITEBACK);
73311+ JF_CLR(node, JNODE_REPACK);
73312+
73313+ list_del_init(&node->capture_link);
73314+ if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
73315+ assert("zam-925", atom_isopen(atom));
73316+ assert("vs-1623", NODE_LIST(node) == FQ_LIST);
73317+ ON_DEBUG(atom->num_queued--);
73318+ JF_CLR(node, JNODE_FLUSH_QUEUED);
73319+ }
73320+ atom->capture_count -= 1;
73321+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
73322+ node->atom = NULL;
73323+
73324+ spin_unlock_jnode(node);
73325+ LOCK_CNT_DEC(t_refs);
73326+}
73327+
73328+/* Unconditional insert of jnode into atom's overwrite list. Currently used in
73329+   bitmap-based allocator code for adding modified bitmap blocks to the
73330+ transaction. @atom and @node are spin locked */
73331+void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
73332+{
73333+ assert("zam-538", atom_is_protected(atom));
73334+ assert_spin_locked(&(node->guard));
73335+ assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
73336+ assert("zam-543", node->atom == NULL);
73337+ assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
73338+
73339+ list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
73340+ jref(node);
73341+ node->atom = atom;
73342+ atom->capture_count++;
73343+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
73344+}
73345+
73346+static int count_deleted_blocks_actor(txn_atom * atom,
73347+ const reiser4_block_nr * a,
73348+ const reiser4_block_nr * b, void *data)
73349+{
73350+ reiser4_block_nr *counter = data;
73351+
73352+ assert("zam-995", data != NULL);
73353+ assert("zam-996", a != NULL);
73354+ if (b == NULL)
73355+ *counter += 1;
73356+ else
73357+ *counter += *b;
73358+ return 0;
73359+}
73360+
73361+reiser4_block_nr txnmgr_count_deleted_blocks(void)
73362+{
73363+ reiser4_block_nr result;
73364+ txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73365+ txn_atom *atom;
73366+
73367+ result = 0;
73368+
73369+ spin_lock_txnmgr(tmgr);
73370+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
73371+ spin_lock_atom(atom);
73372+ if (atom_isopen(atom))
73373+ blocknr_set_iterator(
73374+ atom, &atom->delete_set,
73375+ count_deleted_blocks_actor, &result, 0);
73376+ spin_unlock_atom(atom);
73377+ }
73378+ spin_unlock_txnmgr(tmgr);
73379+
73380+ return result;
73381+}
73382+
73383+/*
73384+ * Local variables:
73385+ * c-indentation-style: "K&R"
73386+ * mode-name: "LC"
73387+ * c-basic-offset: 8
73388+ * tab-width: 8
73389+ * fill-column: 79
73390+ * End:
73391+ */
73392diff -urN linux-2.6.24.orig/fs/reiser4/txnmgr.h linux-2.6.24/fs/reiser4/txnmgr.h
73393--- linux-2.6.24.orig/fs/reiser4/txnmgr.h 1970-01-01 03:00:00.000000000 +0300
73394+++ linux-2.6.24/fs/reiser4/txnmgr.h 2008-01-25 11:39:07.112253026 +0300
73395@@ -0,0 +1,701 @@
73396+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
73397+ * reiser4/README */
73398+
73399+/* data-types and function declarations for transaction manager. See txnmgr.c
73400+ * for details. */
73401+
73402+#ifndef __REISER4_TXNMGR_H__
73403+#define __REISER4_TXNMGR_H__
73404+
73405+#include "forward.h"
73406+#include "dformat.h"
73407+
73408+#include <linux/fs.h>
73409+#include <linux/mm.h>
73410+#include <linux/types.h>
73411+#include <linux/spinlock.h>
73412+#include <asm/atomic.h>
73413+#include <linux/wait.h>
73414+
73415+/* TYPE DECLARATIONS */
73416+
73417+/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
73418+ A capture request dynamically assigns a block to the calling thread's transaction
73419+ handle. */
73420+typedef enum {
73421+ /* A READ_ATOMIC request indicates that a block will be read and that the caller's
73422+ atom should fuse in order to ensure that the block commits atomically with the
73423+ caller. */
73424+ TXN_CAPTURE_READ_ATOMIC = (1 << 0),
73425+
73426+ /* A READ_NONCOM request indicates that a block will be read and that the caller is
73427+ willing to read a non-committed block without causing atoms to fuse. */
73428+ TXN_CAPTURE_READ_NONCOM = (1 << 1),
73429+
73430+ /* A READ_MODIFY request indicates that a block will be read but that the caller
73431+ wishes for the block to be captured as it will be written. This capture request
73432+ mode is not currently used, but eventually it will be useful for preventing
73433+ deadlock in read-modify-write cycles. */
73434+ TXN_CAPTURE_READ_MODIFY = (1 << 2),
73435+
73436+ /* A WRITE capture request indicates that a block will be modified and that atoms
73437+ should fuse to make the commit atomic. */
73438+ TXN_CAPTURE_WRITE = (1 << 3),
73439+
73440+ /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
73441+ exclusive type designation from extra bits that may be supplied -- see
73442+ below. */
73443+ TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
73444+ TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
73445+ TXN_CAPTURE_WRITE),
73446+
73447+ /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
73448+ indicate modification will occur. */
73449+ TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
73450+
73451+ /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
73452+ prefer not to sleep waiting for an aging atom to commit. */
73453+ TXN_CAPTURE_NONBLOCKING = (1 << 4),
73454+
73455+ /* An option to reiser4_try_capture to prevent atom fusion, just simple
73456+ capturing is allowed */
73457+ TXN_CAPTURE_DONT_FUSE = (1 << 5)
73458+
73459+ /* This macro selects only the exclusive capture request types, stripping out any
73460+	   options that were supplied (e.g., NONBLOCKING). */
73461+#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
73462+} txn_capture;
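+
+/* Editorial note (not part of the original patch): a capture request is
+ * built by OR-ing exactly one exclusive type with option bits, and
+ * CAPTURE_TYPE() recovers the type again, e.g.
+ *
+ *	txn_capture req = TXN_CAPTURE_WRITE | TXN_CAPTURE_NONBLOCKING;
+ *
+ * gives CAPTURE_TYPE(req) == TXN_CAPTURE_WRITE, while the
+ * TXN_CAPTURE_NONBLOCKING option bit remains visible in req itself.
+ */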
73463+
73464+/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only
73465+ difference is in the handling of read requests. A WRITE_FUSING transaction handle
73466+   defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
73467+ transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
73468+typedef enum {
73469+ TXN_WRITE_FUSING = (1 << 0),
73470+ TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */
73471+} txn_mode;
73472+
73473+/* Every atom has a stage, which is one of these exclusive values: */
73474+typedef enum {
73475+ /* Initially an atom is free. */
73476+ ASTAGE_FREE = 0,
73477+
73478+ /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
73479+ blocks and fuse with other atoms. */
73480+ ASTAGE_CAPTURE_FUSE = 1,
73481+
73482+	/* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk, where X > 1. */
73483+
73484+ /* When an atom reaches a certain age it must do all it can to commit. An atom in
73485+ the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
73486+ atoms in the CAPTURE_FUSE stage. */
73487+ ASTAGE_CAPTURE_WAIT = 2,
73488+
73489+ /* Waiting for I/O before commit. Copy-on-capture (see
73490+ http://namesys.com/v4/v4.html). */
73491+ ASTAGE_PRE_COMMIT = 3,
73492+
73493+ /* Post-commit overwrite I/O. Steal-on-capture. */
73494+ ASTAGE_POST_COMMIT = 4,
73495+
73496+	/* Atom which waits for the last reference to it to be removed before
73497+	 * it is deleted from memory */
73498+ ASTAGE_DONE = 5,
73499+
73500+ /* invalid atom. */
73501+ ASTAGE_INVALID = 6,
73502+
73503+} txn_stage;
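+
+/* Editorial note (derived from the stage comments above and from
+ * capture_fuse_into() in txnmgr.c): taken together, the stages form the
+ * atom lifecycle, roughly
+ *
+ *	ASTAGE_FREE -> ASTAGE_CAPTURE_FUSE -> ASTAGE_CAPTURE_WAIT
+ *		-> ASTAGE_PRE_COMMIT -> ASTAGE_POST_COMMIT -> ASTAGE_DONE
+ *
+ * with ASTAGE_INVALID reached only by an atom that has been absorbed into
+ * a larger one during fusion.
+ */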
73504+
73505+/* Certain flags may be set in the txn_atom->flags field. */
73506+typedef enum {
73507+ /* Indicates that the atom should commit as soon as possible. */
73508+ ATOM_FORCE_COMMIT = (1 << 0),
73509+ /* to avoid endless loop, mark the atom (which was considered as too
73510+ * small) after failed attempt to fuse it. */
73511+ ATOM_CANCEL_FUSION = (1 << 1)
73512+} txn_flags;
73513+
73514+/* Flags for controlling commit_txnh */
73515+typedef enum {
73516+	/* Wait for atom commit completion in commit_txnh */
73517+ TXNH_WAIT_COMMIT = 0x2,
73518+ /* Don't commit atom when this handle is closed */
73519+ TXNH_DONT_COMMIT = 0x4
73520+} txn_handle_flags_t;
73521+
73522+/* TYPE DEFINITIONS */
73523+
73524+/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom
73525+ fields, so typically an operation on the atom through either of these objects must (1)
73526+ lock the object, (2) read the atom pointer, (3) lock the atom.
73527+
73528+ During atom fusion, the process holds locks on both atoms at once. Then, it iterates
73529+ through the list of handles and pages held by the smaller of the two atoms. For each
73530+ handle and page referencing the smaller atom, the fusing process must: (1) lock the
73531+ object, and (2) update the atom pointer.
73532+
73533+ You can see that there is a conflict of lock ordering here, so the more-complex
73534+ procedure should have priority, i.e., the fusing process has priority so that it is
73535+ guaranteed to make progress and to avoid restarts.
73536+
73537+   This decision, however, means additional complexity for acquiring the atom
73538+   lock in the first place.
73539+
73540+   The original procedure generally followed in the code was:
73541+
73542+ TXN_OBJECT *obj = ...;
73543+ TXN_ATOM *atom;
73544+
73545+ spin_lock (& obj->_lock);
73546+
73547+ atom = obj->_atom;
73548+
73549+ if (! spin_trylock_atom (atom))
73550+ {
73551+ spin_unlock (& obj->_lock);
73552+ RESTART OPERATION, THERE WAS A RACE;
73553+ }
73554+
73555+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
73556+
73557+   It was found, however, that this wastes a lot of CPU in a manner that is
73558+   hard to profile. So proper refcounting was added to atoms, and the new
73559+   standard locking sequence is as follows:
73560+
73561+ TXN_OBJECT *obj = ...;
73562+ TXN_ATOM *atom;
73563+
73564+ spin_lock (& obj->_lock);
73565+
73566+ atom = obj->_atom;
73567+
73568+ if (! spin_trylock_atom (atom))
73569+ {
73570+ atomic_inc (& atom->refcount);
73571+ spin_unlock (& obj->_lock);
73572+ spin_lock (&atom->_lock);
73573+ atomic_dec (& atom->refcount);
73574+ // HERE atom is locked
73575+ spin_unlock (&atom->_lock);
73576+ RESTART OPERATION, THERE WAS A RACE;
73577+ }
73578+
73579+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
73580+
73581+ (core of this is implemented in trylock_throttle() function)
73582+
73583+ See the jnode_get_atom() function for a common case.
73584+
73585+   As an additional (and important) optimization that avoids restarts, it is
73586+   possible to re-check the required pre-conditions at the HERE point in the
73587+   code above and proceed without restarting if they still hold.
73588+*/
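+
+#if 0
+/* Editorial sketch (an assumption, condensed from the jnode_get_atom()
+   pattern referred to above; see txnmgr.c for the authoritative version --
+   the function name here is invented). The pre-condition re-checked at the
+   HERE point is simply that the object still points at the atom we blocked
+   on: */
+static txn_atom *example_get_atom_locked(jnode * node)
+{
+ txn_atom *atom;
+
+ spin_lock_jnode(node);
+ for (;;) {
+  atom = node->atom;
+  if (atom == NULL || spin_trylock_atom(atom))
+   break; /* fast path: no contention */
+  /* contended: pin the atom, drop the jnode lock, and take the
+     locks in fusion's order (atom first, then jnode) */
+  atomic_inc(&atom->refcount);
+  spin_unlock_jnode(node);
+  spin_lock_atom(atom);
+  spin_lock_jnode(node);
+  if (node->atom == atom) {
+   /* HERE: the pre-condition still holds, so keep both
+      locks and proceed without a restart */
+   atomic_dec(&atom->refcount);
+   break;
+  }
+  /* raced with fusion: drop the stale atom and retry */
+  spin_unlock_jnode(node);
+  atom_dec_and_unlock(atom);
+  spin_lock_jnode(node);
+ }
+ return atom; /* the jnode, and the atom if non-NULL, remain locked */
+}
+#endif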
73589+
73590+/* An atomic transaction: this is the underlying system representation
73591+ of a transaction, not the one seen by clients.
73592+
73593+ Invariants involving this data-type:
73594+
73595+ [sb-fake-allocated]
73596+*/
73597+struct txn_atom {
73598+ /* The spinlock protecting the atom, held during fusion and various other state
73599+ changes. */
73600+ spinlock_t alock;
73601+
73602+	/* The atom's reference counter. Incrementing it (when duplicating an
73603+	   existing reference, or when we are sure that some other reference
73604+	   exists) may be done without taking the spinlock; decrementing the
73605+	   counter requires the spinlock to be held.
73606+
73607+ Each transaction handle counts in ->refcount. All jnodes count as
73608+ one reference acquired in atom_begin_andlock(), released in
73609+ commit_current_atom().
73610+ */
73611+ atomic_t refcount;
73612+
73613+ /* The atom_id identifies the atom in persistent records such as the log. */
73614+ __u32 atom_id;
73615+
73616+ /* Flags holding any of the txn_flags enumerated values (e.g.,
73617+ ATOM_FORCE_COMMIT). */
73618+ __u32 flags;
73619+
73620+ /* Number of open handles. */
73621+ __u32 txnh_count;
73622+
73623+ /* The number of znodes captured by this atom. Equal to the sum of lengths of the
73624+ dirty_nodes[level] and clean_nodes lists. */
73625+ __u32 capture_count;
73626+
73627+#if REISER4_DEBUG
73628+ int clean;
73629+ int dirty;
73630+ int ovrwr;
73631+ int wb;
73632+ int fq;
73633+#endif
73634+
73635+ __u32 flushed;
73636+
73637+ /* Current transaction stage. */
73638+ txn_stage stage;
73639+
73640+ /* Start time. */
73641+ unsigned long start_time;
73642+
73643+ /* The atom's delete set. It collects block numbers of the nodes
73644+ which were deleted during the transaction. */
73645+ struct list_head delete_set;
73646+
73647+ /* The atom's wandered_block mapping. */
73648+ struct list_head wandered_map;
73649+
73650+ /* The transaction's list of dirty captured nodes--per level. Index
73651+ by (level). dirty_nodes[0] is for znode-above-root */
73652+ struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
73653+
73654+ /* The transaction's list of clean captured nodes. */
73655+ struct list_head clean_nodes;
73656+
73657+ /* The atom's overwrite set */
73658+ struct list_head ovrwr_nodes;
73659+
73660+ /* nodes which are being written to disk */
73661+ struct list_head writeback_nodes;
73662+
73663+ /* list of inodes */
73664+ struct list_head inodes;
73665+
73666+ /* List of handles associated with this atom. */
73667+ struct list_head txnh_list;
73668+
73669+ /* Transaction list link: list of atoms in the transaction manager. */
73670+ struct list_head atom_link;
73671+
73672+ /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
73673+ struct list_head fwaitfor_list;
73674+
73675+ /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
73676+ struct list_head fwaiting_list;
73677+
73678+	/* Numbers of objects which were deleted/created in this transaction,
73679+	   and thereby the numbers of object IDs which were released/allocated. */
73680+ int nr_objects_deleted;
73681+ int nr_objects_created;
73682+ /* number of blocks allocated during the transaction */
73683+ __u64 nr_blocks_allocated;
73684+ /* All atom's flush queue objects are on this list */
73685+ struct list_head flush_queues;
73686+#if REISER4_DEBUG
73687+ /* number of flush queues for this atom. */
73688+ int nr_flush_queues;
73689+ /* Number of jnodes which were removed from atom's lists and put
73690+ on flush_queue */
73691+ int num_queued;
73692+#endif
73693+ /* number of threads who wait for this atom to complete commit */
73694+ int nr_waiters;
73695+ /* number of threads which do jnode_flush() over this atom */
73696+ int nr_flushers;
73697+ /* number of flush queues which are IN_USE and jnodes from fq->prepped
73698+ are submitted to disk by the reiser4_write_fq() routine. */
73699+ int nr_running_queues;
73700+ /* A counter of grabbed unformatted nodes, see a description of the
73701+ * reiser4 space reservation scheme at block_alloc.c */
73702+ reiser4_block_nr flush_reserved;
73703+#if REISER4_DEBUG
73704+ void *committer;
73705+#endif
73706+ struct super_block *super;
73707+};
73708+
73709+#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
73710+#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
73711+#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
73712+#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
73713+#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
73714+
73715+#define NODE_LIST(node) (node)->list
73716+#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
73717+ON_DEBUG(void
73718+ count_jnode(txn_atom *, jnode *, atom_list old_list,
73719+ atom_list new_list, int check_lists));
73720+
73721+/* A transaction handle: the client obtains and commits this handle which is assigned by
73722+ the system to a txn_atom. */
73723+struct txn_handle {
73724+ /* Spinlock protecting ->atom pointer */
73725+ spinlock_t hlock;
73726+
73727+ /* Flags for controlling commit_txnh() behavior */
73728+ /* from txn_handle_flags_t */
73729+ txn_handle_flags_t flags;
73730+
73731+ /* Whether it is READ_FUSING or WRITE_FUSING. */
73732+ txn_mode mode;
73733+
73734+ /* If assigned, the atom it is part of. */
73735+ txn_atom *atom;
73736+
73737+ /* Transaction list link. Head is in txn_atom. */
73738+ struct list_head txnh_link;
73739+};
73740+
73741+/* The transaction manager: one is contained in the reiser4_super_info_data */
73742+struct txn_mgr {
73743+ /* A spinlock protecting the atom list, id_count, flush_control */
73744+ spinlock_t tmgr_lock;
73745+
73746+ /* List of atoms. */
73747+ struct list_head atoms_list;
73748+
73749+ /* Number of atoms. */
73750+ int atom_count;
73751+
73752+ /* A counter used to assign atom->atom_id values. */
73753+ __u32 id_count;
73754+
73755+ /* a mutex object for commit serialization */
73756+ struct mutex commit_mutex;
73757+
73758+	/* a list of all txnmgrs served by a particular daemon. */
73759+ struct list_head linkage;
73760+
73761+ /* description of daemon for this txnmgr */
73762+ ktxnmgrd_context *daemon;
73763+
73764+ /* parameters. Adjustable through mount options. */
73765+ unsigned int atom_max_size;
73766+ unsigned int atom_max_age;
73767+ unsigned int atom_min_size;
73768+ /* max number of concurrent flushers for one atom, 0 - unlimited. */
73769+ unsigned int atom_max_flushers;
73770+ struct dentry *debugfs_atom_count;
73771+ struct dentry *debugfs_id_count;
73772+};
73773+
73774+/* FUNCTION DECLARATIONS */
73775+
73776+/* These are the externally (within Reiser4) visible transaction functions;
73777+   therefore they are prefixed with "txn_". For comments, see txnmgr.c. */
73778+
73779+extern int init_txnmgr_static(void);
73780+extern void done_txnmgr_static(void);
73781+
73782+extern void reiser4_init_txnmgr(txn_mgr *);
73783+extern void reiser4_done_txnmgr(txn_mgr *);
73784+
73785+extern int reiser4_txn_reserve(int reserved);
73786+
73787+extern void reiser4_txn_begin(reiser4_context * context);
73788+extern int reiser4_txn_end(reiser4_context * context);
73789+
73790+extern void reiser4_txn_restart(reiser4_context * context);
73791+extern void reiser4_txn_restart_current(void);
73792+
73793+extern int txnmgr_force_commit_all(struct super_block *, int);
73794+extern int current_atom_should_commit(void);
73795+
73796+extern jnode *find_first_dirty_jnode(txn_atom *, int);
73797+
73798+extern int commit_some_atoms(txn_mgr *);
73799+extern int force_commit_atom(txn_handle *);
73800+extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
73801+
73802+extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
73803+
73804+extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
73805+
73806+extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
73807+ int alloc_value);
73808+extern void atom_dec_and_unlock(txn_atom * atom);
73809+
73810+extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
73811+extern int try_capture_page_to_invalidate(struct page *pg);
73812+
73813+extern void reiser4_uncapture_page(struct page *pg);
73814+extern void reiser4_uncapture_block(jnode *);
73815+extern void reiser4_uncapture_jnode(jnode *);
73816+
73817+extern int reiser4_capture_inode(struct inode *);
73818+extern int reiser4_uncapture_inode(struct inode *);
73819+
73820+extern txn_atom *get_current_atom_locked_nocheck(void);
73821+
73822+#if REISER4_DEBUG
73823+
73824+/**
73825+ * atom_is_protected - make sure that nobody but us can do anything with atom
73826+ * @atom: atom to be checked
73827+ *
73828+ * This is used to assert that atom either entered commit stages or is spin
73829+ * locked.
73830+ */
73831+static inline int atom_is_protected(txn_atom *atom)
73832+{
73833+ if (atom->stage >= ASTAGE_PRE_COMMIT)
73834+ return 1;
73835+ assert_spin_locked(&(atom->alock));
73836+ return 1;
73837+}
73838+
73839+#endif
73840+
73841+/* Get the current atom and spin-lock it. The current atom must be present, so this never returns NULL */
73842+static inline txn_atom *get_current_atom_locked(void)
73843+{
73844+ txn_atom *atom;
73845+
73846+ atom = get_current_atom_locked_nocheck();
73847+ assert("zam-761", atom != NULL);
73848+
73849+ return atom;
73850+}
73851+
73852+extern txn_atom *jnode_get_atom(jnode *);
73853+
73854+extern void reiser4_atom_wait_event(txn_atom *);
73855+extern void reiser4_atom_send_event(txn_atom *);
73856+
73857+extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
73858+extern int reiser4_capture_super_block(struct super_block *s);
73859+int capture_bulk(jnode **, int count);
73860+
73861+/* See the comment on the function blocknrset.c:blocknr_set_add for the
73862+ calling convention of these three routines. */
73863+extern void blocknr_set_init(struct list_head * bset);
73864+extern void blocknr_set_destroy(struct list_head * bset);
73865+extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
73866+extern int blocknr_set_add_extent(txn_atom * atom,
73867+ struct list_head * bset,
73868+ blocknr_set_entry ** new_bsep,
73869+ const reiser4_block_nr * start,
73870+ const reiser4_block_nr * len);
73871+extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
73872+ blocknr_set_entry ** new_bsep,
73873+ const reiser4_block_nr * a,
73874+ const reiser4_block_nr * b);
73875+
73876+typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
73877+ const reiser4_block_nr *, void *);
73878+
73879+extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
73880+ blocknr_set_actor_f actor, void *data,
73881+ int delete);
73882+
73883+/* flush code takes care about how to fuse flush queues */
73884+extern void flush_init_atom(txn_atom * atom);
73885+extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
73886+
73887+static inline void spin_lock_atom(txn_atom *atom)
73888+{
73889+ /* check that spinlocks of lower priorities are not held */
73890+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
73891+ LOCK_CNT_NIL(spin_locked_atom) &&
73892+ LOCK_CNT_NIL(spin_locked_jnode) &&
73893+ LOCK_CNT_NIL(spin_locked_zlock) &&
73894+ LOCK_CNT_NIL(rw_locked_dk) &&
73895+ LOCK_CNT_NIL(rw_locked_tree)));
73896+
73897+ spin_lock(&(atom->alock));
73898+
73899+ LOCK_CNT_INC(spin_locked_atom);
73900+ LOCK_CNT_INC(spin_locked);
73901+}
73902+
73903+static inline void spin_lock_atom_nested(txn_atom *atom)
73904+{
73905+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
73906+ LOCK_CNT_NIL(spin_locked_jnode) &&
73907+ LOCK_CNT_NIL(spin_locked_zlock) &&
73908+ LOCK_CNT_NIL(rw_locked_dk) &&
73909+ LOCK_CNT_NIL(rw_locked_tree)));
73910+
73911+ spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);
73912+
73913+ LOCK_CNT_INC(spin_locked_atom);
73914+ LOCK_CNT_INC(spin_locked);
73915+}
73916+
73917+static inline int spin_trylock_atom(txn_atom *atom)
73918+{
73919+ if (spin_trylock(&(atom->alock))) {
73920+ LOCK_CNT_INC(spin_locked_atom);
73921+ LOCK_CNT_INC(spin_locked);
73922+ return 1;
73923+ }
73924+ return 0;
73925+}
73926+
73927+static inline void spin_unlock_atom(txn_atom *atom)
73928+{
73929+ assert_spin_locked(&(atom->alock));
73930+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
73931+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
73932+
73933+ LOCK_CNT_DEC(spin_locked_atom);
73934+ LOCK_CNT_DEC(spin_locked);
73935+
73936+ spin_unlock(&(atom->alock));
73937+}
73938+
73939+static inline void spin_lock_txnh(txn_handle *txnh)
73940+{
73941+ /* check that spinlocks of lower priorities are not held */
73942+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
73943+ LOCK_CNT_NIL(spin_locked_zlock) &&
73944+ LOCK_CNT_NIL(rw_locked_tree)));
73945+
73946+ spin_lock(&(txnh->hlock));
73947+
73948+ LOCK_CNT_INC(spin_locked_txnh);
73949+ LOCK_CNT_INC(spin_locked);
73950+}
73951+
73952+static inline int spin_trylock_txnh(txn_handle *txnh)
73953+{
73954+ if (spin_trylock(&(txnh->hlock))) {
73955+ LOCK_CNT_INC(spin_locked_txnh);
73956+ LOCK_CNT_INC(spin_locked);
73957+ return 1;
73958+ }
73959+ return 0;
73960+}
73961+
73962+static inline void spin_unlock_txnh(txn_handle *txnh)
73963+{
73964+ assert_spin_locked(&(txnh->hlock));
73965+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
73966+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
73967+
73968+ LOCK_CNT_DEC(spin_locked_txnh);
73969+ LOCK_CNT_DEC(spin_locked);
73970+
73971+ spin_unlock(&(txnh->hlock));
73972+}
73973+
73974+#define spin_ordering_pred_txnmgr(tmgr) \
73975+ ( LOCK_CNT_NIL(spin_locked_atom) && \
73976+ LOCK_CNT_NIL(spin_locked_txnh) && \
73977+ LOCK_CNT_NIL(spin_locked_jnode) && \
73978+ LOCK_CNT_NIL(rw_locked_zlock) && \
73979+ LOCK_CNT_NIL(rw_locked_dk) && \
73980+ LOCK_CNT_NIL(rw_locked_tree) )
73981+
73982+static inline void spin_lock_txnmgr(txn_mgr *mgr)
73983+{
73984+ /* check that spinlocks of lower priorities are not held */
73985+ assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
73986+ LOCK_CNT_NIL(spin_locked_txnh) &&
73987+ LOCK_CNT_NIL(spin_locked_jnode) &&
73988+ LOCK_CNT_NIL(spin_locked_zlock) &&
73989+ LOCK_CNT_NIL(rw_locked_dk) &&
73990+ LOCK_CNT_NIL(rw_locked_tree)));
73991+
73992+ spin_lock(&(mgr->tmgr_lock));
73993+
73994+ LOCK_CNT_INC(spin_locked_txnmgr);
73995+ LOCK_CNT_INC(spin_locked);
73996+}
73997+
73998+static inline int spin_trylock_txnmgr(txn_mgr *mgr)
73999+{
74000+ if (spin_trylock(&(mgr->tmgr_lock))) {
74001+ LOCK_CNT_INC(spin_locked_txnmgr);
74002+ LOCK_CNT_INC(spin_locked);
74003+ return 1;
74004+ }
74005+ return 0;
74006+}
74007+
74008+static inline void spin_unlock_txnmgr(txn_mgr *mgr)
74009+{
74010+ assert_spin_locked(&(mgr->tmgr_lock));
74011+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
74012+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
74013+
74014+ LOCK_CNT_DEC(spin_locked_txnmgr);
74015+ LOCK_CNT_DEC(spin_locked);
74016+
74017+ spin_unlock(&(mgr->tmgr_lock));
74018+}
74019+
74020+typedef enum {
74021+ FQ_IN_USE = 0x1
74022+} flush_queue_state_t;
74023+
74024+typedef struct flush_queue flush_queue_t;
74025+
74026+/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
74027+ is filled by the jnode_flush() routine, and written to disk under memory
74028+ pressure or at atom commit time. */
74029+/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued
74030+ field and fq->prepped list can be modified if atom is spin-locked and fq
74031+ object is "in-use" state. For read-only traversal of the fq->prepped list
74032+ and reading of the fq->nr_queued field it is enough to keep fq "in-use" or
74033+ only have atom spin-locked. */
74034+struct flush_queue {
74035+ /* linkage element is the first in this structure to make debugging
74036+ easier. See field in atom struct for description of list. */
74037+ struct list_head alink;
74038+ /* A spinlock to protect changes of fq state and fq->atom pointer */
74039+ spinlock_t guard;
74040+ /* flush_queue state: [in_use | ready] */
74041+ flush_queue_state_t state;
74042+ /* A list which contains queued nodes, queued nodes are removed from any
74043+ * atom's list and put on this ->prepped one. */
74044+ struct list_head prepped;
74045+ /* number of submitted i/o requests */
74046+ atomic_t nr_submitted;
74047+ /* number of i/o errors */
74048+ atomic_t nr_errors;
74049+ /* An atom this flush queue is attached to */
74050+ txn_atom *atom;
74051+ /* A wait queue head to wait on i/o completion */
74052+ wait_queue_head_t wait;
74053+#if REISER4_DEBUG
74054+ /* A thread which took this fq in exclusive use, NULL if fq is free,
74055+ * used for debugging. */
74056+ struct task_struct *owner;
74057+#endif
74058+};
74059+
74060+extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
74061+extern void reiser4_fq_put_nolock(flush_queue_t *);
74062+extern void reiser4_fq_put(flush_queue_t *);
74063+extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
74064+extern void queue_jnode(flush_queue_t *, jnode *);
74065+
74066+extern int reiser4_write_fq(flush_queue_t *, long *, int);
74067+extern int current_atom_finish_all_fq(void);
74068+extern void init_atom_fq_parts(txn_atom *);
74069+
74070+extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
74071+
74072+extern void znode_make_dirty(znode * node);
74073+extern void jnode_make_dirty_locked(jnode * node);
74074+
74075+extern int reiser4_sync_atom(txn_atom * atom);
74076+
74077+#if REISER4_DEBUG
74078+extern int atom_fq_parts_are_clean(txn_atom *);
74079+#endif
74080+
74081+extern void add_fq_to_bio(flush_queue_t *, struct bio *);
74082+extern flush_queue_t *get_fq_for_current_atom(void);
74083+
74084+void reiser4_invalidate_list(struct list_head * head);
74085+
74086+# endif /* __REISER4_TXNMGR_H__ */
74087+
74088+/* Make Linus happy.
74089+ Local variables:
74090+ c-indentation-style: "K&R"
74091+ mode-name: "LC"
74092+ c-basic-offset: 8
74093+ tab-width: 8
74094+ fill-column: 120
74095+ End:
74096+*/
74097diff -urN linux-2.6.24.orig/fs/reiser4/type_safe_hash.h linux-2.6.24/fs/reiser4/type_safe_hash.h
74098--- linux-2.6.24.orig/fs/reiser4/type_safe_hash.h 1970-01-01 03:00:00.000000000 +0300
74099+++ linux-2.6.24/fs/reiser4/type_safe_hash.h 2008-01-25 11:39:07.112253026 +0300
74100@@ -0,0 +1,320 @@
74101+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74102+ * reiser4/README */
74103+
74104+/* A hash table class that uses hash chains (singly-linked) and is
74105+ parametrized to provide type safety. */
74106+
74107+#ifndef __REISER4_TYPE_SAFE_HASH_H__
74108+#define __REISER4_TYPE_SAFE_HASH_H__
74109+
74110+#include "debug.h"
74111+
74112+#include <asm/errno.h>
74113+/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
74114+   based on the object type. You need to declare the item type before
74115+   this definition and define it after it. */
74116+#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \
74117+ \
74118+typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \
74119+typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \
74120+ \
74121+struct PREFIX##_hash_table_ \
74122+{ \
74123+ ITEM_TYPE **_table; \
74124+ __u32 _buckets; \
74125+}; \
74126+ \
74127+struct PREFIX##_hash_link_ \
74128+{ \
74129+ ITEM_TYPE *_next; \
74130+}
74131+
74132+/* Step 2: Define the object type of the hash: give it a field of type
74133+   PREFIX_hash_link. */
74134+
74135+/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
74136+   the type and field names from steps 1 and 2. The arguments are:
74137+
74138+ ITEM_TYPE The item type being hashed
74139+ KEY_TYPE The type of key being hashed
74140+ KEY_NAME The name of the key field within the item
74141+   LINK_NAME  The name of the link field within the item, which must be of type PREFIX_hash_link
74142+ HASH_FUNC The name of the hash function (or macro, takes const pointer to key)
74143+ EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys)
74144+
74145+ It implements these functions:
74146+
74147+ prefix_hash_init Initialize the table given its size.
74148+ prefix_hash_insert Insert an item
74149+ prefix_hash_insert_index Insert an item w/ precomputed hash_index
74150+ prefix_hash_find Find an item by key
74151+ prefix_hash_find_index Find an item w/ precomputed hash_index
74152+ prefix_hash_remove Remove an item, returns 1 if found, 0 if not found
74153+ prefix_hash_remove_index Remove an item w/ precomputed hash_index
74154+
74155+ If you'd like something to be done differently, feel free to ask me
74156+ for modifications. Additional features that could be added but
74157+ have not been:
74158+
74159+ prefix_hash_remove_key Find and remove an item by key
74160+ prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index
74161+
74162+   The hash function receives both the table pointer and the key as
74163+   arguments, so it can read the number of buckets from table->_buckets.
74164+   If this is a problem let me know.
74165+
74166+   This hash table uses a singly-linked hash chain. This means
74167+   insertion is fast but deletion requires searching the chain.
74168+
74169+ There is also the doubly-linked hash chain approach, under which
74170+ deletion requires no search but the code is longer and it takes two
74171+ pointers per item.
74172+
74173+ The circularly-linked approach has the shortest code but requires
74174+ two pointers per bucket, doubling the size of the bucket array (in
74175+ addition to two pointers per item).
74176+*/
74177+#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \
74178+ \
74179+static __inline__ void \
74180+PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \
74181+ __u32 hash UNUSED_ARG) \
74182+{ \
74183+ assert("nikita-2780", hash < table->_buckets); \
74184+} \
74185+ \
74186+static __inline__ int \
74187+PREFIX##_hash_init (PREFIX##_hash_table *hash, \
74188+ __u32 buckets) \
74189+{ \
74190+ hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \
74191+ hash->_buckets = buckets; \
74192+ if (hash->_table == NULL) \
74193+ { \
74194+ return RETERR(-ENOMEM); \
74195+ } \
74196+ memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \
74197+ ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \
74198+ return 0; \
74199+} \
74200+ \
74201+static __inline__ void \
74202+PREFIX##_hash_done (PREFIX##_hash_table *hash) \
74203+{ \
74204+ if (REISER4_DEBUG && hash->_table != NULL) { \
74205+ __u32 i; \
74206+ for (i = 0 ; i < hash->_buckets ; ++ i) \
74207+ assert("nikita-2905", hash->_table[i] == NULL); \
74208+ } \
74209+ if (hash->_table != NULL) \
74210+ KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \
74211+ hash->_table = NULL; \
74212+} \
74213+ \
74214+static __inline__ void \
74215+PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \
74216+{ \
74217+ prefetch(item->LINK_NAME._next); \
74218+} \
74219+ \
74220+static __inline__ void \
74221+PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \
74222+ __u32 index) \
74223+{ \
74224+ prefetch(hash->_table[index]); \
74225+} \
74226+ \
74227+static __inline__ ITEM_TYPE* \
74228+PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \
74229+ __u32 hash_index, \
74230+ KEY_TYPE const *find_key) \
74231+{ \
74232+ ITEM_TYPE *item; \
74233+ \
74234+ PREFIX##_check_hash(hash, hash_index); \
74235+ \
74236+ for (item = hash->_table[hash_index]; \
74237+ item != NULL; \
74238+ item = item->LINK_NAME._next) \
74239+ { \
74240+ prefetch(item->LINK_NAME._next); \
74241+ prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \
74242+ if (EQ_FUNC (& item->KEY_NAME, find_key)) \
74243+ { \
74244+ return item; \
74245+ } \
74246+ } \
74247+ \
74248+ return NULL; \
74249+} \
74250+ \
74251+static __inline__ ITEM_TYPE* \
74252+PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \
74253+ __u32 hash_index, \
74254+ KEY_TYPE const *find_key) \
74255+{ \
74256+ ITEM_TYPE ** item = &hash->_table[hash_index]; \
74257+ \
74258+ PREFIX##_check_hash(hash, hash_index); \
74259+ \
74260+ while (*item != NULL) { \
74261+ prefetch(&(*item)->LINK_NAME._next); \
74262+ if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \
74263+ ITEM_TYPE *found; \
74264+ \
74265+ found = *item; \
74266+ *item = found->LINK_NAME._next; \
74267+ found->LINK_NAME._next = hash->_table[hash_index]; \
74268+ hash->_table[hash_index] = found; \
74269+ return found; \
74270+ } \
74271+ item = &(*item)->LINK_NAME._next; \
74272+ } \
74273+ return NULL; \
74274+} \
74275+ \
74276+static __inline__ int \
74277+PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \
74278+ __u32 hash_index, \
74279+ ITEM_TYPE *del_item) \
74280+{ \
74281+ ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \
74282+ \
74283+ PREFIX##_check_hash(hash, hash_index); \
74284+ \
74285+ while (*hash_item_p != NULL) { \
74286+ prefetch(&(*hash_item_p)->LINK_NAME._next); \
74287+ if (*hash_item_p == del_item) { \
74288+ *hash_item_p = (*hash_item_p)->LINK_NAME._next; \
74289+ return 1; \
74290+ } \
74291+ hash_item_p = &(*hash_item_p)->LINK_NAME._next; \
74292+ } \
74293+ return 0; \
74294+} \
74295+ \
74296+static __inline__ void \
74297+PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \
74298+ __u32 hash_index, \
74299+ ITEM_TYPE *ins_item) \
74300+{ \
74301+ PREFIX##_check_hash(hash, hash_index); \
74302+ \
74303+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \
74304+ hash->_table[hash_index] = ins_item; \
74305+} \
74306+ \
74307+static __inline__ void \
74308+PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \
74309+ __u32 hash_index, \
74310+ ITEM_TYPE *ins_item) \
74311+{ \
74312+ PREFIX##_check_hash(hash, hash_index); \
74313+ \
74314+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \
74315+ smp_wmb(); \
74316+ hash->_table[hash_index] = ins_item; \
74317+} \
74318+ \
74319+static __inline__ ITEM_TYPE* \
74320+PREFIX##_hash_find (PREFIX##_hash_table *hash, \
74321+ KEY_TYPE const *find_key) \
74322+{ \
74323+ return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \
74324+} \
74325+ \
74326+static __inline__ ITEM_TYPE* \
74327+PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \
74328+ KEY_TYPE const *find_key) \
74329+{ \
74330+ return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \
74331+} \
74332+ \
74333+static __inline__ int \
74334+PREFIX##_hash_remove (PREFIX##_hash_table *hash, \
74335+ ITEM_TYPE *del_item) \
74336+{ \
74337+ return PREFIX##_hash_remove_index (hash, \
74338+ HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \
74339+} \
74340+ \
74341+static __inline__ int \
74342+PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \
74343+ ITEM_TYPE *del_item) \
74344+{ \
74345+ return PREFIX##_hash_remove (hash, del_item); \
74346+} \
74347+ \
74348+static __inline__ void \
74349+PREFIX##_hash_insert (PREFIX##_hash_table *hash, \
74350+ ITEM_TYPE *ins_item) \
74351+{ \
74352+ return PREFIX##_hash_insert_index (hash, \
74353+ HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \
74354+} \
74355+ \
74356+static __inline__ void \
74357+PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \
74358+ ITEM_TYPE *ins_item) \
74359+{ \
74360+ return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \
74361+ ins_item); \
74362+} \
74363+ \
74364+static __inline__ ITEM_TYPE * \
74365+PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \
74366+{ \
74367+ ITEM_TYPE *first; \
74368+ \
74369+ for (first = NULL; ind < hash->_buckets; ++ ind) { \
74370+ first = hash->_table[ind]; \
74371+ if (first != NULL) \
74372+ break; \
74373+ } \
74374+ return first; \
74375+} \
74376+ \
74377+static __inline__ ITEM_TYPE * \
74378+PREFIX##_hash_next (PREFIX##_hash_table *hash, \
74379+ ITEM_TYPE *item) \
74380+{ \
74381+ ITEM_TYPE *next; \
74382+ \
74383+ if (item == NULL) \
74384+ return NULL; \
74385+ next = item->LINK_NAME._next; \
74386+ if (next == NULL) \
74387+ next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \
74388+ return next; \
74389+} \
74390+ \
74391+typedef struct {} PREFIX##_hash_dummy
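+
+/* Editorial sketch (not part of the original patch): the three steps above
+ * for a hypothetical "foo" table keyed by __u32; the names foo,
+ * foo_key_hash and foo_key_eq are invented for illustration:
+ *
+ *	typedef struct foo foo_t;
+ *	TYPE_SAFE_HASH_DECLARE(foo, foo_t);		(step 1)
+ *
+ *	struct foo {					(step 2)
+ *		__u32 key;
+ *		foo_hash_link link;
+ *	};
+ *
+ *	#define foo_key_hash(table, k) (*(k) % (table)->_buckets)
+ *	#define foo_key_eq(k1, k2) (*(k1) == *(k2))
+ *
+ *	TYPE_SAFE_HASH_DEFINE(foo, foo_t, __u32, key, link,
+ *			      foo_key_hash, foo_key_eq);	(step 3)
+ *
+ * after which foo_hash_init(), foo_hash_insert(), foo_hash_find() and
+ * foo_hash_remove() exist with full type checking.
+ */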
74392+
74393+#define for_all_ht_buckets(table, head) \
74394+for ((head) = &(table) -> _table[ 0 ] ; \
74395+ (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
74396+
74397+#define for_all_in_bucket(bucket, item, next, field) \
74398+for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \
74399+ (item) != NULL ; \
74400+ (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
74401+
74402+#define for_all_in_htable(table, prefix, item, next) \
74403+for ((item) = prefix ## _hash_first ((table), 0), \
74404+ (next) = prefix ## _hash_next ((table), (item)) ; \
74405+ (item) != NULL ; \
74406+ (item) = (next), \
74407+ (next) = prefix ## _hash_next ((table), (item)))
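+
+/* Editorial sketch (continuing the hypothetical "foo" table above): the
+ * iterator macros precompute the next item, so whole-table traversal stays
+ * safe if the current item is removed, e.g.
+ *
+ *	foo_t *item, *next;
+ *	for_all_in_htable(&table, foo, item, next)
+ *		process(item);
+ *
+ * where "table" is a foo_hash_table object and process() is an invented
+ * callback.
+ */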
74408+
74409+/* __REISER4_TYPE_SAFE_HASH_H__ */
74410+#endif
74411+
74412+/* Make Linus happy.
74413+ Local variables:
74414+ c-indentation-style: "K&R"
74415+ mode-name: "LC"
74416+ c-basic-offset: 8
74417+ tab-width: 8
74418+ fill-column: 120
74419+ End:
74420+*/
74421diff -urN linux-2.6.24.orig/fs/reiser4/vfs_ops.c linux-2.6.24/fs/reiser4/vfs_ops.c
74422--- linux-2.6.24.orig/fs/reiser4/vfs_ops.c 1970-01-01 03:00:00.000000000 +0300
74423+++ linux-2.6.24/fs/reiser4/vfs_ops.c 2008-01-25 11:39:07.112253026 +0300
74424@@ -0,0 +1,259 @@
74425+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74426+ * reiser4/README */
74427+
74428+/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
74429+ here. */
74430+
74431+#include "forward.h"
74432+#include "debug.h"
74433+#include "dformat.h"
74434+#include "coord.h"
74435+#include "plugin/item/item.h"
74436+#include "plugin/file/file.h"
74437+#include "plugin/security/perm.h"
74438+#include "plugin/disk_format/disk_format.h"
74439+#include "plugin/plugin.h"
74440+#include "plugin/plugin_set.h"
74441+#include "plugin/object.h"
74442+#include "txnmgr.h"
74443+#include "jnode.h"
74444+#include "znode.h"
74445+#include "block_alloc.h"
74446+#include "tree.h"
74447+#include "vfs_ops.h"
74448+#include "inode.h"
74449+#include "page_cache.h"
74450+#include "ktxnmgrd.h"
74451+#include "super.h"
74452+#include "reiser4.h"
74453+#include "entd.h"
74454+#include "status_flags.h"
74455+#include "flush.h"
74456+#include "dscale.h"
74457+
74458+#include <linux/profile.h>
74459+#include <linux/types.h>
74460+#include <linux/mount.h>
74461+#include <linux/vfs.h>
74462+#include <linux/mm.h>
74463+#include <linux/buffer_head.h>
74464+#include <linux/dcache.h>
74465+#include <linux/list.h>
74466+#include <linux/pagemap.h>
74467+#include <linux/slab.h>
74468+#include <linux/seq_file.h>
74469+#include <linux/init.h>
74470+#include <linux/module.h>
74471+#include <linux/writeback.h>
74472+#include <linux/blkdev.h>
74473+#include <linux/quotaops.h>
74474+#include <linux/security.h>
74475+#include <linux/reboot.h>
74476+#include <linux/rcupdate.h>
74477+
74478+/* update inode stat-data by calling plugin */
74479+int reiser4_update_sd(struct inode *object)
74480+{
74481+ file_plugin *fplug;
74482+
74483+ assert("nikita-2338", object != NULL);
74484+ /* check for read-only file system. */
74485+ if (IS_RDONLY(object))
74486+ return 0;
74487+
74488+ fplug = inode_file_plugin(object);
74489+ assert("nikita-2339", fplug != NULL);
74490+ return fplug->write_sd_by_inode(object);
74491+}
74492+
74493+/* helper function: increase inode nlink count and call plugin method to save
74494+ updated stat-data.
74495+
74496+ Used by link/create and during creation of dot and dotdot in mkdir
74497+*/
74498+int reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
74499+		      struct inode *parent /* parent where new entry will be */ ,
74501+ int write_sd_p /* true if stat-data has to be
74502+ * updated */ )
74503+{
74504+ file_plugin *fplug;
74505+ int result;
74506+
74507+ assert("nikita-1351", object != NULL);
74508+
74509+ fplug = inode_file_plugin(object);
74510+ assert("nikita-1445", fplug != NULL);
74511+
74512+ /* ask plugin whether it can add yet another link to this
74513+ object */
74514+ if (!fplug->can_add_link(object))
74515+ return RETERR(-EMLINK);
74516+
74517+ assert("nikita-2211", fplug->add_link != NULL);
74518+ /* call plugin to do actual addition of link */
74519+ result = fplug->add_link(object, parent);
74520+
74521+ /* optionally update stat data */
74522+ if (result == 0 && write_sd_p)
74523+ result = fplug->write_sd_by_inode(object);
74524+ return result;
74525+}
74526+
74527+/* helper function: decrease inode nlink count and call plugin method to save
74528+ updated stat-data.
74529+
74530+ Used by unlink/create
74531+*/
74532+int reiser4_del_nlink(struct inode *object /* object from which link is
74533+ * removed */ ,
74534+ struct inode *parent /* parent where entry was */ ,
74535+		      int write_sd_p /* true if stat-data has to be
74536+ * updated */ )
74537+{
74538+ file_plugin *fplug;
74539+ int result;
74540+
74541+ assert("nikita-1349", object != NULL);
74542+
74543+ fplug = inode_file_plugin(object);
74544+ assert("nikita-1350", fplug != NULL);
74545+ assert("nikita-1446", object->i_nlink > 0);
74546+ assert("nikita-2210", fplug->rem_link != NULL);
74547+
74548+ /* call plugin to do actual deletion of link */
74549+ result = fplug->rem_link(object, parent);
74550+
74551+ /* optionally update stat data */
74552+ if (result == 0 && write_sd_p)
74553+ result = fplug->write_sd_by_inode(object);
74554+ return result;
74555+}
74556+
74557+/* Release reiser4 dentry. This is d_op->d_release() method. */
74558+static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
74559+{
74560+ reiser4_free_dentry_fsdata(dentry);
74561+}
74562+
74563+/*
74564+ * Called by reiser4_sync_inodes(), during speculative write-back (through
74565+ * pdflush or balance_dirty_pages()).
74566+ */
74567+void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
74568+{
74569+ long written = 0;
74570+ int repeats = 0;
74571+ int result;
74572+ struct address_space *mapping;
74573+
74574+ /*
74575+ * Performs early flushing, trying to free some memory. If there is
74576+ * nothing to flush, commits some atoms.
74577+ */
74578+
74579+ /* Commit all atoms if we are called from sys_sync() or
74580+ sys_fsync(). */
74581+ if (wbc->sync_mode != WB_SYNC_NONE) {
74582+ txnmgr_force_commit_all(sb, 0);
74583+ return;
74584+ }
74585+
74586+ BUG_ON(reiser4_get_super_fake(sb) == NULL);
74587+ mapping = reiser4_get_super_fake(sb)->i_mapping;
74588+ do {
74589+ long nr_submitted = 0;
74590+ jnode *node = NULL;
74591+
74592+ /* do not queue more requests, to avoid overloading the write queue */
74593+ if (wbc->nonblocking &&
74594+ bdi_write_congested(mapping->backing_dev_info)) {
74595+ blk_run_address_space(mapping);
74596+ wbc->encountered_congestion = 1;
74597+ break;
74598+ }
74599+ repeats++;
74600+ BUG_ON(wbc->nr_to_write <= 0);
74601+
74602+ if (get_current_context()->entd) {
74603+ entd_context *ent = get_entd_context(sb);
74604+
74605+ if (ent->cur_request->node)
74606+ /*
74607+ * this is the ent thread and it managed to capture
74608+ * the requested page itself - start flush from
74609+ * that page
74610+ */
74611+ node = jref(ent->cur_request->node);
74612+ }
74613+
74614+ result = flush_some_atom(node, &nr_submitted, wbc,
74615+ JNODE_FLUSH_WRITE_BLOCKS);
74616+ if (result != 0)
74617+ warning("nikita-31001", "Flush failed: %i", result);
74618+ if (node)
74619+ jput(node);
74620+ if (!nr_submitted)
74621+ break;
74622+
74623+ wbc->nr_to_write -= nr_submitted;
74624+ written += nr_submitted;
74625+ } while (wbc->nr_to_write > 0);
74626+}
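+
+/* Behaviour sketch (a descriptive note, not part of the original code): for
+ * WB_SYNC_NONE write-back the loop above repeatedly calls flush_some_atom(),
+ * decrementing wbc->nr_to_write by the number of submitted pages, and stops
+ * early on write-queue congestion or when an iteration submits nothing. */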
74627+
74628+void reiser4_throttle_write(struct inode *inode)
74629+{
74630+ reiser4_txn_restart_current();
74631+ balance_dirty_pages_ratelimited(inode->i_mapping);
74632+}
74633+
74634+const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
74635+const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
74636+ * beginning of device */
74637+
74638+/*
74639+ * Reiser4 initialization/shutdown.
74640+ *
74641+ * Code below performs global reiser4 initialization that is done either as
74642+ * part of kernel initialization (when reiser4 is statically built-in), or
74643+ * during reiser4 module load (when compiled as module).
74644+ */
74645+
74646+void reiser4_handle_error(void)
74647+{
74648+ struct super_block *sb = reiser4_get_current_sb();
74649+
74650+ if (!sb)
74651+ return;
74652+ reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
74653+ "Filesystem error occurred");
74654+ switch (get_super_private(sb)->onerror) {
74655+ case 0:
74656+ reiser4_panic("foobar-42", "Filesystem error occurred\n");
74657+ case 1:
74658+ default:
74659+ if (sb->s_flags & MS_RDONLY)
74660+ return;
74661+ sb->s_flags |= MS_RDONLY;
74662+ break;
74663+ }
74664+}
74665+
74666+struct dentry_operations reiser4_dentry_operations = {
74667+ .d_revalidate = NULL,
74668+ .d_hash = NULL,
74669+ .d_compare = NULL,
74670+ .d_delete = NULL,
74671+ .d_release = reiser4_d_release,
74672+ .d_iput = NULL,
74673+};
74674+
74675+/* Make Linus happy.
74676+ Local variables:
74677+ c-indentation-style: "K&R"
74678+ mode-name: "LC"
74679+ c-basic-offset: 8
74680+ tab-width: 8
74681+ fill-column: 120
74682+ End:
74683+*/
74684diff -urN linux-2.6.24.orig/fs/reiser4/vfs_ops.h linux-2.6.24/fs/reiser4/vfs_ops.h
74685--- linux-2.6.24.orig/fs/reiser4/vfs_ops.h 1970-01-01 03:00:00.000000000 +0300
74686+++ linux-2.6.24/fs/reiser4/vfs_ops.h 2008-01-25 11:39:07.112253026 +0300
74687@@ -0,0 +1,53 @@
74688+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74689+ * reiser4/README */
74690+
74691+/* vfs_ops.c's exported symbols */
74692+
74693+#if !defined( __FS_REISER4_VFS_OPS_H__ )
74694+#define __FS_REISER4_VFS_OPS_H__
74695+
74696+#include "forward.h"
74697+#include "coord.h"
74698+#include "seal.h"
74699+#include "plugin/file/file.h"
74700+#include "super.h"
74701+#include "readahead.h"
74702+
74703+#include <linux/types.h> /* for loff_t */
74704+#include <linux/fs.h> /* for struct address_space */
74705+#include <linux/dcache.h> /* for struct dentry */
74706+#include <linux/mm.h>
74707+#include <linux/backing-dev.h>
74708+
74709+/* address space operations */
74710+int reiser4_writepage(struct page *, struct writeback_control *);
74711+int reiser4_set_page_dirty(struct page *);
74712+void reiser4_invalidatepage(struct page *, unsigned long offset);
74713+int reiser4_releasepage(struct page *, gfp_t);
74714+
74715+extern int reiser4_update_sd(struct inode *);
74716+extern int reiser4_add_nlink(struct inode *, struct inode *, int);
74717+extern int reiser4_del_nlink(struct inode *, struct inode *, int);
74718+
74719+extern int reiser4_start_up_io(struct page *page);
74720+extern void reiser4_throttle_write(struct inode *);
74721+extern int jnode_is_releasable(jnode *);
74722+
74723+#define CAPTURE_APAGE_BURST (1024l)
74724+void reiser4_writeout(struct super_block *, struct writeback_control *);
74725+
74726+extern void reiser4_handle_error(void);
74727+
74728+/* __FS_REISER4_VFS_OPS_H__ */
74729+#endif
74730+
74731+/* Make Linus happy.
74732+ Local variables:
74733+ c-indentation-style: "K&R"
74734+ mode-name: "LC"
74735+ c-basic-offset: 8
74736+ tab-width: 8
74737+ fill-column: 120
74738+ scroll-step: 1
74739+ End:
74740+*/
74741diff -urN linux-2.6.24.orig/fs/reiser4/wander.c linux-2.6.24/fs/reiser4/wander.c
74742--- linux-2.6.24.orig/fs/reiser4/wander.c 1970-01-01 03:00:00.000000000 +0300
74743+++ linux-2.6.24/fs/reiser4/wander.c 2008-01-25 11:39:07.116254057 +0300
74744@@ -0,0 +1,1797 @@
74745+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74746+ * reiser4/README */
74747+
74748+/* Reiser4 Wandering Log */
74749+
74750+/* You should read http://www.namesys.com/txn-doc.html
74751+
74752+ That describes how filesystem operations are performed as atomic
74753+ transactions, and how we try to arrange it so that we can write most of the
74754+ data only once while performing the operation atomically.
74755+
74756+ For the purposes of this code, it is enough to understand that a given
74757+ block should be written either once or twice (if twice, then once to the
74758+ wandered location and once to the real location).
74759+
74760+ This code guarantees that those blocks that are defined to be part of an
74761+ atom either all take effect or none of them take effect.
74762+
74763+ Relocate set nodes are submitted to write by the jnode_flush() routine, and
74764+ the overwrite set is submitted by reiser4_write_logs(). This is because with
74765+ the overwrite set we seek to optimize writes, and with the relocate set we
74766+ seek to cause disk order to correlate with the parent first pre-order.
74767+
74768+ reiser4_write_logs() allocates and writes wandered blocks and maintains
74769+ additional on-disk structures of the atom as wander records (each wander
74770+ record occupies one block) for storing the "wandered map" (a table which
74771+ contains a relation between wandered and real block numbers) and other
74772+ information which might be needed at transaction recovery time.
74773+
74774+ The wander records are unidirectionally linked into a circle: each wander
74775+ record contains a block number of the next wander record, the last wander
74776+ record points to the first one.
74777+
74778+ One wander record (named "tx head" in this file) has a format which is
74779+ different from the other wander records. The "tx head" has a reference to the
74780+ "tx head" block of the previously committed atom. Also, "tx head" contains
74781+ fs information (the free blocks counter and the oid allocator state) which
74782+ is logged in a special way.
74783+
74784+ There are two journal control blocks, named journal header and journal
74785+ footer which have fixed on-disk locations. The journal header has a
74786+ reference to the "tx head" block of the last committed atom. The journal
74787+ footer points to the "tx head" of the last flushed atom. The atom is
74788+ "played" when all blocks from its overwrite set are written to disk the
74789+ second time (i.e. written to their real locations).
74790+
74791+ NOTE: People who know reiserfs internals and its journal structure might be
74792+ confused by the terms journal footer and journal header. The table below maps
74793+ terms of similar semantics in reiserfs (reiser3) and reiser4:
74794+
74795+ REISER3 TERM | REISER4 TERM | DESCRIPTION
74796+ --------------------+-----------------------+----------------------------
74797+ commit record | journal header | atomic write of this record
74798+ | | ends transaction commit
74799+ --------------------+-----------------------+----------------------------
74800+ journal header     | journal footer        | atomic write of this record
74801+                    |                       | ends post-commit writes.
74802+                    |                       | After it completes, the
74803+                    |                       | journal blocks (in reiser3)
74804+                    |                       | or wandered blocks/records
74805+                    |                       | (in reiser4) are free for
74806+                    |                       | re-use.
74807+ --------------------+-----------------------+----------------------------
74808+
74809+ The atom commit process is the following:
74810+
74811+ 1. The overwrite set is taken from atom's clean list, and its size is
74812+ counted.
74813+
74814+ 2. The number of necessary wander records (including tx head) is calculated,
74815+ and the wander record blocks are allocated.
74816+
74817+ 3. Allocate wandered blocks and populate wander records with the wandered map.
74818+
74819+ 4. Submit write requests for wander records and wandered blocks.
74820+
74821+ 5. Wait until the submitted write requests complete.
74822+
74823+ 6. Update journal header: change the pointer to the block number of the just
74824+ written tx head, submit an i/o for the modified journal header block and
74825+ wait for i/o completion.
74826+
74827+ NOTE: The special logging for bitmap blocks and some reiser4 super block
74828+ fields makes the processes of atom commit, flush and recovery a bit more
74829+ complex (see comments in the source code for details).
74830+
74831+ The atom playing process is the following:
74832+
74833+ 1. Write atom's overwrite set in-place.
74834+
74835+ 2. Wait on i/o.
74836+
74837+ 3. Update journal footer: change the pointer to the block number of the tx
74838+ head block of the atom we are currently flushing, submit an i/o, and wait
74839+ for i/o completion.
74840+
74841+ 4. Free disk space which was used for wandered blocks and wander records.
74842+
74843+ After wandered blocks and wander records are freed, the journal footer
74844+ points to an on-disk structure which might be overwritten soon. Neither
74845+ the log writer nor the journal recovery procedure uses that pointer for
74846+ accessing the data. When the journal recovery procedure searches for the
74847+ oldest transaction, it compares the journal footer pointer value with the
74848+ "prev_tx" pointer value in each tx head; when the values are equal, the
74849+ oldest not-yet-flushed transaction has been found.
74850+
74851+ NOTE on disk space leakage: the information about which and how many blocks
74852+ are allocated for wandered blocks and wander records is not written to
74853+ disk, because of the special logging for bitmaps and some super block
74854+ counters. After a system crash reiser4 does not remember those
74855+ allocations, so there is no disk space leakage of this kind.
74856+*/
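+
+/* A minimal picture of the layout described above (the block letters are
+ * made up): with wander records A (tx head), B and C, the next-record links
+ * form the circle A -> B -> C -> A; A's "prev_tx" points at the tx head of
+ * the previously committed atom; the journal header points at the newest tx
+ * head and the journal footer at the tx head of the last flushed atom. */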
74857+
74858+/* Special logging of reiser4 super block fields. */
74859+
74860+/* There are some reiser4 super block fields (the free block count and the OID
74861+ allocator state, i.e. the number of files and the next free OID) which are
74862+ logged separately from the super block to avoid unnecessary atom fusion.
74863+
74864+ So, the reiser4 super block need not be captured by a transaction which
74865+ allocates/deallocates disk blocks or creates/deletes file objects. Moreover,
74866+ the reiser4 on-disk super block is not touched when such a transaction is
74867+ committed and flushed. Those "counters logged specially" are logged in "tx
74868+ head" blocks and in the journal footer block.
74869+
74870+ A step-by-step description of special logging:
74871+
74872+ 0. The per-atom information about deleted or created files and allocated or
74873+ freed blocks is collected during the transaction. The atom's
74874+ ->nr_objects_created and ->nr_objects_deleted are for object
74875+ deletion/creation tracking; the numbers of allocated and freed blocks are
74876+ calculated using atom's delete set and atom's capture list -- all new and
74877+ relocated nodes should be on atom's clean list and should have JNODE_RELOC
74878+ bit set.
74879+
74880+ 1. The "logged specially" reiser4 super block fields have their "committed"
74881+ versions in the reiser4 in-memory super block. They get modified only at
74882+ atom commit time. The atom's commit thread has exclusive access to those
74883+ "committed" fields because the log writer implementation supports only one
74884+ atom commit at a time (there is a per-fs "commit" mutex). At
74885+ that time "committed" counters are modified using per-atom information
74886+ collected during the transaction. These counters are stored on disk as a
74887+ part of tx head block when atom is committed.
74888+
74889+ 2. When the atom is flushed the value of the free block counter and the OID
74890+ allocator state get written to the journal footer block. A special journal
74891+ procedure (journal_recover_sb_data()) takes those values from the journal
74892+ footer and updates the reiser4 in-memory super block.
74893+
74894+ NOTE: That means free block count and OID allocator state are logged
74895+ separately from the reiser4 super block regardless of the fact that the
74896+ reiser4 super block has fields to store both the free block counter and the
74897+ OID allocator.
74898+
74899+ Writing the whole super block at commit time requires knowing the true
74900+ values of all its fields without changes made by not yet committed
74901+ transactions. That would be possible by keeping a "committed" version of
74902+ the super block, as the reiser4 bitmap blocks keep "committed" and
74903+ "working" versions. However, another scheme was implemented: the specially
74904+ logged values are stored in the unused free space inside the transaction
74905+ head block. In my opinion it has the advantage of not writing the whole
74906+ super block when only part of it was modified. */
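+
+/* Illustrative timeline (not actual code): suppose a transaction creates two
+ * files and allocates ten blocks. At commit time the "committed" counters in
+ * the in-memory super block are adjusted by those per-atom numbers and stored
+ * in the tx head; at flush time the same values are written to the journal
+ * footer, from which journal_recover_sb_data() restores them after a crash. */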
74907+
74908+#include "debug.h"
74909+#include "dformat.h"
74910+#include "txnmgr.h"
74911+#include "jnode.h"
74912+#include "znode.h"
74913+#include "block_alloc.h"
74914+#include "page_cache.h"
74915+#include "wander.h"
74916+#include "reiser4.h"
74917+#include "super.h"
74918+#include "vfs_ops.h"
74919+#include "writeout.h"
74920+#include "inode.h"
74921+#include "entd.h"
74922+
74923+#include <linux/types.h>
74924+#include <linux/fs.h> /* for struct super_block */
74925+#include <linux/mm.h> /* for struct page */
74926+#include <linux/pagemap.h>
74927+#include <linux/bio.h> /* for struct bio */
74928+#include <linux/blkdev.h>
74929+
74930+static int write_jnodes_to_disk_extent(
74931+ jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
74932+
74933+/* The commit_handle is a container for objects needed at atom commit time */
74934+struct commit_handle {
74935+ /* A pointer to atom's list of OVRWR nodes */
74936+ struct list_head *overwrite_set;
74937+ /* atom's overwrite set size */
74938+ int overwrite_set_size;
74939+ /* jnodes for wander record blocks */
74940+ struct list_head tx_list;
74941+ /* number of wander records */
74942+ __u32 tx_size;
74943+ /* 'committed' sb counters are saved here until atom is completely
74944+ flushed */
74945+ __u64 free_blocks;
74946+ __u64 nr_files;
74947+ __u64 next_oid;
74948+ /* A pointer to the atom which is being committed */
74949+ txn_atom *atom;
74950+ /* A pointer to current super block */
74951+ struct super_block *super;
74952+ /* The counter of modified bitmaps */
74953+ reiser4_block_nr nr_bitmap;
74954+};
74955+
74956+static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
74957+{
74958+ memset(ch, 0, sizeof(struct commit_handle));
74959+ INIT_LIST_HEAD(&ch->tx_list);
74960+
74961+ ch->atom = atom;
74962+ ch->super = reiser4_get_current_sb();
74963+}
74964+
74965+static void done_commit_handle(struct commit_handle *ch)
74966+{
74967+ assert("zam-690", list_empty(&ch->tx_list));
74968+}
74969+
74970+static inline int reiser4_use_write_barrier(struct super_block * s)
74971+{
74972+ return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
74973+}
74974+
74975+static void disable_write_barrier(struct super_block * s)
74976+{
74977+ notice("zam-1055", "%s does not support write barriers,"
74978+ " using synchronous write instead.", s->s_id);
74979+ set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
74980+}
74981+
74982+/* fill journal header block data */
74983+static void format_journal_header(struct commit_handle *ch)
74984+{
74985+ struct reiser4_super_info_data *sbinfo;
74986+ struct journal_header *header;
74987+ jnode *txhead;
74988+
74989+ sbinfo = get_super_private(ch->super);
74990+ assert("zam-479", sbinfo != NULL);
74991+ assert("zam-480", sbinfo->journal_header != NULL);
74992+
74993+ txhead = list_entry(ch->tx_list.next, jnode, capture_link);
74994+
74995+ jload(sbinfo->journal_header);
74996+
74997+ header = (struct journal_header *)jdata(sbinfo->journal_header);
74998+ assert("zam-484", header != NULL);
74999+
75000+ put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
75001+ &header->last_committed_tx);
75002+
75003+ jrelse(sbinfo->journal_header);
75004+}
75005+
75006+/* fill journal footer block data */
75007+static void format_journal_footer(struct commit_handle *ch)
75008+{
75009+ struct reiser4_super_info_data *sbinfo;
75010+ struct journal_footer *footer;
75011+ jnode *tx_head;
75012+
75013+ sbinfo = get_super_private(ch->super);
75014+
75015+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
75016+
75017+ assert("zam-493", sbinfo != NULL);
75018+ assert("zam-494", sbinfo->journal_header != NULL);
75019+
75020+ check_me("zam-691", jload(sbinfo->journal_footer) == 0);
75021+
75022+ footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
75023+ assert("zam-495", footer != NULL);
75024+
75025+ put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
75026+ &footer->last_flushed_tx);
75027+ put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
75028+
75029+ put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
75030+ put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
75031+
75032+ jrelse(sbinfo->journal_footer);
75033+}
75034+
75035+/* wander record capacity depends on current block size */
75036+static int wander_record_capacity(const struct super_block *super)
75037+{
75038+ return (super->s_blocksize -
75039+ sizeof(struct wander_record_header)) /
75040+ sizeof(struct wander_entry);
75041+}
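+
+/* Worked example (the sizes here are assumptions for illustration, not the
+ * real on-disk struct sizes): with a 4096-byte block, a 32-byte
+ * wander_record_header and 16-byte wander_entry pairs, the capacity is
+ * (4096 - 32) / 16 = 254 entries per wander record. */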
75042+
75043+/* Fill the first wander record (tx head) in accordance with the supplied data */
75044+static void format_tx_head(struct commit_handle *ch)
75045+{
75046+ jnode *tx_head;
75047+ jnode *next;
75048+ struct tx_header *header;
75049+
75050+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
75051+ assert("zam-692", &ch->tx_list != &tx_head->capture_link);
75052+
75053+ next = list_entry(tx_head->capture_link.next, jnode, capture_link);
75054+ if (&ch->tx_list == &next->capture_link)
75055+ next = tx_head;
75056+
75057+ header = (struct tx_header *)jdata(tx_head);
75058+
75059+ assert("zam-460", header != NULL);
75060+ assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
75061+
75062+ memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
75063+ memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
75064+
75065+ put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
75066+ put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
75067+ &header->prev_tx);
75068+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
75069+ put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
75070+ put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
75071+ put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
75072+}
75073+
75074+/* prepare ordinary wander record block (fill all service fields) */
75075+static void
75076+format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
75077+{
75078+ struct wander_record_header *LRH;
75079+ jnode *next;
75080+
75081+ assert("zam-464", node != NULL);
75082+
75083+ LRH = (struct wander_record_header *)jdata(node);
75084+ next = list_entry(node->capture_link.next, jnode, capture_link);
75085+
75086+ if (&ch->tx_list == &next->capture_link)
75087+ next = list_entry(ch->tx_list.next, jnode, capture_link);
75088+
75089+ assert("zam-465", LRH != NULL);
75090+ assert("zam-463",
75091+ ch->super->s_blocksize > sizeof(struct wander_record_header));
75092+
75093+ memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
75094+ memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
75095+
75096+ put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
75097+ put_unaligned(cpu_to_le32(serial), &LRH->serial);
75098+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
75099+}
75100+
75101+/* add one wandered map entry to formatted wander record */
75102+static void
75103+store_entry(jnode * node, int index, const reiser4_block_nr * a,
75104+ const reiser4_block_nr * b)
75105+{
75106+ char *data;
75107+ struct wander_entry *pairs;
75108+
75109+ data = jdata(node);
75110+ assert("zam-451", data != NULL);
75111+
75112+ pairs =
75113+ (struct wander_entry *)(data + sizeof(struct wander_record_header));
75114+
75115+ put_unaligned(cpu_to_le64(*a), &pairs[index].original);
75116+ put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
75117+}
75118+
75119+/* currently, wander records contain only the wandered map, whose size depends
75120+ on the overwrite set size */
75121+static void get_tx_size(struct commit_handle *ch)
75122+{
75123+ assert("zam-440", ch->overwrite_set_size != 0);
75124+ assert("zam-695", ch->tx_size == 0);
75125+
75126+ /* count all ordinary wander records
75127+ (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
75128+ for tx head block */
75129+ ch->tx_size =
75130+ (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
75131+ 2;
75132+}
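+
+/* Worked example (reusing the assumed capacity of 254 entries from above): an
+ * overwrite set of 300 blocks yields (300 - 1) / 254 + 2 = 3 wander records:
+ * one tx head plus two ordinary records holding 254 and 46 map entries
+ * respectively. */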
75133+
75134+/* A special structure used in store_wmap_actor() for saving its state
75135+ between calls */
75136+struct store_wmap_params {
75137+ jnode *cur; /* jnode of current wander record to fill */
75138+ int idx; /* free element index in wander record */
75139+ int capacity; /* capacity */
75140+
75141+#if REISER4_DEBUG
75142+ struct list_head *tx_list;
75143+#endif
75144+};
75145+
75146+/* an actor for use in blocknr_set_iterator routine which populates the list
75147+ of pre-formatted wander records with wandered map info */
75148+static int
75149+store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
75150+ const reiser4_block_nr * b, void *data)
75151+{
75152+ struct store_wmap_params *params = data;
75153+
75154+ if (params->idx >= params->capacity) {
75155+ /* a new wander record should be taken from the tx_list */
75156+ params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
75157+ assert("zam-454",
75158+ params->tx_list != &params->cur->capture_link);
75159+
75160+ params->idx = 0;
75161+ }
75162+
75163+ store_entry(params->cur, params->idx, a, b);
75164+ params->idx++;
75165+
75166+ return 0;
75167+}
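+
+/* Spill-over example (assuming a capacity of 254 entries as above): map
+ * entries 0..253 fill the first ordinary wander record; on the 255th call
+ * params->idx reaches params->capacity, so the actor advances params->cur to
+ * the next record on the tx_list and resets idx to 0 before storing. */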
75168+
75169+/* This function is called after the Relocate set is written to disk, the
75170+ Overwrite set is written to its wandered locations and all wander records
75171+ are written as well. The updated journal header block contains a pointer
75172+ (block number) to the first wander record of the just written transaction */
75173+static int update_journal_header(struct commit_handle *ch, int use_barrier)
75174+{
75175+ struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
75176+ jnode *jh = sbinfo->journal_header;
75177+ jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
75178+ int ret;
75179+
75180+ format_journal_header(ch);
75181+
75182+ ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
75183+ use_barrier ? WRITEOUT_BARRIER : 0);
75184+ if (ret)
75185+ return ret;
75186+
75187+ // blk_run_address_space(sbinfo->fake->i_mapping);
75188+ /*blk_run_queues(); */
75189+
75190+ ret = jwait_io(jh, WRITE);
75191+
75192+ if (ret)
75193+ return ret;
75194+
75195+ sbinfo->last_committed_tx = *jnode_get_block(head);
75196+
75197+ return 0;
75198+}
75199+
75200+/* This function is called after write-back is finished. We update journal
75201+ footer block and free blocks which were occupied by wandered blocks and
75202+ transaction wander records */
75203+static int update_journal_footer(struct commit_handle *ch, int use_barrier)
75204+{
75205+ reiser4_super_info_data *sbinfo = get_super_private(ch->super);
75206+
75207+ jnode *jf = sbinfo->journal_footer;
75208+
75209+ int ret;
75210+
75211+ format_journal_footer(ch);
75212+
75213+ ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
75214+ use_barrier ? WRITEOUT_BARRIER : 0);
75215+ if (ret)
75216+ return ret;
75217+
75218+ // blk_run_address_space(sbinfo->fake->i_mapping);
75219+ /*blk_run_queue(); */
75220+
75221+ ret = jwait_io(jf, WRITE);
75222+ if (ret)
75223+ return ret;
75224+
75225+ return 0;
75226+}
75227+
75228+/* free block numbers of wander records of an already written-in-place transaction */
75229+static void dealloc_tx_list(struct commit_handle *ch)
75230+{
75231+ while (!list_empty(&ch->tx_list)) {
75232+ jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
75233+ list_del(&cur->capture_link);
75234+ ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
75235+ reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
75236+ BA_FORMATTED);
75237+
75238+ unpin_jnode_data(cur);
75239+ reiser4_drop_io_head(cur);
75240+ }
75241+}
75242+
75243+/* An actor for use in the blocknr_set_iterator() routine which frees wandered
75244+ blocks from atom's overwrite set. */
75245+static int
75246+dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
75247+ const reiser4_block_nr * a UNUSED_ARG,
75248+ const reiser4_block_nr * b, void *data UNUSED_ARG)
75249+{
75250+
75251+ assert("zam-499", b != NULL);
75252+ assert("zam-500", *b != 0);
75253+ assert("zam-501", !reiser4_blocknr_is_fake(b));
75254+
75255+ reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
75256+ return 0;
75257+}
75258+
75259+/* free wandered block locations of an already written-in-place transaction */
75260+static void dealloc_wmap(struct commit_handle *ch)
75261+{
75262+ assert("zam-696", ch->atom != NULL);
75263+
75264+ blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
75265+ dealloc_wmap_actor, NULL, 1);
75266+}
75267+
75268+/* helper function for alloc_wandered_blocks(); refills the set of block
75269+ numbers needed for wandered blocks */
75270+static int
75271+get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
75272+{
75273+ reiser4_blocknr_hint hint;
75274+ int ret;
75275+
75276+ reiser4_block_nr wide_len = count;
75277+
75278+ /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
75279+ ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
75280+ reserved allocation area so as to get the best qualities of fixed
75281+ journals? */
75282+ reiser4_blocknr_hint_init(&hint);
75283+ hint.block_stage = BLOCK_GRABBED;
75284+
75285+ ret = reiser4_alloc_blocks(&hint, start, &wide_len,
75286+ BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
75287+ *len = (int)wide_len;
75288+
75289+ return ret;
75290+}
75291+
75292+/*
75293+ * roll back changes made before issuing BIO in the case of IO error.
75294+ */
75295+static void undo_bio(struct bio *bio)
75296+{
75297+ int i;
75298+
75299+ for (i = 0; i < bio->bi_vcnt; ++i) {
75300+ struct page *pg;
75301+ jnode *node;
75302+
75303+ pg = bio->bi_io_vec[i].bv_page;
75304+ end_page_writeback(pg);
75305+ node = jprivate(pg);
75306+ spin_lock_jnode(node);
75307+ JF_CLR(node, JNODE_WRITEBACK);
75308+ JF_SET(node, JNODE_DIRTY);
75309+ spin_unlock_jnode(node);
75310+ }
75311+ bio_put(bio);
75312+}
75313+
75314+/* put overwrite set back to atom's clean list */
75315+static void put_overwrite_set(struct commit_handle *ch)
75316+{
75317+ jnode *cur;
75318+
75319+ list_for_each_entry(cur, ch->overwrite_set, capture_link)
75320+ jrelse_tail(cur);
75321+}
75322+
75323+/* Count the overwrite set size and grab disk space for wandered block
75324+ allocation. Since the atom keeps a separate list for its overwrite set, we
75325+ just scan that list and count bitmap and other non-leaf nodes whose
75326+ wandered block allocation we have to grab space for. */
75327+static int get_overwrite_set(struct commit_handle *ch)
75328+{
75329+ int ret;
75330+ jnode *cur;
75331+ __u64 nr_not_leaves = 0;
75332+#if REISER4_DEBUG
75333+ __u64 nr_formatted_leaves = 0;
75334+ __u64 nr_unformatted_leaves = 0;
75335+#endif
75336+
75337+ assert("zam-697", ch->overwrite_set_size == 0);
75338+
75339+ ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
75340+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
75341+
75342+ while (ch->overwrite_set != &cur->capture_link) {
75343+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
75344+
75345+ /* Count bitmap blocks to get correct statistics on how many
75346+ * blocks were cleared by the transaction commit. */
75347+ if (jnode_get_type(cur) == JNODE_BITMAP)
75348+ ch->nr_bitmap++;
75349+
75350+ assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
75351+ || jnode_get_type(cur) == JNODE_BITMAP);
75352+
75353+ if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
75354+ /* we replace fake znode by another (real)
75355+ znode which is suggested by disk_layout
75356+ plugin */
75357+
75358+ /* FIXME: it looks like fake znode should be
75359+ replaced by jnode supplied by
75360+ disk_layout. */
75361+
75362+ struct super_block *s = reiser4_get_current_sb();
75363+ reiser4_super_info_data *sbinfo =
75364+ get_current_super_private();
75365+
75366+ if (sbinfo->df_plug->log_super) {
75367+ jnode *sj = sbinfo->df_plug->log_super(s);
75368+
75369+ assert("zam-593", sj != NULL);
75370+
75371+ if (IS_ERR(sj))
75372+ return PTR_ERR(sj);
75373+
75374+ spin_lock_jnode(sj);
75375+ JF_SET(sj, JNODE_OVRWR);
75376+ insert_into_atom_ovrwr_list(ch->atom, sj);
75377+ spin_unlock_jnode(sj);
75378+
75379+ /* jload it as the rest of overwrite set */
75380+ jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
75381+
75382+ ch->overwrite_set_size++;
75383+ }
75384+ spin_lock_jnode(cur);
75385+ reiser4_uncapture_block(cur);
75386+ jput(cur);
75387+
75388+ } else {
75389+ int ret;
75390+ ch->overwrite_set_size++;
75391+ ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
75392+ if (ret)
75393+ reiser4_panic("zam-783",
75394+ "cannot load e-flushed jnode back (ret = %d)\n",
75395+ ret);
75396+ }
75397+
75398+ /* Count non-leaf nodes here because we have to grab disk space
75399+ * for wandered blocks. They were not counted as "flush
75400+ * reserved". Counting should be done _after_ nodes are pinned
75401+ * into memory by jload(). */
75402+ if (!jnode_is_leaf(cur))
75403+ nr_not_leaves++;
75404+ else {
75405+#if REISER4_DEBUG
75406+ /* at this point @cur either has JNODE_FLUSH_RESERVED
75407+ * or is eflushed. Locking is not strong enough to
75408+ * write an assertion checking for this. */
75409+ if (jnode_is_znode(cur))
75410+ nr_formatted_leaves++;
75411+ else
75412+ nr_unformatted_leaves++;
75413+#endif
75414+ JF_CLR(cur, JNODE_FLUSH_RESERVED);
75415+ }
75416+
75417+ cur = next;
75418+ }
75419+
75420+ /* Grab space for writing (wandered blocks) of non-leaf nodes found
75421+ * in the overwrite set. */
75422+ ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
75423+ if (ret)
75424+ return ret;
75425+
75426+ /* Disk space for allocation of wandered blocks of leaf nodes already
75427+ * reserved as "flush reserved", move it to grabbed space counter. */
75428+ spin_lock_atom(ch->atom);
75429+ assert("zam-940",
75430+ nr_formatted_leaves + nr_unformatted_leaves <=
75431+ ch->atom->flush_reserved);
75432+ flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
75433+ spin_unlock_atom(ch->atom);
75434+
75435+ return ch->overwrite_set_size;
75436+}
75437+
75438+/**
75439+ * write_jnodes_to_disk_extent - submit write request
75441+ * @first: first jnode of the list
75442+ * @nr: number of jnodes on the list
75443+ * @block_p:
75444+ * @fq:
75445+ * @flags: used to decide whether page is to get PG_reclaim flag
75446+ *
75447+ * Submits a write request for @nr jnodes beginning from @first; the other
75448+ * jnodes follow @first on the doubly linked "capture" list. All jnodes will
75449+ * be written to a disk region of @nr blocks starting at block number
75450+ * @block_p. If @fq is not NULL it means that waiting for i/o completion will be
75451+ * done more efficiently by using flush_queue_t objects.
75452+ * This function is the one which writes a list of jnodes in batch mode. It does
75453+ * all the low-level work such as bio construction and page state manipulation.
75454+ *
75455+ * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
75456+ * aggregated in this function instead of being left to the layers below
75457+ *
75458+ * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
75459+ * Why that layer needed? Why BIOs cannot be constructed here?
75460+ */
75461+static int write_jnodes_to_disk_extent(
75462+ jnode *first, int nr, const reiser4_block_nr *block_p,
75463+ flush_queue_t *fq, int flags)
75464+{
75465+ struct super_block *super = reiser4_get_current_sb();
75466+ int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
75467+ int max_blocks;
75468+ jnode *cur = first;
75469+ reiser4_block_nr block;
75470+
75471+ assert("zam-571", first != NULL);
75472+ assert("zam-572", block_p != NULL);
75473+ assert("zam-570", nr > 0);
75474+
75475+ block = *block_p;
75476+ max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
75477+
75478+ while (nr > 0) {
75479+ struct bio *bio;
75480+ int nr_blocks = min(nr, max_blocks);
75481+ int i;
75482+ int nr_used;
75483+
75484+ bio = bio_alloc(GFP_NOIO, nr_blocks);
75485+ if (!bio)
75486+ return RETERR(-ENOMEM);
75487+
75488+ bio->bi_bdev = super->s_bdev;
75489+ bio->bi_sector = block * (super->s_blocksize >> 9);
75490+ for (nr_used = 0, i = 0; i < nr_blocks; i++) {
75491+ struct page *pg;
75492+
75493+ pg = jnode_page(cur);
75494+ assert("zam-573", pg != NULL);
75495+
75496+ page_cache_get(pg);
75497+
75498+ lock_and_wait_page_writeback(pg);
75499+
75500+ if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
75501+ /*
75502+ * underlying device is saturated. Stop adding
75503+ * pages to the bio.
75504+ */
75505+ unlock_page(pg);
75506+ page_cache_release(pg);
75507+ break;
75508+ }
75509+
75510+ spin_lock_jnode(cur);
75511+ assert("nikita-3166",
75512+ pg->mapping == jnode_get_mapping(cur));
75513+ assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
75514+#if REISER4_DEBUG
75515+ spin_lock(&cur->load);
75516+ assert("nikita-3165", !jnode_is_releasable(cur));
75517+ spin_unlock(&cur->load);
75518+#endif
75519+ JF_SET(cur, JNODE_WRITEBACK);
75520+ JF_CLR(cur, JNODE_DIRTY);
75521+ ON_DEBUG(cur->written++);
75522+ spin_unlock_jnode(cur);
75523+
75524+ ClearPageError(pg);
75525+ set_page_writeback(pg);
75526+
75527+ if (get_current_context()->entd) {
75528+ /* this is the ent thread */
75529+ entd_context *ent = get_entd_context(super);
75530+ struct wbq *rq, *next;
75531+
75532+ spin_lock(&ent->guard);
75533+
75534+ if (pg == ent->cur_request->page) {
75535+ /*
75536+ * entd is called for this page. This
75537+ * request is not in the todo list
75538+ */
75539+ ent->cur_request->written = 1;
75540+ } else {
75541+ /*
75542+ * if we have written a page for which writepage
75543+ * was called, move the request to another list.
75544+ */
75545+ list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
75546+ assert("", rq->magic == WBQ_MAGIC);
75547+ if (pg == rq->page) {
75548+ /*
75549+ * remove request from
75550+ * entd's queue, but do
75551+ * not wake up a thread
75552+ * which put this
75553+ * request
75554+ */
75555+ list_del_init(&rq->link);
75556+ ent->nr_todo_reqs --;
75557+ list_add_tail(&rq->link, &ent->done_list);
75558+ ent->nr_done_reqs ++;
75559+ rq->written = 1;
75560+ break;
75561+ }
75562+ }
75563+ }
75564+ spin_unlock(&ent->guard);
75565+ }
75566+
75567+ clear_page_dirty_for_io(pg);
75568+
75569+ unlock_page(pg);
75570+
75571+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
75572+ nr_used++;
75573+ }
75574+ if (nr_used > 0) {
75575+ assert("nikita-3453",
75576+ bio->bi_size == super->s_blocksize * nr_used);
75577+ assert("nikita-3454", bio->bi_vcnt == nr_used);
75578+
75579+ /* Check if we are allowed to write at all */
75580+ if (super->s_flags & MS_RDONLY)
75581+ undo_bio(bio);
75582+ else {
75583+ int not_supported;
75584+
75585+ add_fq_to_bio(fq, bio);
75586+ bio_get(bio);
75587+ reiser4_submit_bio(write_op, bio);
75588+ not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
75589+ bio_put(bio);
75590+ if (not_supported)
75591+ return -EOPNOTSUPP;
75592+ }
75593+
75594+ block += nr_used - 1;
75595+ update_blocknr_hint_default(super, &block);
75596+ block += 1;
75597+ } else {
75598+ bio_put(bio);
75599+ }
75600+ nr -= nr_used;
75601+ }
75602+
75603+ return 0;
75604+}
75605+
75606+/* This is a procedure which recovers contiguous sequences of disk block
75607+ numbers in the given list of jnodes and submits write requests on a
75608+ per-sequence basis */
75609+int
75610+write_jnode_list(struct list_head *head, flush_queue_t *fq,
75611+ long *nr_submitted, int flags)
75612+{
75613+ int ret;
75614+ jnode *beg = list_entry(head->next, jnode, capture_link);
75615+
75616+ while (head != &beg->capture_link) {
75617+ int nr = 1;
75618+ jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
75619+
75620+ while (head != &cur->capture_link) {
75621+ if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
75622+ break;
75623+ ++nr;
75624+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
75625+ }
75626+
75627+ ret = write_jnodes_to_disk_extent(
75628+ beg, nr, jnode_get_block(beg), fq, flags);
75629+ if (ret)
75630+ return ret;
75631+
75632+ if (nr_submitted)
75633+ *nr_submitted += nr;
75634+
75635+ beg = cur;
75636+ }
75637+
75638+ return 0;
75639+}
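+
+/* Example: if the list carries jnodes with block numbers 100, 101, 102, 200
+ * and 201, the loop above detects two contiguous extents and issues two
+ * write requests: a 3-block extent starting at block 100 and a 2-block
+ * extent starting at block 200. */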
75640+
75641+/* add given wandered mapping to atom's wandered map */
75642+static int
75643+add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
75644+{
75645+ int ret;
75646+ blocknr_set_entry *new_bsep = NULL;
75647+ reiser4_block_nr block;
75648+
75649+ txn_atom *atom;
75650+
75651+ assert("zam-568", block_p != NULL);
75652+ block = *block_p;
75653+ assert("zam-569", len > 0);
75654+
75655+ while ((len--) > 0) {
75656+ do {
75657+ atom = get_current_atom_locked();
75658+ assert("zam-536",
75659+ !reiser4_blocknr_is_fake(jnode_get_block(cur)));
75660+ ret =
75661+ blocknr_set_add_pair(atom, &atom->wandered_map,
75662+ &new_bsep,
75663+ jnode_get_block(cur), &block);
75664+ } while (ret == -E_REPEAT);
75665+
75666+ if (ret) {
75667+ /* deallocate blocks which were not added to wandered
75668+ map */
75669+ reiser4_block_nr wide_len = len;
75670+
75671+ reiser4_dealloc_blocks(&block, &wide_len,
75672+ BLOCK_NOT_COUNTED,
75673+ BA_FORMATTED
75674+ /* formatted, without defer */ );
75675+
75676+ return ret;
75677+ }
75678+
75679+ spin_unlock_atom(atom);
75680+
75681+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
75682+ ++block;
75683+ }
75684+
75685+ return 0;
75686+}
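+
+/* Example (made-up block numbers): if three consecutive wandered blocks
+ * 500..502 were allocated for jnodes whose real blocks are 10, 42 and 77,
+ * the loop above records the pairs (10 -> 500), (42 -> 501) and (77 -> 502)
+ * in the atom's wandered map. */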
75687+
75688+/* Allocate wandered blocks for the current atom's OVERWRITE SET and
75689+ immediately submit IO for the allocated blocks. We assume the current atom
75690+ is in a stage where atom fusion is impossible, so it is safe to leave the
75691+ atom unlocked. */
75691+static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
75692+{
75693+ reiser4_block_nr block;
75694+
75695+ int rest;
75696+ int len;
75697+ int ret;
75698+
75699+ jnode *cur;
75700+
75701+ assert("zam-534", ch->overwrite_set_size > 0);
75702+
75703+ rest = ch->overwrite_set_size;
75704+
75705+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
75706+ while (ch->overwrite_set != &cur->capture_link) {
75707+ assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
75708+
75709+ ret = get_more_wandered_blocks(rest, &block, &len);
75710+ if (ret)
75711+ return ret;
75712+
75713+ rest -= len;
75714+
75715+ ret = add_region_to_wmap(cur, len, &block);
75716+ if (ret)
75717+ return ret;
75718+
75719+ ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
75720+ if (ret)
75721+ return ret;
75722+
75723+ while ((len--) > 0) {
75724+ assert("zam-604",
75725+ ch->overwrite_set != &cur->capture_link);
75726+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
75727+ }
75728+ }
75729+
75730+ return 0;
75731+}
75732+
75733+/* allocate the given number of nodes over the journal area, link them into
75734+ the commit handle's tx_list and format them as wander records */
75735+static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
75736+{
75737+ reiser4_blocknr_hint hint;
75738+ reiser4_block_nr allocated = 0;
75739+ reiser4_block_nr first, len;
75740+ jnode *cur;
75741+ jnode *txhead;
75742+ int ret;
75743+ reiser4_context *ctx;
75744+ reiser4_super_info_data *sbinfo;
75745+
75746+ assert("zam-698", ch->tx_size > 0);
75747+ assert("zam-699", list_empty_careful(&ch->tx_list));
75748+
75749+ ctx = get_current_context();
75750+ sbinfo = get_super_private(ctx->super);
75751+
75752+ while (allocated < (unsigned)ch->tx_size) {
75753+ len = (ch->tx_size - allocated);
75754+
75755+ reiser4_blocknr_hint_init(&hint);
75756+
75757+ hint.block_stage = BLOCK_GRABBED;
75758+
75759+ /* FIXME: there should be some block allocation policy for
75760+ nodes which contain wander records */
75761+
75762+ /* We assume that disk space for wandered record blocks can be
75763+ * taken from reserved area. */
75764+ ret = reiser4_alloc_blocks(&hint, &first, &len,
75765+ BA_FORMATTED | BA_RESERVED |
75766+ BA_USE_DEFAULT_SEARCH_START);
75767+ reiser4_blocknr_hint_done(&hint);
75768+
75769+ if (ret)
75770+ return ret;
75771+
75772+ allocated += len;
75773+
75774+ /* create jnodes for all wander records */
75775+ while (len--) {
75776+ cur = reiser4_alloc_io_head(&first);
75777+
75778+ if (cur == NULL) {
75779+ ret = RETERR(-ENOMEM);
75780+ goto free_not_assigned;
75781+ }
75782+
75783+ ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
75784+
75785+ if (ret != 0) {
75786+ jfree(cur);
75787+ goto free_not_assigned;
75788+ }
75789+
75790+ pin_jnode_data(cur);
75791+
75792+ list_add_tail(&cur->capture_link, &ch->tx_list);
75793+
75794+ first++;
75795+ }
75796+ }
75797+
75798+ { /* format an on-disk linked list of wander records */
75799+ int serial = 1;
75800+
75801+ txhead = list_entry(ch->tx_list.next, jnode, capture_link);
75802+ format_tx_head(ch);
75803+
75804+ cur = list_entry(txhead->capture_link.next, jnode, capture_link);
75805+ while (&ch->tx_list != &cur->capture_link) {
75806+ format_wander_record(ch, cur, serial++);
75807+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
75808+ }
75809+ }
75810+
75811+ { /* Fill wander records with Wandered Set */
75812+ struct store_wmap_params params;
75813+ txn_atom *atom;
75814+
75815+ params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
75816+
75817+ params.idx = 0;
75818+ params.capacity =
75819+ wander_record_capacity(reiser4_get_current_sb());
75820+
75821+ atom = get_current_atom_locked();
75822+ blocknr_set_iterator(atom, &atom->wandered_map,
75823+ &store_wmap_actor, &params, 0);
75824+ spin_unlock_atom(atom);
75825+ }
75826+
75827+ { /* relse all jnodes from tx_list */
75828+ cur = list_entry(ch->tx_list.next, jnode, capture_link);
75829+ while (&ch->tx_list != &cur->capture_link) {
75830+ jrelse(cur);
75831+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
75832+ }
75833+ }
75834+
75835+ ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
75836+
75837+ return ret;
75838+
75839+ free_not_assigned:
75840+ /* We deallocate blocks not yet assigned to jnodes on tx_list. The
75841+ caller takes care of invalidating the tx list */
75842+ reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
75843+
75844+ return ret;
75845+}
75846+
75847+static int commit_tx(struct commit_handle *ch)
75848+{
75849+ flush_queue_t *fq;
75850+ int barrier;
75851+ int ret;
75852+
75853+ /* Grab more space for wandered records. */
75854+ ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
75855+ if (ret)
75856+ return ret;
75857+
75858+ fq = get_fq_for_current_atom();
75859+ if (IS_ERR(fq))
75860+ return PTR_ERR(fq);
75861+
75862+ spin_unlock_atom(fq->atom);
75863+ do {
75864+ ret = alloc_wandered_blocks(ch, fq);
75865+ if (ret)
75866+ break;
75867+ ret = alloc_tx(ch, fq);
75868+ if (ret)
75869+ break;
75870+ } while (0);
75871+
75872+ reiser4_fq_put(fq);
75873+ if (ret)
75874+ return ret;
75875+ repeat_wo_barrier:
75876+ barrier = reiser4_use_write_barrier(ch->super);
75877+ if (!barrier) {
75878+ ret = current_atom_finish_all_fq();
75879+ if (ret)
75880+ return ret;
75881+ }
75882+ ret = update_journal_header(ch, barrier);
75883+ if (barrier) {
75884+ if (ret) {
75885+ if (ret == -EOPNOTSUPP) {
75886+ disable_write_barrier(ch->super);
75887+ goto repeat_wo_barrier;
75888+ }
75889+ return ret;
75890+ }
75891+ ret = current_atom_finish_all_fq();
75892+ }
75893+ return ret;
75894+}
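+
+/* Note on the barrier fallback above (descriptive): if the device rejects a
+ * WRITE_BARRIER request with -EOPNOTSUPP, the REISER4_NO_WRITE_BARRIER flag
+ * is set for the whole filesystem and the journal header update is retried
+ * as a plain write, preceded by an explicit wait for all flush queues
+ * (current_atom_finish_all_fq()). */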
75895+
75896+static int write_tx_back(struct commit_handle * ch)
75897+{
75898+ flush_queue_t *fq;
75899+ int ret;
75900+ int barrier;
75901+
75902+ reiser4_post_commit_hook();
75903+ fq = get_fq_for_current_atom();
75904+ if (IS_ERR(fq))
75905+ return PTR_ERR(fq);
75906+ spin_unlock_atom(fq->atom);
75907+ ret = write_jnode_list(
75908+ ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
75909+ reiser4_fq_put(fq);
75910+ if (ret)
75911+ return ret;
75912+ repeat_wo_barrier:
75913+ barrier = reiser4_use_write_barrier(ch->super);
75914+ if (!barrier) {
75915+ ret = current_atom_finish_all_fq();
75916+ if (ret)
75917+ return ret;
75918+ }
75919+ ret = update_journal_footer(ch, barrier);
75920+ if (barrier) {
75921+ if (ret) {
75922+ if (ret == -EOPNOTSUPP) {
75923+ disable_write_barrier(ch->super);
75924+ goto repeat_wo_barrier;
75925+ }
75926+ return ret;
75927+ }
75928+ ret = current_atom_finish_all_fq();
75929+ }
75930+ if (ret)
75931+ return ret;
75932+ reiser4_post_write_back_hook();
75933+ return 0;
75934+}
75935+
75936+/* We assume that at this moment all captured blocks are marked as RELOC or
75937+ WANDER (i.e. belong to the Relocate or Overwrite set), and all nodes from
75938+ the Relocate set have been submitted for writing.
75939+*/
75940+
75941+int reiser4_write_logs(long *nr_submitted)
75942+{
75943+ txn_atom *atom;
75944+ struct super_block *super = reiser4_get_current_sb();
75945+ reiser4_super_info_data *sbinfo = get_super_private(super);
75946+ struct commit_handle ch;
75947+ int ret;
75948+
75949+ writeout_mode_enable();
75950+
75951+ /* block allocator may add j-nodes to the clean_list */
75952+ ret = reiser4_pre_commit_hook();
75953+ if (ret)
75954+ return ret;
75955+
75956+ /* No locks are required if we take an atom whose stage >=
75957+ * ASTAGE_PRE_COMMIT */
75958+ atom = get_current_context()->trans->atom;
75959+ assert("zam-965", atom != NULL);
75960+
75961+ /* relocate set is on the atom->clean_nodes list after
75962+ * current_atom_complete_writes() finishes. It can be safely
75963+ * uncaptured after commit_mutex is locked, because any atom that
75964+ * captures these nodes is guaranteed to commit after current one.
75965+ *
75966+ * This can only be done after reiser4_pre_commit_hook(), because it is where
75967+ * early flushed jnodes with CREATED bit are transferred to the
75968+ * overwrite list. */
75969+ reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
75970+ spin_lock_atom(atom);
75971+ /* There might be waiters for the relocate nodes which we have
75972+ * released, wake them up. */
75973+ reiser4_atom_send_event(atom);
75974+ spin_unlock_atom(atom);
75975+
75976+ if (REISER4_DEBUG) {
75977+ int level;
75978+
75979+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
75980+ assert("nikita-3352",
75981+ list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
75982+ }
75983+
75984+ sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
75985+ sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
75986+
75987+ init_commit_handle(&ch, atom);
75988+
75989+ ch.free_blocks = sbinfo->blocks_free_committed;
75990+ ch.nr_files = sbinfo->nr_files_committed;
75991+ /* ZAM-FIXME-HANS: email me what the contention level is for the super
75992+ * lock. */
75993+ ch.next_oid = oid_next(super);
75994+
75995+ /* count overwrite set and place it in a separate list */
75996+ ret = get_overwrite_set(&ch);
75997+
75998+ if (ret <= 0) {
75999+ /* It is possible that the overwrite set is empty here; that
76000+ means all captured nodes are clean */
76001+ goto up_and_ret;
76002+ }
76003+
76004+ /* Inform the caller how many dirty pages will be submitted
76005+ * to disk. */
76006+ *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
76007+
76008+ /* count all records needed for storing the wandered set */
76009+ get_tx_size(&ch);
76010+
76011+ ret = commit_tx(&ch);
76012+ if (ret)
76013+ goto up_and_ret;
76014+
76015+ spin_lock_atom(atom);
76016+ reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
76017+ spin_unlock_atom(atom);
76018+
76019+ ret = write_tx_back(&ch);
76020+ reiser4_post_write_back_hook();
76021+
76022+ up_and_ret:
76023+ if (ret) {
76024+ /* there could be flush queues attached to the current atom;
76025+ the only way to remove them is: */
76026+ current_atom_finish_all_fq();
76027+ }
76028+
76029+ /* free blocks of flushed transaction */
76030+ dealloc_tx_list(&ch);
76031+ dealloc_wmap(&ch);
76032+
76033+ put_overwrite_set(&ch);
76034+
76035+ done_commit_handle(&ch);
76036+
76037+ writeout_mode_disable();
76038+
76039+ return ret;
76040+}
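+
+/* Commit pipeline recap (an illustrative summary of the calls above):
+ * reiser4_pre_commit_hook() -> get_overwrite_set() -> get_tx_size() ->
+ * commit_tx() (wandered blocks, wander records, journal header) ->
+ * write_tx_back() (in-place writes, journal footer) ->
+ * dealloc_tx_list() / dealloc_wmap(). */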
76041+
76042+/* consistency checks for journal data/control blocks: header, footer, log
76043+ records, transaction head blocks. All functions return zero on success. */
76044+
76045+static int check_journal_header(const jnode * node UNUSED_ARG)
76046+{
76047+ /* FIXME: journal header has no magic field yet. */
76048+ return 0;
76049+}
76050+
76051+/* wait for write completion for all jnodes from given list */
76052+static int wait_on_jnode_list(struct list_head *head)
76053+{
76054+ jnode *scan;
76055+ int ret = 0;
76056+
76057+ list_for_each_entry(scan, head, capture_link) {
76058+ struct page *pg = jnode_page(scan);
76059+
76060+ if (pg) {
76061+ if (PageWriteback(pg))
76062+ wait_on_page_writeback(pg);
76063+
76064+ if (PageError(pg))
76065+ ret++;
76066+ }
76067+ }
76068+
76069+ return ret;
76070+}
76071+
76072+static int check_journal_footer(const jnode * node UNUSED_ARG)
76073+{
76074+ /* FIXME: journal footer has no magic field yet. */
76075+ return 0;
76076+}
76077+
76078+static int check_tx_head(const jnode * node)
76079+{
76080+ struct tx_header *header = (struct tx_header *)jdata(node);
76081+
76082+ if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
76083+ warning("zam-627", "tx head at block %s corrupted\n",
76084+ sprint_address(jnode_get_block(node)));
76085+ return RETERR(-EIO);
76086+ }
76087+
76088+ return 0;
76089+}
76090+
76091+static int check_wander_record(const jnode * node)
76092+{
76093+ struct wander_record_header *RH =
76094+ (struct wander_record_header *)jdata(node);
76095+
76096+ if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
76097+ 0) {
76098+ warning("zam-628", "wander record at block %s corrupted\n",
76099+ sprint_address(jnode_get_block(node)));
76100+ return RETERR(-EIO);
76101+ }
76102+
76103+ return 0;
76104+}
76105+
76106+/* fill the commit_handle structure with everything needed for update_journal_footer */
76107+static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
76108+{
76109+ struct tx_header *TXH;
76110+ int ret;
76111+
76112+ ret = jload(tx_head);
76113+ if (ret)
76114+ return ret;
76115+
76116+ TXH = (struct tx_header *)jdata(tx_head);
76117+
76118+ ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
76119+ ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
76120+ ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
76121+
76122+ jrelse(tx_head);
76123+
76124+ list_add(&tx_head->capture_link, &ch->tx_list);
76125+
76126+ return 0;
76127+}
76128+
76129+/* replay one transaction: restore and write overwrite set in place */
76130+static int replay_transaction(const struct super_block *s,
76131+ jnode * tx_head,
76132+ const reiser4_block_nr * log_rec_block_p,
76133+ const reiser4_block_nr * end_block,
76134+ unsigned int nr_wander_records)
76135+{
76136+ reiser4_block_nr log_rec_block = *log_rec_block_p;
76137+ struct commit_handle ch;
76138+ LIST_HEAD(overwrite_set);
76139+ jnode *log;
76140+ int ret;
76141+
76142+ init_commit_handle(&ch, NULL);
76143+ ch.overwrite_set = &overwrite_set;
76144+
76145+ restore_commit_handle(&ch, tx_head);
76146+
76147+ while (log_rec_block != *end_block) {
76148+ struct wander_record_header *header;
76149+ struct wander_entry *entry;
76150+
76151+ int i;
76152+
76153+ if (nr_wander_records == 0) {
76154+ warning("zam-631",
76155+ "number of wander records in the linked list"
76156+ " is greater than the number stored in the tx head.\n");
76157+ ret = RETERR(-EIO);
76158+ goto free_ow_set;
76159+ }
76160+
76161+ log = reiser4_alloc_io_head(&log_rec_block);
76162+ if (log == NULL)
76163+ return RETERR(-ENOMEM);
76164+
76165+ ret = jload(log);
76166+ if (ret < 0) {
76167+ reiser4_drop_io_head(log);
76168+ return ret;
76169+ }
76170+
76171+ ret = check_wander_record(log);
76172+ if (ret) {
76173+ jrelse(log);
76174+ reiser4_drop_io_head(log);
76175+ return ret;
76176+ }
76177+
76178+ header = (struct wander_record_header *)jdata(log);
76179+ log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
76180+
76181+ entry = (struct wander_entry *)(header + 1);
76182+
76183+ /* restore overwrite set from wander record content */
76184+ for (i = 0; i < wander_record_capacity(s); i++) {
76185+ reiser4_block_nr block;
76186+ jnode *node;
76187+
76188+ block = le64_to_cpu(get_unaligned(&entry->wandered));
76189+ if (block == 0)
76190+ break;
76191+
76192+ node = reiser4_alloc_io_head(&block);
76193+ if (node == NULL) {
76194+ ret = RETERR(-ENOMEM);
76195+ /*
76196+ * FIXME-VS:???
76197+ */
76198+ jrelse(log);
76199+ reiser4_drop_io_head(log);
76200+ goto free_ow_set;
76201+ }
76202+
76203+ ret = jload(node);
76204+
76205+ if (ret < 0) {
76206+ reiser4_drop_io_head(node);
76207+ /*
76208+ * FIXME-VS:???
76209+ */
76210+ jrelse(log);
76211+ reiser4_drop_io_head(log);
76212+ goto free_ow_set;
76213+ }
76214+
76215+ block = le64_to_cpu(get_unaligned(&entry->original));
76216+
76217+ assert("zam-603", block != 0);
76218+
76219+ jnode_set_block(node, &block);
76220+
76221+ list_add_tail(&node->capture_link, ch.overwrite_set);
76222+
76223+ ++entry;
76224+ }
76225+
76226+ jrelse(log);
76227+ reiser4_drop_io_head(log);
76228+
76229+ --nr_wander_records;
76230+ }
76231+
76232+ if (nr_wander_records != 0) {
76233+ warning("zam-632", "number of wander records in the linked list"
76234+ " is less than the number stored in the tx head.\n");
76235+ ret = RETERR(-EIO);
76236+ goto free_ow_set;
76237+ }
76238+
76239+ { /* write wandered set in place */
76240+ write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
76241+ ret = wait_on_jnode_list(ch.overwrite_set);
76242+
76243+ if (ret) {
76244+ ret = RETERR(-EIO);
76245+ goto free_ow_set;
76246+ }
76247+ }
76248+
76249+ ret = update_journal_footer(&ch, 0);
76250+
76251+ free_ow_set:
76252+
76253+ while (!list_empty(ch.overwrite_set)) {
76254+ jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
76255+ list_del_init(&cur->capture_link);
76256+ jrelse(cur);
76257+ reiser4_drop_io_head(cur);
76258+ }
76259+
76260+ list_del_init(&tx_head->capture_link);
76261+
76262+ done_commit_handle(&ch);
76263+
76264+ return ret;
76265+}
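+
+/* Schematic view of the on-disk transaction replayed above: the tx head is
+ * followed by "total - 1" wander records linked through ->next_block, and the
+ * chain terminates once ->next_block points back at the tx head block:
+ *
+ *	[tx head] -> [wander record] -> ... -> [wander record] -> (tx head)
+ *
+ * Each record carries (original, wandered) block-number pairs; replay loads
+ * each block from its wandered location, retargets the jnode to the original
+ * location with jnode_set_block(), and the final write puts it in place. */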
76266+
76267+/* Find the oldest committed but not yet replayed transaction and replay it.
76268+ * Such a transaction was committed and the journal header block was updated,
76269+ * but writing the atom's overwrite set in place and updating the journal
76270+ * footer block did not complete. This function finishes the job: it recovers
76271+ * the atom's overwrite set from the wandered locations, writes those blocks
76272+ * in place and updates the journal footer. */
76273+static int replay_oldest_transaction(struct super_block *s)
76274+{
76275+ reiser4_super_info_data *sbinfo = get_super_private(s);
76276+ jnode *jf = sbinfo->journal_footer;
76277+ unsigned int total;
76278+ struct journal_footer *F;
76279+ struct tx_header *T;
76280+
76281+ reiser4_block_nr prev_tx;
76282+ reiser4_block_nr last_flushed_tx;
76283+ reiser4_block_nr log_rec_block = 0;
76284+
76285+ jnode *tx_head;
76286+
76287+ int ret;
76288+
76289+ if ((ret = jload(jf)) < 0)
76290+ return ret;
76291+
76292+ F = (struct journal_footer *)jdata(jf);
76293+
76294+ last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
76295+
76296+ jrelse(jf);
76297+
76298+ if (sbinfo->last_committed_tx == last_flushed_tx) {
76299+ /* all transactions are replayed */
76300+ return 0;
76301+ }
76302+
76303+ prev_tx = sbinfo->last_committed_tx;
76304+
76305+	/* search for the oldest not yet flushed transaction */
76306+ while (1) {
76307+ tx_head = reiser4_alloc_io_head(&prev_tx);
76308+ if (!tx_head)
76309+ return RETERR(-ENOMEM);
76310+
76311+ ret = jload(tx_head);
76312+ if (ret < 0) {
76313+ reiser4_drop_io_head(tx_head);
76314+ return ret;
76315+ }
76316+
76317+ ret = check_tx_head(tx_head);
76318+ if (ret) {
76319+ jrelse(tx_head);
76320+ reiser4_drop_io_head(tx_head);
76321+ return ret;
76322+ }
76323+
76324+ T = (struct tx_header *)jdata(tx_head);
76325+
76326+ prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
76327+
76328+ if (prev_tx == last_flushed_tx)
76329+ break;
76330+
76331+ jrelse(tx_head);
76332+ reiser4_drop_io_head(tx_head);
76333+ }
76334+
76335+ total = le32_to_cpu(get_unaligned(&T->total));
76336+ log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
76337+
76338+ pin_jnode_data(tx_head);
76339+ jrelse(tx_head);
76340+
76341+ ret =
76342+ replay_transaction(s, tx_head, &log_rec_block,
76343+ jnode_get_block(tx_head), total - 1);
76344+
76345+ unpin_jnode_data(tx_head);
76346+ reiser4_drop_io_head(tx_head);
76347+
76348+ if (ret)
76349+ return ret;
76350+ return -E_REPEAT;
76351+}
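+
+/* How the oldest transaction is found above, schematically: the journal
+ * header names the newest committed transaction, the ->prev_tx links run
+ * backwards in commit order, and the journal footer names the newest fully
+ * flushed one:
+ *
+ *	last_flushed <- tx_a <- tx_b <- ... <- last_committed
+ *	   (footer)      ^ oldest unflushed      (header)
+ *
+ * tx_a is replayed and -E_REPEAT is returned, so the caller loops until the
+ * footer catches up with the header. */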
76352+
76353+/* The current reiser4 journal implementation is optimized not to capture the
76354+   super block when only certain super block fields are modified. Currently,
76355+   that set is (<free block count>, <OID allocator>). These fields are logged
76356+   in a special way: they are stored in each transaction head block at atom
76357+   commit time, and that information is written to the journal footer block
76358+   at atom flush time. To get this info from the journal footer block into
76359+   the in-memory super block there is a special function,
76360+   reiser4_journal_recover_sb_data(), which should be called after the disk
76361+   format plugin re-reads the super block once journal replay is done.
76362+*/
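+
+/* A minimal sketch of that mount-time ordering (illustrative only;
+ * "reread_super" stands in for whatever re-read hook the disk format plugin
+ * provides and is an assumed name):
+ *
+ *	ret = reiser4_journal_replay(s);
+ *	if (ret == 0)
+ *		ret = reread_super(s);
+ *	if (ret == 0)
+ *		ret = reiser4_journal_recover_sb_data(s);
+ */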
76363+
76364+/* copy the information from the journal footer into the in-memory super block */
76365+int reiser4_journal_recover_sb_data(struct super_block *s)
76366+{
76367+ reiser4_super_info_data *sbinfo = get_super_private(s);
76368+ struct journal_footer *jf;
76369+ int ret;
76370+
76371+ assert("zam-673", sbinfo->journal_footer != NULL);
76372+
76373+ ret = jload(sbinfo->journal_footer);
76374+ if (ret != 0)
76375+ return ret;
76376+
76377+ ret = check_journal_footer(sbinfo->journal_footer);
76378+ if (ret != 0)
76379+ goto out;
76380+
76381+ jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
76382+
76383+ /* was there at least one flushed transaction? */
76384+ if (jf->last_flushed_tx) {
76385+
76386+ /* restore free block counter logged in this transaction */
76387+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
76388+
76389+ /* restore oid allocator state */
76390+ oid_init_allocator(s,
76391+ le64_to_cpu(get_unaligned(&jf->nr_files)),
76392+ le64_to_cpu(get_unaligned(&jf->next_oid)));
76393+ }
76394+ out:
76395+ jrelse(sbinfo->journal_footer);
76396+ return ret;
76397+}
76398+
76399+/* reiser4 replay journal procedure */
76400+int reiser4_journal_replay(struct super_block *s)
76401+{
76402+ reiser4_super_info_data *sbinfo = get_super_private(s);
76403+ jnode *jh, *jf;
76404+ struct journal_header *header;
76405+ int nr_tx_replayed = 0;
76406+ int ret;
76407+
76408+ assert("zam-582", sbinfo != NULL);
76409+
76410+ jh = sbinfo->journal_header;
76411+ jf = sbinfo->journal_footer;
76412+
76413+ if (!jh || !jf) {
76414+		/* it is possible that the disk layout does not support journal
76415+		   structures; in that case we just warn about it */
76416+		warning("zam-583",
76417+			"journal control blocks were not loaded by disk layout plugin. "
76418+			"journal replay is not possible.\n");
76419+ return 0;
76420+ }
76421+
76422+	/* Take the free block count from the journal footer block; its value
76423+	   corresponds to the state of the last flushed transaction */
76424+ ret = jload(jf);
76425+ if (ret < 0)
76426+ return ret;
76427+
76428+ ret = check_journal_footer(jf);
76429+ if (ret) {
76430+ jrelse(jf);
76431+ return ret;
76432+ }
76433+
76434+ jrelse(jf);
76435+
76436+ /* store last committed transaction info in reiser4 in-memory super
76437+ block */
76438+ ret = jload(jh);
76439+ if (ret < 0)
76440+ return ret;
76441+
76442+ ret = check_journal_header(jh);
76443+ if (ret) {
76444+ jrelse(jh);
76445+ return ret;
76446+ }
76447+
76448+ header = (struct journal_header *)jdata(jh);
76449+ sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
76450+
76451+ jrelse(jh);
76452+
76453+ /* replay committed transactions */
76454+ while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
76455+ nr_tx_replayed++;
76456+
76457+ return ret;
76458+}
76459+
76460+/* load journal control block (either journal header or journal footer block) */
76461+static int
76462+load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
76463+{
76464+ int ret;
76465+
76466+ *node = reiser4_alloc_io_head(block);
76467+ if (!(*node))
76468+ return RETERR(-ENOMEM);
76469+
76470+ ret = jload(*node);
76471+
76472+ if (ret) {
76473+ reiser4_drop_io_head(*node);
76474+ *node = NULL;
76475+ return ret;
76476+ }
76477+
76478+ pin_jnode_data(*node);
76479+ jrelse(*node);
76480+
76481+ return 0;
76482+}
76483+
76484+/* unload journal header or footer and free jnode */
76485+static void unload_journal_control_block(jnode ** node)
76486+{
76487+ if (*node) {
76488+ unpin_jnode_data(*node);
76489+ reiser4_drop_io_head(*node);
76490+ *node = NULL;
76491+ }
76492+}
76493+
76494+/* release journal control blocks */
76495+void reiser4_done_journal_info(struct super_block *s)
76496+{
76497+ reiser4_super_info_data *sbinfo = get_super_private(s);
76498+
76499+ assert("zam-476", sbinfo != NULL);
76500+
76501+ unload_journal_control_block(&sbinfo->journal_header);
76502+ unload_journal_control_block(&sbinfo->journal_footer);
76503+ rcu_barrier();
76504+}
76505+
76506+/* load journal control blocks */
76507+int reiser4_init_journal_info(struct super_block *s)
76508+{
76509+ reiser4_super_info_data *sbinfo = get_super_private(s);
76510+ journal_location *loc;
76511+ int ret;
76512+
76513+ loc = &sbinfo->jloc;
76514+
76515+ assert("zam-651", loc != NULL);
76516+ assert("zam-652", loc->header != 0);
76517+ assert("zam-653", loc->footer != 0);
76518+
76519+ ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
76520+
76521+ if (ret)
76522+ return ret;
76523+
76524+ ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
76525+
76526+ if (ret) {
76527+ unload_journal_control_block(&sbinfo->journal_header);
76528+ }
76529+
76530+ return ret;
76531+}
76532+
76533+/* Make Linus happy.
76534+ Local variables:
76535+ c-indentation-style: "K&R"
76536+ mode-name: "LC"
76537+ c-basic-offset: 8
76538+ tab-width: 8
76539+ fill-column: 80
76540+ End:
76541+*/
76542diff -urN linux-2.6.24.orig/fs/reiser4/wander.h linux-2.6.24/fs/reiser4/wander.h
76543--- linux-2.6.24.orig/fs/reiser4/wander.h 1970-01-01 03:00:00.000000000 +0300
76544+++ linux-2.6.24/fs/reiser4/wander.h 2008-01-25 11:39:07.116254057 +0300
76545@@ -0,0 +1,135 @@
76546+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
76547+
76548+#if !defined (__FS_REISER4_WANDER_H__)
76549+#define __FS_REISER4_WANDER_H__
76550+
76551+#include "dformat.h"
76552+
76553+#include <linux/fs.h> /* for struct super_block */
76554+
76555+/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */
76556+
76557+#define TX_HEADER_MAGIC "TxMagic4"
76558+#define WANDER_RECORD_MAGIC "LogMagc4"
76559+
76560+#define TX_HEADER_MAGIC_SIZE (8)
76561+#define WANDER_RECORD_MAGIC_SIZE (8)
76562+
76563+/* journal header block format */
76564+struct journal_header {
76565+ /* last written transaction head location */
76566+ d64 last_committed_tx;
76567+};
76568+
76569+typedef struct journal_location {
76570+ reiser4_block_nr footer;
76571+ reiser4_block_nr header;
76572+} journal_location;
76573+
76574+/* The wander.c head comment describes the usage and semantics of all these structures */
76575+/* journal footer block format */
76576+struct journal_footer {
76577+	/* last flushed transaction location. */
76578+	/* This block number is no longer valid after the transaction it points
76579+	   to gets flushed; it is used only at journal replay time to detect
76580+	   the end of the on-disk list of committed transactions which were
76581+	   not completely flushed */
76582+ d64 last_flushed_tx;
76583+
76584+	/* the free block counter is written to the journal footer at
76585+	   transaction flush time, not to the super block, because the free
76586+	   blocks counter is logged differently from super block fields (the
76587+	   root pointer, for example). */
76588+ d64 free_blocks;
76589+
76590+ /* number of used OIDs and maximal used OID are logged separately from
76591+ super block */
76592+ d64 nr_files;
76593+ d64 next_oid;
76594+};
76595+
76596+/* Each wander record (except the first one) has a unified format: a wander
76597+   record header followed by an array of log entries */
76598+struct wander_record_header {
76599+ /* when there is no predefined location for wander records, this magic
76600+ string should help reiser4fsck. */
76601+ char magic[WANDER_RECORD_MAGIC_SIZE];
76602+
76603+ /* transaction id */
76604+ d64 id;
76605+
76606+ /* total number of wander records in current transaction */
76607+ d32 total;
76608+
76609+	/* serial number of this block within the transaction */
76610+ d32 serial;
76611+
76612+	/* block number of the next wander record in the transaction's list */
76613+ d64 next_block;
76614+};
76615+
76616+/* The first wander record (the transaction head) of a written transaction
76617+   has a special format */
76618+struct tx_header {
76619+ /* magic string makes first block in transaction different from other
76620+ logged blocks, it should help fsck. */
76621+ char magic[TX_HEADER_MAGIC_SIZE];
76622+
76623+ /* transaction id */
76624+ d64 id;
76625+
76626+ /* total number of records (including this first tx head) in the
76627+ transaction */
76628+ d32 total;
76629+
76630+	/* align next field to an 8-byte boundary; this field is always zero */
76631+ d32 padding;
76632+
76633+ /* block number of previous transaction head */
76634+ d64 prev_tx;
76635+
76636+ /* next wander record location */
76637+ d64 next_block;
76638+
76639+	/* committed version of the free blocks counter */
76640+ d64 free_blocks;
76641+
76642+ /* number of used OIDs (nr_files) and maximal used OID are logged
76643+ separately from super block */
76644+ d64 nr_files;
76645+ d64 next_oid;
76646+};
76647+
76648+/* A transaction gets written to disk as a set of wander records (each wander
76649+   record is one fs block in size) */
76650+
76651+/* As noted above, the rest of a wander record is filled with these log
76652+   entries; unused space is filled with zeroes */
76653+struct wander_entry {
76654+ d64 original; /* block original location */
76655+ d64 wandered; /* block wandered location */
76656+};
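+
+/* For illustration only: the per-record entry capacity used during replay
+ * is, in essence, derived from the fs block size (the authoritative version
+ * is wander_record_capacity() in wander.c):
+ *
+ *	capacity = (block_size - sizeof(struct wander_record_header))
+ *		   / sizeof(struct wander_entry);
+ */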
76657+
76658+/* REISER4 JOURNAL WRITER FUNCTIONS */
76659+
76660+extern int reiser4_write_logs(long *);
76661+extern int reiser4_journal_replay(struct super_block *);
76662+extern int reiser4_journal_recover_sb_data(struct super_block *);
76663+
76664+extern int reiser4_init_journal_info(struct super_block *);
76665+extern void reiser4_done_journal_info(struct super_block *);
76666+
76667+extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
76668+
76669+#endif /* __FS_REISER4_WANDER_H__ */
76670+
76671+/* Make Linus happy.
76672+ Local variables:
76673+ c-indentation-style: "K&R"
76674+ mode-name: "LC"
76675+ c-basic-offset: 8
76676+ tab-width: 8
76677+ fill-column: 80
76678+ scroll-step: 1
76679+ End:
76680+*/
76681diff -urN linux-2.6.24.orig/fs/reiser4/writeout.h linux-2.6.24/fs/reiser4/writeout.h
76682--- linux-2.6.24.orig/fs/reiser4/writeout.h 1970-01-01 03:00:00.000000000 +0300
76683+++ linux-2.6.24/fs/reiser4/writeout.h 2008-01-25 11:39:07.120255087 +0300
76684@@ -0,0 +1,21 @@
76685+/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
76686+
76687+#if !defined (__FS_REISER4_WRITEOUT_H__)
76688+#define __FS_REISER4_WRITEOUT_H__
76689+#define WRITEOUT_SINGLE_STREAM (0x1)
76690+#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
76691+#define WRITEOUT_BARRIER (0x4)
76692+
76693+extern int reiser4_get_writeout_flags(void);
76694+
76695+#endif /* __FS_REISER4_WRITEOUT_H__ */
76696+
76697+/* Make Linus happy.
76698+ Local variables:
76699+ c-indentation-style: "K&R"
76700+ mode-name: "LC"
76701+ c-basic-offset: 8
76702+ tab-width: 8
76703+ fill-column: 80
76704+ End:
76705+*/
76706diff -urN linux-2.6.24.orig/fs/reiser4/znode.c linux-2.6.24/fs/reiser4/znode.c
76707--- linux-2.6.24.orig/fs/reiser4/znode.c 1970-01-01 03:00:00.000000000 +0300
76708+++ linux-2.6.24/fs/reiser4/znode.c 2008-01-25 11:39:07.120255087 +0300
76709@@ -0,0 +1,1029 @@
76710+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76711+ * reiser4/README */
76712+/* Znode manipulation functions. */
76713+/* Znode is the in-memory header for a tree node. It is stored
76714+ separately from the node itself so that it does not get written to
76715+ disk. In this respect znode is like buffer head or page head. We
76716+ also use znodes for additional reiser4 specific purposes:
76717+
76718+ . they are organized into tree structure which is a part of whole
76719+ reiser4 tree.
76720+ . they are used to implement node grained locking
76721+ . they are used to keep additional state associated with a
76722+ node
76723+ . they contain links to lists used by the transaction manager
76724+
76725+ Znode is attached to some variable "block number" which is instance of
76726+ fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
76727+ appropriate node being actually loaded in memory. Existence of znode itself
76728+   is regulated by its reference count (->x_count). Each time a thread
76729+   acquires a reference to a znode through a call to zget(), ->x_count is
76730+   incremented; it is decremented by a call to zput(). Data (the content of
76731+   the node) are brought into memory by a call to zload(), which also
76732+   increments the ->d_count reference counter; zload() can block on IO. zrelse()
76733+ decreases this counter. Also, ->c_count keeps track of number of child
76734+ znodes and prevents parent znode from being recycled until all of its
76735+ children are. ->c_count is decremented whenever child goes out of existence
76736+ (being actually recycled in zdestroy()) which can be some time after last
76737+ reference to this child dies if we support some form of LRU cache for
76738+ znodes.
76739+
76740+*/
76741+/* EVERY ZNODE'S STORY
76742+
76743+ 1. His infancy.
76744+
76745+ Once upon a time, the znode was born deep inside of zget() by call to
76746+ zalloc(). At the return from zget() znode had:
76747+
76748+ . reference counter (x_count) of 1
76749+ . assigned block number, marked as used in bitmap
76750+ . pointer to parent znode. Root znode parent pointer points
76751+ to its father: "fake" znode. This, in turn, has NULL parent pointer.
76752+ . hash table linkage
76753+ . no data loaded from disk
76754+ . no node plugin
76755+ . no sibling linkage
76756+
76757+ 2. His childhood
76758+
76759+ Each node is either brought into memory as a result of tree traversal, or
76760+ created afresh, creation of the root being a special case of the latter. In
76761+ either case it's inserted into sibling list. This will typically require
76762+ some ancillary tree traversing, but ultimately both sibling pointers will
76763+ exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
76764+ zjnode.state.
76765+
76766+ 3. His youth.
76767+
76768+   If the znode is bound to an already existing node in the tree, its content
76769+   is read from disk by a call to zload(). At that moment the JNODE_LOADED
76770+   bit is set in zjnode.state and the zdata() function starts to return
76771+   non-NULL for this znode. zload() further calls zparse(), which determines
76772+   which node layout this node is rendered in, and sets ->nplug on success.
76773+
76774+   If the znode is for a newly created node, memory for it is allocated and
76775+   zinit_new() is called to initialise the data according to the selected
76776+   node layout.
76777+
76778+ 4. His maturity.
76779+
76780+ After this point, znode lingers in memory for some time. Threads can
76781+ acquire references to znode either by blocknr through call to zget(), or by
76782+ following a pointer to unallocated znode from internal item. Each time
76783+ reference to znode is obtained, x_count is increased. Thread can read/write
76784+ lock znode. Znode data can be loaded through calls to zload(), d_count will
76785+ be increased appropriately. If all references to znode are released
76786+ (x_count drops to 0), znode is not recycled immediately. Rather, it is
76787+ still cached in the hash table in the hope that it will be accessed
76788+ shortly.
76789+
76790+ There are two ways in which znode existence can be terminated:
76791+
76792+ . sudden death: node bound to this znode is removed from the tree
76793+ . overpopulation: znode is purged out of memory due to memory pressure
76794+
76795+ 5. His death.
76796+
76797+   Death is a complex process.
76798+
76799+ When we irrevocably commit ourselves to decision to remove node from the
76800+ tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
76801+ znode. This is done either in ->kill_hook() of internal item or in
76802+ reiser4_kill_root() function when tree root is removed.
76803+
76804+ At this moment znode still has:
76805+
76806+   . locks held on it, necessarily including write ones
76807+ . references to it
76808+ . disk block assigned to it
76809+ . data loaded from the disk
76810+ . pending requests for lock
76811+
76812+   But once the JNODE_HEARD_BANSHEE bit is set, the last call to
76813+   unlock_znode() performs node deletion, which includes two phases. First,
76814+   all ways to get references to that znode (sibling and parent links and
76815+   hash lookup using the block number stored in the parent node) are deleted
76816+   -- this is done through sibling_list_remove(); we also assume that nobody
76817+   uses the down link from the parent node, due to its nonexistence or to
76818+   proper parent node locking, and that nobody uses parent pointers from
76819+   children, because they are absent. Second, we invalidate all pending lock
76820+   requests still on the znode's lock request queue; this is done by
76821+   reiser4_invalidate_lock(). Another znode status bit, JNODE_IS_DYING, is
76822+   used to invalidate pending lock requests. Once it is set, all requesters
76823+   are forced to return -EINVAL from longterm_lock_znode(). Future locking
76824+   attempts are not possible because all ways to reference that znode are
76825+   already gone. Finally, the node is uncaptured from the transaction.
76826+
76827+   When the last reference to the dying znode is about to be released, the
76828+   block number of this znode is released and the znode is removed from the
76829+   hash table.
76830+
76831+ Now znode can be recycled.
76832+
76833+ [it's possible to free bitmap block and remove znode from the hash
76834+ table when last lock is released. This will result in having
76835+ referenced but completely orphaned znode]
76836+
76837+ 6. Limbo
76838+
76839+   As has been mentioned above, znodes with reference counter 0 are
76840+ still cached in a hash table. Once memory pressure increases they are
76841+ purged out of there [this requires something like LRU list for
76842+ efficient implementation. LRU list would also greatly simplify
76843+ implementation of coord cache that would in this case morph to just
76844+ scanning some initial segment of LRU list]. Data loaded into
76845+ unreferenced znode are flushed back to the durable storage if
76846+ necessary and memory is freed. Znodes themselves can be recycled at
76847+ this point too.
76848+
76849+*/
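+
+/* The life cycle above, as a usage sketch (illustrative only, error handling
+ * elided; tree, blocknr, parent and level are assumed valid in the caller):
+ *
+ *	znode *node = zget(tree, &blocknr, parent, level, GFP_KERNEL);
+ *	if (!IS_ERR(node)) {
+ *		if (zload(node) == 0) {		(->d_count++, may do IO)
+ *			... work with zdata(node) ...
+ *			zrelse(node);		(->d_count--)
+ *		}
+ *		zput(node);			(->x_count--)
+ *	}
+ */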
76850+
76851+#include "debug.h"
76852+#include "dformat.h"
76853+#include "key.h"
76854+#include "coord.h"
76855+#include "plugin/plugin_header.h"
76856+#include "plugin/node/node.h"
76857+#include "plugin/plugin.h"
76858+#include "txnmgr.h"
76859+#include "jnode.h"
76860+#include "znode.h"
76861+#include "block_alloc.h"
76862+#include "tree.h"
76863+#include "tree_walk.h"
76864+#include "super.h"
76865+#include "reiser4.h"
76866+
76867+#include <linux/pagemap.h>
76868+#include <linux/spinlock.h>
76869+#include <linux/slab.h>
76870+#include <linux/err.h>
76871+
76872+static z_hash_table *get_htable(reiser4_tree *,
76873+ const reiser4_block_nr * const blocknr);
76874+static z_hash_table *znode_get_htable(const znode *);
76875+static void zdrop(znode *);
76876+
76877+/* hash table support */
76878+
76879+/* compare two block numbers for equality. Used by hash-table macros */
76880+static inline int
76881+blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
76882+{
76883+ assert("nikita-534", b1 != NULL);
76884+ assert("nikita-535", b2 != NULL);
76885+
76886+ return *b1 == *b2;
76887+}
76888+
76889+/* Hash znode by block number. Used by hash-table macros */
76890+/* Audited by: umka (2002.06.11) */
76891+static inline __u32
76892+blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
76893+{
76894+ assert("nikita-536", b != NULL);
76895+
76896+ return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
76897+}
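+
+/* Note: blknrhashfn() masks with (table size - 1), which covers all buckets
+ * only when REISER4_ZNODE_HASH_TABLE_SIZE is a power of two. */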
76898+
76899+/* The hash table definition */
76900+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
76901+#define KFREE(ptr, size) kfree(ptr)
76902+TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
76903+ blknrhashfn, blknreq);
76904+#undef KFREE
76905+#undef KMALLOC
76906+
76907+/* slab for znodes */
76908+static struct kmem_cache *znode_cache;
76909+
76910+int znode_shift_order;
76911+
76912+/**
76913+ * init_znodes - create znode cache
76914+ *
76915+ * Initializes slab cache of znodes. It is part of reiser4 module initialization.
76916+ */
76917+int init_znodes(void)
76918+{
76919+ znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
76920+ SLAB_HWCACHE_ALIGN |
76921+ SLAB_RECLAIM_ACCOUNT, NULL);
76922+ if (znode_cache == NULL)
76923+ return RETERR(-ENOMEM);
76924+
76925+ for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
76926+ ++znode_shift_order);
76927+ --znode_shift_order;
76928+ return 0;
76929+}
76930+
76931+/**
76932+ * done_znodes - delete znode cache
76933+ *
76934+ * This is called on reiser4 module unloading or system shutdown.
76935+ */
76936+void done_znodes(void)
76937+{
76938+ destroy_reiser4_cache(&znode_cache);
76939+}
76940+
76941+/* call this to initialise tree of znodes */
76942+int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
76943+{
76944+ int result;
76945+ assert("umka-050", tree != NULL);
76946+
76947+ rwlock_init(&tree->dk_lock);
76948+
76949+ result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
76950+ if (result != 0)
76951+ return result;
76952+ result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
76953+ return result;
76954+}
76955+
76956+/* free this znode */
76957+void zfree(znode * node /* znode to free */ )
76958+{
76959+ assert("nikita-465", node != NULL);
76960+ assert("nikita-2120", znode_page(node) == NULL);
76961+ assert("nikita-2301", list_empty_careful(&node->lock.owners));
76962+ assert("nikita-2302", list_empty_careful(&node->lock.requestors));
76963+ assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
76964+ NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
76965+ assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
76966+ assert("nikita-3293", !znode_is_right_connected(node));
76967+ assert("nikita-3294", !znode_is_left_connected(node));
76968+ assert("nikita-3295", node->left == NULL);
76969+ assert("nikita-3296", node->right == NULL);
76970+
76971+ /* not yet phash_jnode_destroy(ZJNODE(node)); */
76972+
76973+ kmem_cache_free(znode_cache, node);
76974+}
76975+
76976+/* call this to free tree of znodes */
76977+void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
76978+{
76979+ znode *node;
76980+ znode *next;
76981+ z_hash_table *ztable;
76982+
76983+ /* scan znode hash-tables and kill all znodes, then free hash tables
76984+ * themselves. */
76985+
76986+ assert("nikita-795", tree != NULL);
76987+
76988+ ztable = &tree->zhash_table;
76989+
76990+ if (ztable->_table != NULL) {
76991+ for_all_in_htable(ztable, z, node, next) {
76992+ node->c_count = 0;
76993+ node->in_parent.node = NULL;
76994+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
76995+ zdrop(node);
76996+ }
76997+
76998+ z_hash_done(&tree->zhash_table);
76999+ }
77000+
77001+ ztable = &tree->zfake_table;
77002+
77003+ if (ztable->_table != NULL) {
77004+ for_all_in_htable(ztable, z, node, next) {
77005+ node->c_count = 0;
77006+ node->in_parent.node = NULL;
77007+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
77008+ zdrop(node);
77009+ }
77010+
77011+ z_hash_done(&tree->zfake_table);
77012+ }
77013+}
77014+
77015+/* ZNODE STRUCTURES */
77016+
77017+/* allocate fresh znode */
77018+znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
77019+{
77020+ znode *node;
77021+
77022+ node = kmem_cache_alloc(znode_cache, gfp_flag);
77023+ return node;
77024+}
77025+
77026+/* Initialize fields of znode
77027+ @node: znode to initialize;
77028+ @parent: parent znode;
77029+ @tree: tree we are in. */
77030+void zinit(znode * node, const znode * parent, reiser4_tree * tree)
77031+{
77032+ assert("nikita-466", node != NULL);
77033+ assert("umka-268", current_tree != NULL);
77034+
77035+ memset(node, 0, sizeof *node);
77036+
77037+ assert("umka-051", tree != NULL);
77038+
77039+ jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
77040+ reiser4_init_lock(&node->lock);
77041+ init_parent_coord(&node->in_parent, parent);
77042+}
77043+
77044+/*
77045+ * remove znode from indices. This is called by jput() when the last
77046+ * reference to a znode is released.
77047+ */
77048+void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
77049+{
77050+ assert("nikita-2108", node != NULL);
77051+ assert("nikita-470", node->c_count == 0);
77052+ assert_rw_write_locked(&(tree->tree_lock));
77053+
77054+ /* remove reference to this znode from cbk cache */
77055+ cbk_cache_invalidate(node, tree);
77056+
77057+ /* update c_count of parent */
77058+ if (znode_parent(node) != NULL) {
77059+ assert("nikita-472", znode_parent(node)->c_count > 0);
77060+ /* father, onto your hands I forward my spirit... */
77061+ znode_parent(node)->c_count--;
77062+ node->in_parent.node = NULL;
77063+ } else {
77064+ /* orphaned znode?! Root? */
77065+ }
77066+
77067+ /* remove znode from hash-table */
77068+ z_hash_remove_rcu(znode_get_htable(node), node);
77069+}
77070+
77071+/* zdrop() -- Remove znode from the tree.
77072+
77073+   This is called when the znode is removed from memory. */
77074+static void zdrop(znode * node /* znode to finish with */ )
77075+{
77076+ jdrop(ZJNODE(node));
77077+}
77078+
77079+/*
77080+ * put znode into right place in the hash table. This is called by relocate
77081+ * code.
77082+ */
77083+int znode_rehash(znode * node /* node to rehash */ ,
77084+ const reiser4_block_nr * new_block_nr /* new block number */ )
77085+{
77086+ z_hash_table *oldtable;
77087+ z_hash_table *newtable;
77088+ reiser4_tree *tree;
77089+
77090+ assert("nikita-2018", node != NULL);
77091+
77092+ tree = znode_get_tree(node);
77093+ oldtable = znode_get_htable(node);
77094+ newtable = get_htable(tree, new_block_nr);
77095+
77096+ write_lock_tree(tree);
77097+ /* remove znode from hash-table */
77098+ z_hash_remove_rcu(oldtable, node);
77099+
77100+ /* assertion no longer valid due to RCU */
77101+ /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
77102+
77103+ /* update blocknr */
77104+ znode_set_block(node, new_block_nr);
77105+ node->zjnode.key.z = *new_block_nr;
77106+
77107+ /* insert it into hash */
77108+ z_hash_insert_rcu(newtable, node);
77109+ write_unlock_tree(tree);
77110+ return 0;
77111+}
77112+
77113+/* ZNODE LOOKUP, GET, PUT */
77114+
77115+/* zlook() - get znode with given block_nr in a hash table or return NULL
77116+
77117+ If result is non-NULL then the znode's x_count is incremented. Internal version
77118+   accepts a pre-computed hash index. The hash table is accessed under
77119+   rcu_read_lock().
77120+*/
77121+znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
77122+{
77123+ znode *result;
77124+ __u32 hash;
77125+ z_hash_table *htable;
77126+
77127+ assert("jmacd-506", tree != NULL);
77128+ assert("jmacd-507", blocknr != NULL);
77129+
77130+ htable = get_htable(tree, blocknr);
77131+ hash = blknrhashfn(htable, blocknr);
77132+
77133+ rcu_read_lock();
77134+ result = z_hash_find_index(htable, hash, blocknr);
77135+
77136+ if (result != NULL) {
77137+ add_x_ref(ZJNODE(result));
77138+ result = znode_rip_check(tree, result);
77139+ }
77140+ rcu_read_unlock();
77141+
77142+ return result;
77143+}
77144+
77145+/* return hash table where znode with block @blocknr is (or should be)
77146+ * stored */
77147+static z_hash_table *get_htable(reiser4_tree * tree,
77148+ const reiser4_block_nr * const blocknr)
77149+{
77150+ z_hash_table *table;
77151+ if (is_disk_addr_unallocated(blocknr))
77152+ table = &tree->zfake_table;
77153+ else
77154+ table = &tree->zhash_table;
77155+ return table;
77156+}
77157+
77158+/* return hash table where znode @node is (or should be) stored */
77159+static z_hash_table *znode_get_htable(const znode * node)
77160+{
77161+ return get_htable(znode_get_tree(node), znode_get_block(node));
77162+}
77163+
77164+/* zget() - get znode from hash table, allocating it if necessary.
77165+
77166+   First a call to zlook, locating an x-referenced znode if one
77167+   exists. If the znode is not found, a new one is allocated and returned.
77168+   The result is returned with its x_count reference increased.
77169+
77170+ LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK
77171+ LOCK ORDERING: NONE
77172+*/
77173+znode *zget(reiser4_tree * tree,
77174+ const reiser4_block_nr * const blocknr,
77175+ znode * parent, tree_level level, gfp_t gfp_flag)
77176+{
77177+ znode *result;
77178+ __u32 hashi;
77179+
77180+ z_hash_table *zth;
77181+
77182+ assert("jmacd-512", tree != NULL);
77183+ assert("jmacd-513", blocknr != NULL);
77184+ assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
77185+
77186+ zth = get_htable(tree, blocknr);
77187+ hashi = blknrhashfn(zth, blocknr);
77188+
77189+ /* NOTE-NIKITA address-as-unallocated-blocknr still is not
77190+ implemented. */
77191+
77192+ z_hash_prefetch_bucket(zth, hashi);
77193+
77194+ rcu_read_lock();
77195+ /* Find a matching BLOCKNR in the hash table. If the znode is found,
77196+	   we obtain a reference (x_count) but the znode remains unlocked.
77197+ Have to worry about race conditions later. */
77198+ result = z_hash_find_index(zth, hashi, blocknr);
77199+ /* According to the current design, the hash table lock protects new
77200+ znode references. */
77201+ if (result != NULL) {
77202+ add_x_ref(ZJNODE(result));
77203+ /* NOTE-NIKITA it should be so, but special case during
77204+ creation of new root makes such assertion highly
77205+ complicated. */
77206+ assert("nikita-2131", 1 || znode_parent(result) == parent ||
77207+ (ZF_ISSET(result, JNODE_ORPHAN)
77208+ && (znode_parent(result) == NULL)));
77209+ result = znode_rip_check(tree, result);
77210+ }
77211+
77212+ rcu_read_unlock();
77213+
77214+ if (!result) {
77215+ znode *shadow;
77216+
77217+ result = zalloc(gfp_flag);
77218+ if (!result) {
77219+ return ERR_PTR(RETERR(-ENOMEM));
77220+ }
77221+
77222+ zinit(result, parent, tree);
77223+ ZJNODE(result)->blocknr = *blocknr;
77224+ ZJNODE(result)->key.z = *blocknr;
77225+ result->level = level;
77226+
77227+ write_lock_tree(tree);
77228+
77229+ shadow = z_hash_find_index(zth, hashi, blocknr);
77230+ if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
77231+ jnode_list_remove(ZJNODE(result));
77232+ zfree(result);
77233+ result = shadow;
77234+ } else {
77235+ result->version = znode_build_version(tree);
77236+ z_hash_insert_index_rcu(zth, hashi, result);
77237+
77238+ if (parent != NULL)
77239+ ++parent->c_count;
77240+ }
77241+
77242+ add_x_ref(ZJNODE(result));
77243+
77244+ write_unlock_tree(tree);
77245+ }
77246+#if REISER4_DEBUG
77247+ if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0)
77248+ reiser4_check_block(blocknr, 1);
77249+#endif
77250+ /* Check for invalid tree level, return -EIO */
77251+ if (unlikely(znode_get_level(result) != level)) {
77252+ warning("jmacd-504",
77253+ "Wrong level for cached block %llu: %i expecting %i",
77254+ (unsigned long long)(*blocknr), znode_get_level(result),
77255+ level);
77256+ zput(result);
77257+ return ERR_PTR(RETERR(-EIO));
77258+ }
77259+
77260+ assert("nikita-1227", znode_invariant(result));
77261+
77262+ return result;
77263+}
77264+
77265+/* ZNODE PLUGINS/DATA */
77266+
77267+/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is
77268+ stored at the fixed offset from the beginning of the node. */
77269+static node_plugin *znode_guess_plugin(const znode * node /* znode to guess
77270+ * plugin of */ )
77271+{
77272+ reiser4_tree *tree;
77273+
77274+ assert("nikita-1053", node != NULL);
77275+ assert("nikita-1055", zdata(node) != NULL);
77276+
77277+ tree = znode_get_tree(node);
77278+ assert("umka-053", tree != NULL);
77279+
77280+ if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
77281+ return tree->nplug;
77282+ } else {
77283+ return node_plugin_by_disk_id
77284+ (tree, &((common_node_header *) zdata(node))->plugin_id);
77285+#ifdef GUESS_EXISTS
77286+ reiser4_plugin *plugin;
77287+
77288+ /* NOTE-NIKITA add locking here when dynamic plugins will be
77289+ * implemented */
77290+ for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
77291+ if ((plugin->u.node.guess != NULL)
77292+ && plugin->u.node.guess(node))
77293+ return plugin;
77294+ }
77295+ warning("nikita-1057", "Cannot guess node plugin");
77296+ print_znode("node", node);
77297+ return NULL;
77298+#endif
77299+ }
77300+}
77301+
77302+/* parse node header and install ->node_plugin */
77303+int zparse(znode * node /* znode to parse */ )
77304+{
77305+ int result;
77306+
77307+ assert("nikita-1233", node != NULL);
77308+ assert("nikita-2370", zdata(node) != NULL);
77309+
77310+ if (node->nplug == NULL) {
77311+ node_plugin *nplug;
77312+
77313+ nplug = znode_guess_plugin(node);
77314+ if (likely(nplug != NULL)) {
77315+ result = nplug->parse(node);
77316+ if (likely(result == 0))
77317+ node->nplug = nplug;
77318+ } else {
77319+ result = RETERR(-EIO);
77320+ }
77321+ } else
77322+ result = 0;
77323+ return result;
77324+}
77325+
77326+/* zload with readahead */
77327+int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
77328+{
77329+ int result;
77330+
77331+ assert("nikita-484", node != NULL);
77332+ assert("nikita-1377", znode_invariant(node));
77333+ assert("jmacd-7771", !znode_above_root(node));
77334+ assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
77335+ assert("nikita-3016", reiser4_schedulable());
77336+
77337+ if (info)
77338+ formatted_readahead(node, info);
77339+
77340+ result = jload(ZJNODE(node));
77341+ assert("nikita-1378", znode_invariant(node));
77342+ return result;
77343+}
77344+
77345+/* load content of node into memory */
77346+int zload(znode * node)
77347+{
77348+ return zload_ra(node, NULL);
77349+}
77350+
77351+/* call node plugin to initialise newly allocated node. */
77352+int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
77353+{
77354+ return jinit_new(ZJNODE(node), gfp_flags);
77355+}
77356+
77357+/* drop reference to node data. When last reference is dropped, data are
77358+ unloaded. */
77359+void zrelse(znode * node /* znode to release references to */ )
77360+{
77361+ assert("nikita-1381", znode_invariant(node));
77362+
77363+ jrelse(ZJNODE(node));
77364+}
77365+
77366+/* returns free space in node */
77367+unsigned znode_free_space(znode * node /* znode to query */ )
77368+{
77369+ assert("nikita-852", node != NULL);
77370+ return node_plugin_by_node(node)->free_space(node);
77371+}
77372+
77373+/* right delimiting key of znode */
77374+reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
77375+{
77376+ assert("nikita-958", node != NULL);
77377+ assert_rw_locked(&(znode_get_tree(node)->dk_lock));
77378+ assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
77379+ assert("nikita-30671", node->rd_key_version != 0);
77380+ return &node->rd_key;
77381+}
77382+
77383+/* left delimiting key of znode */
77384+reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
77385+{
77386+ assert("nikita-974", node != NULL);
77387+ assert_rw_locked(&(znode_get_tree(node)->dk_lock));
77388+ assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
77389+ assert("nikita-30681", node->ld_key_version != 0);
77390+ return &node->ld_key;
77391+}
77392+
77393+ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
77394+ )
77395+
77396+/* update right-delimiting key of @node */
77397+reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
77398+{
77399+ assert("nikita-2937", node != NULL);
77400+ assert("nikita-2939", key != NULL);
77401+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
77402+ assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
77403+ assert("nikita-2944",
77404+ znode_is_any_locked(node) ||
77405+ znode_get_level(node) != LEAF_LEVEL ||
77406+ keyge(key, &node->rd_key) ||
77407+ keyeq(&node->rd_key, reiser4_min_key()) ||
77408+ ZF_ISSET(node, JNODE_HEARD_BANSHEE));
77409+
77410+ node->rd_key = *key;
77411+ ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
77412+ return &node->rd_key;
77413+}
77414+
77415+/* update left-delimiting key of @node */
77416+reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
77417+{
77418+ assert("nikita-2940", node != NULL);
77419+ assert("nikita-2941", key != NULL);
77420+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
77421+ assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
77422+ assert("nikita-2943",
77423+ znode_is_any_locked(node) || keyeq(&node->ld_key,
77424+ reiser4_min_key()));
77425+
77426+ node->ld_key = *key;
77427+ ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
77428+ return &node->ld_key;
77429+}
77430+
77431+/* true if @key is inside key range for @node */
77432+int znode_contains_key(znode * node /* znode to look in */ ,
77433+ const reiser4_key * key /* key to look for */ )
77434+{
77435+ assert("nikita-1237", node != NULL);
77436+ assert("nikita-1238", key != NULL);
77437+
77438+ /* left_delimiting_key <= key <= right_delimiting_key */
77439+ return keyle(znode_get_ld_key(node), key)
77440+ && keyle(key, znode_get_rd_key(node));
77441+}
77442+
77443+/* same as znode_contains_key(), but lock dk lock */
77444+int znode_contains_key_lock(znode * node /* znode to look in */ ,
77445+ const reiser4_key * key /* key to look for */ )
77446+{
77447+ int result;
77448+
77449+ assert("umka-056", node != NULL);
77450+ assert("umka-057", key != NULL);
77451+
77452+ read_lock_dk(znode_get_tree(node));
77453+ result = znode_contains_key(node, key);
77454+ read_unlock_dk(znode_get_tree(node));
77455+ return result;
77456+}
77457+
77458+/* get parent pointer, assuming tree is not locked */
77459+znode *znode_parent_nolock(const znode * node /* child znode */ )
77460+{
77461+ assert("nikita-1444", node != NULL);
77462+ return node->in_parent.node;
77463+}
77464+
77465+/* get parent pointer of znode */
77466+znode *znode_parent(const znode * node /* child znode */ )
77467+{
77468+ assert("nikita-1226", node != NULL);
77469+ assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
77470+ return znode_parent_nolock(node);
77471+}
77472+
77473+/* detect uber znode used to protect in-superblock tree root pointer */
77474+int znode_above_root(const znode * node /* znode to query */ )
77475+{
77476+ assert("umka-059", node != NULL);
77477+
77478+ return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
77479+}
77480+
77481+/* check that @node is root---that its block number is recorded in the tree
77482+   as that of the root node */
77483+#if REISER4_DEBUG
77484+static int znode_is_true_root(const znode * node /* znode to query */ )
77485+{
77486+ assert("umka-060", node != NULL);
77487+ assert("umka-061", current_tree != NULL);
77488+
77489+ return disk_addr_eq(znode_get_block(node),
77490+ &znode_get_tree(node)->root_block);
77491+}
77492+#endif
77493+
77494+/* check that @node is root */
77495+int znode_is_root(const znode * node /* znode to query */ )
77496+{
77497+ assert("nikita-1206", node != NULL);
77498+
77499+ return znode_get_level(node) == znode_get_tree(node)->height;
77500+}
77501+
77502+/* Returns true if @node was just created by zget() and was never loaded
77503+   into memory. */
77504+/* NIKITA-HANS: yes */
77505+int znode_just_created(const znode * node)
77506+{
77507+ assert("nikita-2188", node != NULL);
77508+ return (znode_page(node) == NULL);
77509+}
77510+
77511+/* obtain updated ->znode_epoch. See seal.c for description. */
77512+__u64 znode_build_version(reiser4_tree * tree)
77513+{
77514+ __u64 result;
77515+
77516+ spin_lock(&tree->epoch_lock);
77517+ result = ++tree->znode_epoch;
77518+ spin_unlock(&tree->epoch_lock);
77519+ return result;
77520+}
77521+
77522+void init_load_count(load_count * dh)
77523+{
77524+ assert("nikita-2105", dh != NULL);
77525+ memset(dh, 0, sizeof *dh);
77526+}
77527+
77528+void done_load_count(load_count * dh)
77529+{
77530+ assert("nikita-2106", dh != NULL);
77531+ if (dh->node != NULL) {
77532+ for (; dh->d_ref > 0; --dh->d_ref)
77533+ zrelse(dh->node);
77534+ dh->node = NULL;
77535+ }
77536+}
77537+
77538+static int incr_load_count(load_count * dh)
77539+{
77540+ int result;
77541+
77542+ assert("nikita-2110", dh != NULL);
77543+ assert("nikita-2111", dh->node != NULL);
77544+
77545+ result = zload(dh->node);
77546+ if (result == 0)
77547+ ++dh->d_ref;
77548+ return result;
77549+}
77550+
77551+int incr_load_count_znode(load_count * dh, znode * node)
77552+{
77553+ assert("nikita-2107", dh != NULL);
77554+ assert("nikita-2158", node != NULL);
77555+ assert("nikita-2109",
77556+ ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
77557+
77558+ dh->node = node;
77559+ return incr_load_count(dh);
77560+}
77561+
77562+int incr_load_count_jnode(load_count * dh, jnode * node)
77563+{
77564+ if (jnode_is_znode(node)) {
77565+ return incr_load_count_znode(dh, JZNODE(node));
77566+ }
77567+ return 0;
77568+}
77569+
77570+void copy_load_count(load_count * new, load_count * old)
77571+{
77572+ int ret = 0;
77573+ done_load_count(new);
77574+ new->node = old->node;
77575+ new->d_ref = 0;
77576+
77577+ while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
77578+ }
77579+
77580+ assert("jmacd-87589", ret == 0);
77581+}
77582+
77583+void move_load_count(load_count * new, load_count * old)
77584+{
77585+ done_load_count(new);
77586+ new->node = old->node;
77587+ new->d_ref = old->d_ref;
77588+ old->node = NULL;
77589+ old->d_ref = 0;
77590+}
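+
+/* Typical load_count usage, sketched (illustrative only, error handling
+ * trimmed):
+ *
+ *	load_count lh;
+ *
+ *	init_load_count(&lh);
+ *	if (incr_load_count_znode(&lh, node) == 0) {
+ *		... node data are pinned, zdata(node) is valid ...
+ *	}
+ *	done_load_count(&lh);	(drops every data reference taken above)
+ */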
77591+
77592+/* convert parent pointer into coord */
77593+void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
77594+{
77595+ assert("nikita-3204", pcoord != NULL);
77596+ assert("nikita-3205", coord != NULL);
77597+
77598+ coord_init_first_unit_nocheck(coord, pcoord->node);
77599+ coord_set_item_pos(coord, pcoord->item_pos);
77600+ coord->between = AT_UNIT;
77601+}
77602+
77603+/* pack coord into parent_coord_t */
77604+void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
77605+{
77606+ assert("nikita-3206", pcoord != NULL);
77607+ assert("nikita-3207", coord != NULL);
77608+
77609+ pcoord->node = coord->node;
77610+ pcoord->item_pos = coord->item_pos;
77611+}
77612+
77613+/* Initialize a parent hint pointer. (parent hint pointer is a field in znode,
77614+ look for comments there) */
77615+void init_parent_coord(parent_coord_t * pcoord, const znode * node)
77616+{
77617+ pcoord->node = (znode *) node;
77618+ pcoord->item_pos = (unsigned short)~0;
77619+}
77620+
77621+#if REISER4_DEBUG
77622+
77623+/* debugging aid: znode invariant */
77624+static int znode_invariant_f(const znode * node /* znode to check */ ,
77625+ char const **msg /* where to store error
77626+ * message, if any */ )
77627+{
77628+#define _ergo(ant, con) \
77629+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
77630+
77631+#define _equi(e1, e2) \
77632+ ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
77633+
77634+#define _check(exp) ((*msg) = #exp, (exp))
77635+
77636+ return jnode_invariant_f(ZJNODE(node), msg) &&
77637+ /* [znode-fake] invariant */
77638+ /* fake znode doesn't have a parent, and */
77639+ _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
77640+ /* there is another way to express this very check, and */
77641+ _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
77642+ /* it has special block number, and */
77643+ _ergo(znode_get_level(node) == 0,
77644+ disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
77645+ /* it is the only znode with such block number, and */
77646+ _ergo(!znode_above_root(node) && znode_is_loaded(node),
77647+ !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
77648+ /* it is parent of the tree root node */
77649+ _ergo(znode_is_true_root(node),
77650+ znode_above_root(znode_parent(node))) &&
77651+ /* [znode-level] invariant */
77652+ /* level of parent znode is one larger than that of child,
77653+ except for the fake znode, and */
77654+ _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
77655+ znode_get_level(znode_parent(node)) ==
77656+ znode_get_level(node) + 1) &&
77657+ /* left neighbor is at the same level, and */
77658+ _ergo(znode_is_left_connected(node) && node->left != NULL,
77659+ znode_get_level(node) == znode_get_level(node->left)) &&
77660+ /* right neighbor is at the same level */
77661+ _ergo(znode_is_right_connected(node) && node->right != NULL,
77662+ znode_get_level(node) == znode_get_level(node->right)) &&
77663+ /* [znode-connected] invariant */
77664+ _ergo(node->left != NULL, znode_is_left_connected(node)) &&
77665+ _ergo(node->right != NULL, znode_is_right_connected(node)) &&
77666+ _ergo(!znode_is_root(node) && node->left != NULL,
77667+ znode_is_right_connected(node->left) &&
77668+ node->left->right == node) &&
77669+ _ergo(!znode_is_root(node) && node->right != NULL,
77670+ znode_is_left_connected(node->right) &&
77671+ node->right->left == node) &&
77672+ /* [znode-c_count] invariant */
77673+ /* for any znode, c_count of its parent is greater than 0 */
77674+ _ergo(znode_parent(node) != NULL &&
77675+ !znode_above_root(znode_parent(node)),
77676+ znode_parent(node)->c_count > 0) &&
77677+ /* leaves don't have children */
77678+ _ergo(znode_get_level(node) == LEAF_LEVEL,
77679+ node->c_count == 0) &&
77680+ _check(node->zjnode.jnodes.prev != NULL) &&
77681+ _check(node->zjnode.jnodes.next != NULL) &&
77682+ /* orphan doesn't have a parent */
77683+ _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
77684+ /* [znode-modify] invariant */
77685+ /* if znode is not write-locked, its checksum remains
77686+ * invariant */
77687+ /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
77688+ * cannot check this. */
77689+ /* [znode-refs] invariant */
77690+ /* only referenced znode can be long-term locked */
77691+ _ergo(znode_is_locked(node),
77692+ atomic_read(&ZJNODE(node)->x_count) != 0);
77693+}
77694+
77695+/* debugging aid: check znode invariant and panic if it doesn't hold */
77696+int znode_invariant(znode * node /* znode to check */ )
77697+{
77698+ char const *failed_msg;
77699+ int result;
77700+
77701+ assert("umka-063", node != NULL);
77702+ assert("umka-064", current_tree != NULL);
77703+
77704+ spin_lock_znode(node);
77705+ read_lock_tree(znode_get_tree(node));
77706+ result = znode_invariant_f(node, &failed_msg);
77707+ if (!result) {
77708+ /* print_znode("corrupted node", node); */
77709+ warning("jmacd-555", "Condition %s failed", failed_msg);
77710+ }
77711+ read_unlock_tree(znode_get_tree(node));
77712+ spin_unlock_znode(node);
77713+ return result;
77714+}
77715+
77716+/* return non-0 iff data are loaded into znode */
77717+int znode_is_loaded(const znode * node /* znode to query */ )
77718+{
77719+ assert("nikita-497", node != NULL);
77720+ return jnode_is_loaded(ZJNODE(node));
77721+}
77722+
77723+unsigned long znode_times_locked(const znode * z)
77724+{
77725+ return z->times_locked;
77726+}
77727+
77728+#endif /* REISER4_DEBUG */
77729+
77730+/* Make Linus happy.
77731+ Local variables:
77732+ c-indentation-style: "K&R"
77733+ mode-name: "LC"
77734+ c-basic-offset: 8
77735+ tab-width: 8
77736+ fill-column: 120
77737+ End:
77738+*/
77739diff -urN linux-2.6.24.orig/fs/reiser4/znode.h linux-2.6.24/fs/reiser4/znode.h
77740--- linux-2.6.24.orig/fs/reiser4/znode.h 1970-01-01 03:00:00.000000000 +0300
77741+++ linux-2.6.24/fs/reiser4/znode.h 2008-01-25 11:39:07.120255087 +0300
77742@@ -0,0 +1,434 @@
77743+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
77744+ * reiser4/README */
77745+
77746+/* Declaration of znode (Zam's node). See znode.c for more details. */
77747+
77748+#ifndef __ZNODE_H__
77749+#define __ZNODE_H__
77750+
77751+#include "forward.h"
77752+#include "debug.h"
77753+#include "dformat.h"
77754+#include "key.h"
77755+#include "coord.h"
77756+#include "plugin/node/node.h"
77757+#include "jnode.h"
77758+#include "lock.h"
77759+#include "readahead.h"
77760+
77761+#include <linux/types.h>
77762+#include <linux/spinlock.h>
77763+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
77764+#include <asm/atomic.h>
77765+#include <asm/semaphore.h>
77766+
77767+/* znode tracks its position within parent (internal item in a parent node,
77768+ * that contains znode's block number). */
77769+typedef struct parent_coord {
77770+ znode *node;
77771+ pos_in_node_t item_pos;
77772+} parent_coord_t;
77773+
77774+/* &znode - node in a reiser4 tree.
77775+
77776+ NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
77777+ cacheline pressure.
77778+
77779+ Locking:
77780+
77781+ Long term: data in a disk node attached to this znode are protected
77782+ by long term, deadlock aware lock ->lock;
77783+
77784+ Spin lock: the following fields are protected by the spin lock:
77785+
77786+ ->lock
77787+
77788+ Following fields are protected by the global tree lock:
77789+
77790+ ->left
77791+ ->right
77792+ ->in_parent
77793+ ->c_count
77794+
77795+ Following fields are protected by the global delimiting key lock (dk_lock):
77796+
77797+ ->ld_key (to update ->ld_key long-term lock on the node is also required)
77798+ ->rd_key
77799+
77800+ Following fields are protected by the long term lock:
77801+
77802+ ->nr_items
77803+
77804+ ->node_plugin is never changed once set. This means that after code made
77805+ itself sure that field is valid it can be accessed without any additional
77806+ locking.
77807+
77808+ ->level is immutable.
77809+
77810+ Invariants involving this data-type:
77811+
77812+ [znode-fake]
77813+ [znode-level]
77814+ [znode-connected]
77815+ [znode-c_count]
77816+ [znode-refs]
77817+ [jnode-refs]
77818+ [jnode-queued]
77819+ [znode-modify]
77820+
77821+ For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
77822+ Suggestions for how to do that are desired.*/
77823+struct znode {
77824+ /* Embedded jnode. */
77825+ jnode zjnode;
77826+
77827+ /* contains three subfields, node, pos_in_node, and pos_in_unit.
77828+
77829+ pos_in_node and pos_in_unit are only hints that are cached to
77830+ speed up lookups during balancing. They are not required to be up to
77831+ date. Synched in find_child_ptr().
77832+
77833+ This value allows us to avoid expensive binary searches.
77834+
77835+ in_parent->node points to the parent of this node, and is NOT a
77836+ hint.
77837+ */
77838+ parent_coord_t in_parent;
77839+
77840+ /*
77841+ * sibling list pointers
77842+ */
77843+
77844+ /* left-neighbor */
77845+ znode *left;
77846+ /* right-neighbor */
77847+ znode *right;
77848+
77849+ /* long term lock on node content. This lock supports deadlock
77850+ detection. See lock.c
77851+ */
77852+ zlock lock;
77853+
77854+ /* You cannot remove from memory a node that has children in
77855+ memory. This is because we rely on the fact that parent of given
77856+ node can always be reached without blocking for io. When reading a
77857+ node into memory you must increase the c_count of its parent, when
77858+ removing it from memory you must decrease the c_count. This makes
77859+ the code simpler, and the cases where it is suboptimal are truly
77860+ obscure.
77861+ */
77862+ int c_count;
77863+
77864+ /* plugin of node attached to this znode. NULL if znode is not
77865+ loaded. */
77866+ node_plugin *nplug;
77867+
77868+ /* version of znode data. This is increased on each modification. This
77869+ * is necessary to implement seals (see seal.[ch]) efficiently. */
77870+ __u64 version;
77871+
77872+ /* left delimiting key. Necessary to efficiently perform
77873+ balancing with node-level locking. Kept in memory only. */
77874+ reiser4_key ld_key;
77875+ /* right delimiting key. */
77876+ reiser4_key rd_key;
77877+
77878+ /* znode's tree level */
77879+ __u16 level;
77880+ /* number of items in this node. This field is modified by node
77881+ * plugin. */
77882+ __u16 nr_items;
77883+
77884+#if REISER4_DEBUG
77885+ void *creator;
77886+ reiser4_key first_key;
77887+ unsigned long times_locked;
77888+ int left_version; /* when node->left was updated */
77889+ int right_version; /* when node->right was updated */
77890+ int ld_key_version; /* when node->ld_key was updated */
77891+ int rd_key_version; /* when node->rd_key was updated */
77892+#endif
77893+
77894+} __attribute__ ((aligned(16)));
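+
+/* Illustrative note on the locking rules above: the delimiting keys must be
+ * accessed under dk_lock, e.g. (znode_contains_key_lock() in znode.c is the
+ * in-tree equivalent):
+ *
+ *	read_lock_dk(znode_get_tree(node));
+ *	in_range = znode_contains_key(node, key);
+ *	read_unlock_dk(znode_get_tree(node));
+ */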
77895+
77896+ON_DEBUG(extern atomic_t delim_key_version;
77897+ )
77898+
+/* In general I think these macros should not be exposed. */
+#define znode_is_locked(node) (lock_is_locked(&node->lock))
+#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock))
+#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock))
+#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock))
+#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock))
+#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
+/* Macros for accessing the znode state. */
+#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f))
+#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f))
+#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f))
+extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
+ znode * parent, tree_level level, gfp_t gfp_flag);
+extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
+extern int zload(znode * node);
+extern int zload_ra(znode * node, ra_info_t * info);
+extern int zinit_new(znode * node, gfp_t gfp_flags);
+extern void zrelse(znode * node);
+extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
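+
+/* Editor's note: an illustrative, non-authoritative sketch (compiled out)
+ of the zload()/zrelse() pairing declared above. It assumes zload()
+ returns 0 on success or a negative errno, per the usual kernel
+ convention. */
+#if 0
+static inline int peek_nr_items(znode *node)
+{
+ int ret;
+
+ ret = zload(node); /* pin node data in memory */
+ if (ret != 0)
+ return ret;
+ ret = node->nr_items; /* valid only while the data is loaded */
+ zrelse(node); /* balance the successful zload() */
+ return ret;
+}
+#endif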
+
+/* size of data in znode */
+static inline unsigned
+znode_size(const znode * node UNUSED_ARG /* znode to query */ )
+{
+ assert("nikita-1416", node != NULL);
+ return PAGE_CACHE_SIZE;
+}
+
+extern void parent_coord_to_coord(const parent_coord_t * pcoord,
+ coord_t * coord);
+extern void coord_to_parent_coord(const coord_t * coord,
+ parent_coord_t * pcoord);
+extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
+
+extern unsigned znode_free_space(znode * node);
+
+extern reiser4_key *znode_get_rd_key(znode * node);
+extern reiser4_key *znode_get_ld_key(znode * node);
+
+extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
+extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
+
+/* `connected' state checks */
+static inline int znode_is_right_connected(const znode * node)
+{
+ return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
+}
+
+static inline int znode_is_left_connected(const znode * node)
+{
+ return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
+}
+
+static inline int znode_is_connected(const znode * node)
+{
+ return znode_is_right_connected(node) && znode_is_left_connected(node);
+}
+
+extern int znode_shift_order;
+extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
+extern void znode_remove(znode *, reiser4_tree *);
+extern znode *znode_parent(const znode * node);
+extern znode *znode_parent_nolock(const znode * node);
+extern int znode_above_root(const znode * node);
+extern int init_znodes(void);
+extern void done_znodes(void);
+extern int znodes_tree_init(reiser4_tree * ztree);
+extern void znodes_tree_done(reiser4_tree * ztree);
+extern int znode_contains_key(znode * node, const reiser4_key * key);
+extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
+extern unsigned znode_save_free_space(znode * node);
+extern unsigned znode_recover_free_space(znode * node);
+extern znode *zalloc(gfp_t gfp_flag);
+extern void zinit(znode *, const znode * parent, reiser4_tree *);
+extern int zparse(znode * node);
+
+extern int znode_just_created(const znode * node);
+
+extern void zfree(znode * node);
+
+#if REISER4_DEBUG
+extern void print_znode(const char *prefix, const znode * node);
+#else
+#define print_znode( p, n ) noop
+#endif
+
+/* Make it look like various znode functions exist instead of treating znodes as
+ jnodes in znode-specific code. */
+#define znode_page(x) jnode_page ( ZJNODE(x) )
+#define zdata(x) jdata ( ZJNODE(x) )
+#define znode_get_block(x) jnode_get_block ( ZJNODE(x) )
+#define znode_created(x) jnode_created ( ZJNODE(x) )
+#define znode_set_created(x) jnode_set_created ( ZJNODE(x) )
+#define znode_convertible(x) jnode_convertible (ZJNODE(x))
+#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x))
+
+#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) )
+#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) )
+#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) )
+#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) )
+
+#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) )
+#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) )
+#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) )
+#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) )
+#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
+
+#if REISER4_DEBUG
+extern int znode_x_count_is_protected(const znode * node);
+extern int znode_invariant(znode * node);
+#endif
+
+/* acquire reference to @node */
+static inline znode *zref(znode * node)
+{
+ /* change of x_count from 0 to 1 is protected by tree spin-lock */
+ return JZNODE(jref(ZJNODE(node)));
+}
+
+/* release reference to @node */
+static inline void zput(znode * node)
+{
+ assert("nikita-3564", znode_invariant(node));
+ jput(ZJNODE(node));
+}
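+
+/* Editor's note: an illustrative pairing sketch (compiled out), not part of
+ the original header. zref()/zput() bracket any period during which a
+ znode pointer is cached, keeping x_count elevated. */
+#if 0
+static void zref_example(znode *node)
+{
+ znode *n = zref(node); /* x_count++ */
+ /* ... use n here ... */
+ zput(n); /* drop the reference taken above */
+}
+#endif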
+
+/* get the level field for a znode */
+static inline tree_level znode_get_level(const znode * node)
+{
+ return node->level;
+}
+
+/* get the level field for a jnode */
+static inline tree_level jnode_get_level(const jnode * node)
+{
+ if (jnode_is_znode(node))
+ return znode_get_level(JZNODE(node));
+ else
+ /* unformatted nodes all live at LEAF_LEVEL, and for
+ "semi-formatted" nodes such as bitmaps the level doesn't
+ matter. */
+ return LEAF_LEVEL;
+}
+
+/* true if jnode is on leaf level */
+static inline int jnode_is_leaf(const jnode * node)
+{
+ if (jnode_is_znode(node))
+ return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
+ if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
+ return 1;
+ return 0;
+}
+
+/* return znode's tree */
+static inline reiser4_tree *znode_get_tree(const znode * node)
+{
+ assert("nikita-2692", node != NULL);
+ return jnode_get_tree(ZJNODE(node));
+}
+
+/* resolve race with zput */
+static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
+{
+ jnode *j;
+
+ j = jnode_rip_sync(tree, ZJNODE(node));
+ if (likely(j != NULL))
+ node = JZNODE(j);
+ else
+ node = NULL;
+ return node;
+}
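+
+/* Editor's sketch (compiled out, hedged): a caller that fetched @node from
+ a shared structure revalidates it before use; NULL means the znode lost
+ a race with zput() and must not be touched. */
+#if 0
+static int use_node_safely(reiser4_tree *tree, znode *node)
+{
+ node = znode_rip_check(tree, node);
+ if (node == NULL)
+ return -ENOENT; /* znode was freed concurrently */
+ /* ... node is safe to use here ... */
+ return 0;
+}
+#endif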
+
+#if REISER4_DEBUG
+int znode_is_loaded(const znode * node /* znode to query */ );
+#endif
+
+extern __u64 znode_build_version(reiser4_tree * tree);
+
+/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We
+ must load the data for a node in many places. We could do this by simply calling
+ zload() everywhere, but the difficulty arises when we must release the loaded data by
+ calling zrelse(). In a function with many possible error/return paths, it takes extra
+ work to figure out which exit paths must call zrelse() and which do not. The data
+ handle automatically calls zrelse() for every zload() that it is responsible for. In
+ that sense, it acts much like a lock_handle.
+*/
+typedef struct load_count {
+ znode *node;
+ int d_ref;
+} load_count;
+
+extern void init_load_count(load_count * lc); /* Initialize a load_count; set the current node to NULL. */
+extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary. */
+extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode as the current node and call zload(). */
+extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as
+ * incr_load_count_znode; otherwise do nothing (unformatted nodes
+ * don't require zload/zrelse treatment). */
+extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */
+extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */
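+
+/* Editor's note: an illustrative usage sketch (compiled out), not from the
+ original header. The data handle calls zrelse() on every exit path
+ automatically; do_something() is a hypothetical helper. */
+#if 0
+static int with_loaded_node(znode *node)
+{
+ load_count lc;
+ int ret;
+
+ init_load_count(&lc);
+ ret = incr_load_count_znode(&lc, node); /* calls zload() */
+ if (ret == 0)
+ ret = do_something(node); /* hypothetical */
+ done_load_count(&lc); /* zrelse() iff the zload() succeeded */
+ return ret;
+}
+#endif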
+
+/* Variable initializers for load_count. */
+#define INIT_LOAD_COUNT ( load_count ){ .node = NULL, .d_ref = 0 }
+#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
+/* A convenience macro for use in assertions or debug-only code, where loaded
+ data is only required to perform the debugging check. This macro
+ encapsulates an expression inside a pair of calls to zload()/zrelse(). */
+#define WITH_DATA( node, exp ) \
+({ \
+ long __with_dh_result; \
+ znode *__with_dh_node; \
+ \
+ __with_dh_node = ( node ); \
+ __with_dh_result = zload( __with_dh_node ); \
+ if( __with_dh_result == 0 ) { \
+ __with_dh_result = ( long )( exp ); \
+ zrelse( __with_dh_node ); \
+ } \
+ __with_dh_result; \
+})
+
+/* Same as above, but accepts a return value to use in case zload() fails. */
+#define WITH_DATA_RET( node, ret, exp ) \
+({ \
+ int __with_dh_result; \
+ znode *__with_dh_node; \
+ \
+ __with_dh_node = ( node ); \
+ __with_dh_result = zload( __with_dh_node ); \
+ if( __with_dh_result == 0 ) { \
+ __with_dh_result = ( int )( exp ); \
+ zrelse( __with_dh_node ); \
+ } else \
+ __with_dh_result = ( ret ); \
+ __with_dh_result; \
+})
+
+#define WITH_COORD(coord, exp) \
+({ \
+ coord_t *__coord; \
+ \
+ __coord = (coord); \
+ coord_clear_iplug(__coord); \
+ WITH_DATA(__coord->node, exp); \
+})
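+
+/* Editor's example (compiled out, hedged): typical use is inside an
+ assertion, where node data needs to be loaded only for the check itself.
+ node_is_empty() and the assert label are assumed for illustration. */
+#if 0
+ assert("xxxx-0001", WITH_DATA(node, !node_is_empty(node)));
+#endif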
+
+#if REISER4_DEBUG
+#define STORE_COUNTERS \
+ reiser4_lock_cnt_info __entry_counters = \
+ *reiser4_lock_counters()
+#define CHECK_COUNTERS \
+ON_DEBUG_CONTEXT( \
+({ \
+ __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \
+ __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \
+ __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \
+ assert("nikita-2159", \
+ !memcmp(&__entry_counters, reiser4_lock_counters(), \
+ sizeof __entry_counters)); \
+}) )
+
+#else
+#define STORE_COUNTERS
+#define CHECK_COUNTERS noop
+#endif
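+
+/* Editor's sketch (compiled out): bracketing a lock-neutral region with the
+ macros above asserts, in debug builds, that it leaked no long-term locks;
+ x_refs/t_refs/d_refs are re-copied before the memcmp, so pure
+ reference-count changes are deliberately excluded from the check. */
+#if 0
+{
+ STORE_COUNTERS;
+ /* ... code that must take and release locks in balance ... */
+ CHECK_COUNTERS;
+}
+#endif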
+
+/* __ZNODE_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN linux-2.6.24.orig/include/linux/fs.h linux-2.6.24/include/linux/fs.h
--- linux-2.6.24.orig/include/linux/fs.h 2008-01-25 14:24:20.893378532 +0300
+++ linux-2.6.24/include/linux/fs.h 2008-01-25 11:39:07.124256117 +0300
@@ -1256,6 +1256,8 @@
 void (*clear_inode) (struct inode *);
 void (*umount_begin) (struct vfsmount *, int);

+ void (*sync_inodes) (struct super_block *sb,
+ struct writeback_control *wbc);
 int (*show_options)(struct seq_file *, struct vfsmount *);
 int (*show_stats)(struct seq_file *, struct vfsmount *);
 #ifdef CONFIG_QUOTA
@@ -1671,6 +1673,7 @@
 extern int invalidate_inode_pages2_range(struct address_space *mapping,
 pgoff_t start, pgoff_t end);
 extern int write_inode_now(struct inode *, int);
+extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *);
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_flush(struct address_space *);
 extern int filemap_fdatawait(struct address_space *);
diff -urN linux-2.6.24.orig/mm/filemap.c linux-2.6.24/mm/filemap.c
--- linux-2.6.24.orig/mm/filemap.c 2008-01-25 14:24:21.569552179 +0300
+++ linux-2.6.24/mm/filemap.c 2008-01-25 11:39:07.132258178 +0300
@@ -137,6 +137,7 @@
 dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
 }
 }
+EXPORT_SYMBOL(__remove_from_page_cache);

 void remove_from_page_cache(struct page *page)
 {
@@ -148,6 +149,7 @@
 __remove_from_page_cache(page);
 write_unlock_irq(&mapping->tree_lock);
 }
+EXPORT_SYMBOL(remove_from_page_cache);

 static int sync_page(void *word)
 {
@@ -731,6 +733,7 @@
 read_unlock_irq(&mapping->tree_lock);
 return ret;
 }
+EXPORT_SYMBOL(add_to_page_cache_lru);

 /**
 * find_get_pages_contig - gang contiguous pagecache lookup
@@ -850,6 +853,7 @@

 ra->ra_pages /= 4;
 }
+EXPORT_SYMBOL(find_get_pages);

 /**
 * do_generic_mapping_read - generic file read routine